diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 00000000000..660c62884be --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,18 @@ +--- +name: Bug report +about: Create a report to help us improve +title: '' +labels: bug +assignees: '' + +--- + + diff --git a/.github/ISSUE_TEMPLATE/feature-proposal-discussion.md b/.github/ISSUE_TEMPLATE/feature-proposal-discussion.md new file mode 100644 index 00000000000..61e797b9ca1 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature-proposal-discussion.md @@ -0,0 +1,18 @@ +--- +name: Feature proposal or discussion +about: Suggest an idea for Kaldi +title: '' +labels: discussion +assignees: '' + +--- + + diff --git a/.gitignore b/.gitignore index df7cb26de9f..ed66c79d1c8 100644 --- a/.gitignore +++ b/.gitignore @@ -73,15 +73,19 @@ GSYMS /src/kaldi.mk.bak # /egs/ -/egs/*/s*/mfcc -/egs/*/s*/plp -/egs/*/s*/exp -/egs/*/s*/data +/egs/*/*/mfcc +/egs/*/*/plp +/egs/*/*/exp +/egs/*/*/data +/egs/*/*/wav +/egs/*/*/enhan # /tools/ +/tools/pocolm/ /tools/ATLAS/ /tools/atlas3.8.3.tar.gz /tools/irstlm/ +/tools/mitlm/ /tools/openfst /tools/openfst-1.3.2.tar.gz /tools/openfst-1.3.2/ @@ -143,3 +147,12 @@ GSYMS /tools/mmseg-1.3.0.tar.gz /tools/mmseg-1.3.0/ /kaldiwin_vs* +/tools/cub-1.8.0.zip +/tools/cub-1.8.0/ +/tools/cub +/tools/python/ + +# These CMakeLists.txt files are all genareted on the fly at the moment. +# They are added here to avoid accidently checkin. +/src/**/CMakeLists.txt +/build* diff --git a/.travis.yml b/.travis.yml index 23507297413..51e49653efc 100644 --- a/.travis.yml +++ b/.travis.yml @@ -49,7 +49,7 @@ script: # for the explanation why extra switches needed for clang with ccache. - CXX="ccache clang++-3.8 -Qunused-arguments -fcolor-diagnostics -Wno-tautological-compare" CFLAGS="" - LDFLAGS="-llapack" + LDFLAGS="-llapack -Wl,-fuse-ld=gold" INCDIRS="$XROOT/usr/include" LIBDIRS="$XROOT/usr/lib" tools/extras/travis_script.sh diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 00000000000..748d88a351f --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,195 @@ +cmake_minimum_required(VERSION 3.5) +project(kaldi) + +set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake;${CMAKE_MODULE_PATH}") +include(GNUInstallDirs) +include(Utils) +include(third_party/get_third_party) + +message(STATUS "Running gen_cmake_skeleton.py") +execute_process(COMMAND python + "${CMAKE_CURRENT_SOURCE_DIR}/cmake/gen_cmake_skeleton.py" + "${CMAKE_CURRENT_SOURCE_DIR}/src" + "--quiet" +) + +set(CMAKE_CXX_STANDARD 14) +set(CMAKE_CXX_EXTENSIONS OFF) +set(CMAKE_INSTALL_MESSAGE LAZY) # hide "-- Up-to-date: ..." 
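+# NOTE: the per-directory src/**/CMakeLists.txt files are generated on the fly
+# by the gen_cmake_skeleton.py call above; they are git-ignored and not meant
+# to be checked in.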
+if(BUILD_SHARED_LIBS) + set(CMAKE_POSITION_INDEPENDENT_CODE ON) + if(WIN32) + set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) + message(FATAL_ERROR "DLL is not supported currently") + elseif(APPLE) + set(CMAKE_INSTALL_RPATH "@loader_path") + else() + set(CMAKE_INSTALL_RPATH "$ORIGIN;$ORIGIN/../lib") + endif() +endif() + +set(MATHLIB "OpenBLAS" CACHE STRING "OpenBLAS|MKL|Accelerate") +option(KALDI_BUILD_EXE "If disabled, will make add_kaldi_executable a no-op" ON) +option(KALDI_BUILD_TEST "If disabled, will make add_kaldi_test_executable a no-op" ON) +option(KALDI_USE_PATCH_NUMBER "Use MAJOR.MINOR.PATCH format, otherwise MAJOR.MINOR" OFF) + +link_libraries(${CMAKE_DL_LIBS}) + +find_package(Threads) +link_libraries(Threads::Threads) + +if(MATHLIB STREQUAL "OpenBLAS") + set(BLA_VENDOR "OpenBLAS") + find_package(LAPACK REQUIRED) + add_definitions(-DHAVE_CLAPACK=1) + include_directories(${CMAKE_CURRENT_SOURCE_DIR}/tools/CLAPACK) + link_libraries(${BLAS_LIBRARIES} ${LAPACK_LIBRARIES}) +elseif(MATHLIB STREQUAL "MKL") + set(BLA_VENDOR "Intel10_64lp") + # find_package(BLAS REQUIRED) + normalize_env_path(ENV{MKLROOT}) + find_package(LAPACK REQUIRED) + add_definitions(-DHAVE_MKL=1) + include_directories($ENV{MKLROOT}/include) # TODO: maybe not use env, idk, find_package doesnt handle includes... + link_libraries(${BLAS_LIBRARIES} ${LAPACK_LIBRARIES}) +elseif(MATHLIB STREQUAL "Accelerate") + set(BLA_VENDOR "Apple") + find_package(BLAS REQUIRED) + find_package(LAPACK REQUIRED) + add_definitions(-DHAVE_CLAPACK=1) + link_libraries(${BLAS_LIBRARIES} ${LAPACK_LIBRARIES}) +else() + message(FATAL_ERROR "${MATHLIB} is not tested and supported, you are on your own now.") +endif() + +if(MSVC) + # Added in source, but we actually should do it in build script, whatever... + # add_definitions(-DWIN32_LEAN_AND_MEAN=1) + + add_compile_options(/permissive- /FS /wd4819 /EHsc /bigobj) + + # some warnings related with fst + add_compile_options(/wd4018 /wd4244 /wd4267 /wd4291 /wd4305) + + set(CUDA_USE_STATIC_CUDA_RUNTIME OFF CACHE INTERNAL "") + if(NOT DEFINED ENV{CUDAHOSTCXX}) + set(ENV{CUDAHOSTCXX} ${CMAKE_CXX_COMPILER}) + endif() + if(NOT DEFINED CUDA_HOST_COMPILER) + set(CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER}) + endif() +endif() + +find_package(CUDA) +if(CUDA_FOUND) + set(CUB_ROOT_DIR "${PROJECT_SOURCE_DIR}/tools/cub") + + set(CUDA_PROPAGATE_HOST_FLAGS ON) + set(KALDI_CUDA_NVCC_FLAGS "--default-stream=per-thread;-std=c++${CMAKE_CXX_STANDARD}") + if(MSVC) + list(APPEND KALDI_CUDA_NVCC_FLAGS "-Xcompiler /permissive-,/FS,/wd4819,/EHsc,/bigobj") + list(APPEND KALDI_CUDA_NVCC_FLAGS "-Xcompiler /wd4018,/wd4244,/wd4267,/wd4291,/wd4305") + if(BUILD_SHARED_LIBS) + list(APPEND CUDA_NVCC_FLAGS_RELEASE -Xcompiler /MD) + list(APPEND CUDA_NVCC_FLAGS_DEBUG -Xcompiler /MDd) + endif() + else() + # list(APPEND KALDI_CUDA_NVCC_FLAGS "-Xcompiler -std=c++${CMAKE_CXX_STANDARD}") + list(APPEND KALDI_CUDA_NVCC_FLAGS "-Xcompiler -fPIC") + endif() + set(CUDA_NVCC_FLAGS ${KALDI_CUDA_NVCC_FLAGS} ${CUDA_NVCC_FLAGS}) + + add_definitions(-DHAVE_CUDA=1) + add_definitions(-DCUDA_API_PER_THREAD_DEFAULT_STREAM=1) + include_directories(${CUDA_INCLUDE_DIRS}) + link_libraries( + ${CUDA_LIBRARIES} + ${CUDA_CUDA_LIBRARY} + ${CUDA_CUBLAS_LIBRARIES} + ${CUDA_CUFFT_LIBRARIES} + ${CUDA_curand_LIBRARY} + ${CUDA_cusolver_LIBRARY} + ${CUDA_cusparse_LIBRARY}) + + find_package(NvToolExt REQUIRED) + include_directories(${NvToolExt_INCLUDE_DIR}) + link_libraries(${NvToolExt_LIBRARIES}) + + find_package(CUB REQUIRED) + include_directories(${CUB_INCLUDE_DIR}) +endif() + 
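+# PortAudio is not handled by this CMake setup; defining KALDI_NO_PORTAUDIO
+# below compiles out the code paths that would need it.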
+add_definitions(-DKALDI_NO_PORTAUDIO=1) + +include(VersionHelper) +get_version() # this will set KALDI_VERSION and KALDI_PATCH_NUMBER +if(${KALDI_USE_PATCH_NUMBER}) + set(KALDI_VERSION "${KALDI_VERSION}.${KALDI_PATCH_NUMBER}") +endif() + +get_third_party(openfst) +set(OPENFST_ROOT_DIR ${CMAKE_CURRENT_BINARY_DIR}/openfst) +include(third_party/openfst_lib_target) +link_libraries(fst) + +# add all native libraries +add_subdirectory(src/base) # NOTE, we need to patch the target with version from outside +set_property(TARGET kaldi-base PROPERTY COMPILE_DEFINITIONS "KALDI_VERSION=\"${KALDI_VERSION}\"") +add_subdirectory(src/matrix) +add_subdirectory(src/cudamatrix) +add_subdirectory(src/util) +add_subdirectory(src/feat) +add_subdirectory(src/tree) +add_subdirectory(src/gmm) +add_subdirectory(src/transform) +add_subdirectory(src/sgmm2) +add_subdirectory(src/fstext) +add_subdirectory(src/hmm) +add_subdirectory(src/lm) +add_subdirectory(src/decoder) +add_subdirectory(src/lat) +add_subdirectory(src/nnet) +add_subdirectory(src/nnet2) +add_subdirectory(src/nnet3) +add_subdirectory(src/rnnlm) +add_subdirectory(src/chain) +add_subdirectory(src/ivector) +add_subdirectory(src/online) +add_subdirectory(src/online2) +add_subdirectory(src/kws) + +add_subdirectory(src/itf) + +# add all cuda libraries +if(CUDA_FOUND) + add_subdirectory(src/cudafeat) + add_subdirectory(src/cudadecoder) +endif() + +# add all native executables +add_subdirectory(src/gmmbin) +add_subdirectory(src/featbin) +add_subdirectory(src/onlinebin) + +# add all cuda executables +if(CUDA_FOUND) + add_subdirectory(src/cudafeatbin) + add_subdirectory(src/cudadecoderbin) +endif() + +include(CMakePackageConfigHelpers) +# maybe we should put this into subfolder? +configure_package_config_file( + ${CMAKE_CURRENT_SOURCE_DIR}/cmake/kaldi-config.cmake.in + ${CMAKE_BINARY_DIR}/cmake/kaldi-config.cmake + INSTALL_DESTINATION lib/cmake/kaldi +) +write_basic_package_version_file( + ${CMAKE_BINARY_DIR}/cmake/kaldi-config-version.cmake + VERSION ${KALDI_VERSION} + COMPATIBILITY AnyNewerVersion +) +install(FILES ${CMAKE_BINARY_DIR}/cmake/kaldi-config.cmake ${CMAKE_BINARY_DIR}/cmake/kaldi-config-version.cmake + DESTINATION lib/cmake/kaldi +) +install(EXPORT kaldi-targets DESTINATION ${CMAKE_INSTALL_PREFIX}/lib/cmake/kaldi) diff --git a/INSTALL b/INSTALL index 2dbf318118c..7beb79a7336 100644 --- a/INSTALL +++ b/INSTALL @@ -1,9 +1,16 @@ This is the official Kaldi INSTALL. Look also at INSTALL.md for the git mirror installation. -[for native Windows install, see windows/INSTALL] +[Option 1 in the following does not apply to native Windows install, see windows/INSTALL or following Option 2] -(1) -go to tools/ and follow INSTALL instructions there. +Option 1 (bash + makefile): -(2) -go to src/ and follow INSTALL instructions there. + Steps: + (1) + go to tools/ and follow INSTALL instructions there. + (2) + go to src/ and follow INSTALL instructions there. + +Option 2 (cmake): + + Go to cmake/ and follow INSTALL.md instructions there. + Note, it may not be well tested and some features are missing currently. diff --git a/cmake/FindBLAS.cmake b/cmake/FindBLAS.cmake new file mode 100644 index 00000000000..67676110c6d --- /dev/null +++ b/cmake/FindBLAS.cmake @@ -0,0 +1,816 @@ +# Distributed under the OSI-approved BSD 3-Clause License. See accompanying +# file Copyright.txt or https://cmake.org/licensing for details. 
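+# (Bundled copy of the FindBLAS module distributed with CMake; see the license
+# reference above.)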
+ +#[=======================================================================[.rst: +FindBLAS +-------- + +Find Basic Linear Algebra Subprograms (BLAS) library + +This module finds an installed Fortran library that implements the +BLAS linear-algebra interface (see http://www.netlib.org/blas/). The +list of libraries searched for is taken from the ``autoconf`` macro file, +``acx_blas.m4`` (distributed at +http://ac-archive.sourceforge.net/ac-archive/acx_blas.html). + +Input Variables +^^^^^^^^^^^^^^^ + +The following variables may be set to influence this module's behavior: + +``BLA_STATIC`` + if ``ON`` use static linkage + +``BLA_VENDOR`` + If set, checks only the specified vendor, if not set checks all the + possibilities. List of vendors valid in this module: + + * Goto + * OpenBLAS + * FLAME + * ATLAS PhiPACK + * CXML + * DXML + * SunPerf + * SCSL + * SGIMATH + * IBMESSL + * Intel10_32 (intel mkl v10 32 bit) + * Intel10_64lp (intel mkl v10+ 64 bit, threaded code, lp64 model) + * Intel10_64lp_seq (intel mkl v10+ 64 bit, sequential code, lp64 model) + * Intel10_64ilp (intel mkl v10+ 64 bit, threaded code, ilp64 model) + * Intel10_64ilp_seq (intel mkl v10+ 64 bit, sequential code, ilp64 model) + * Intel (obsolete versions of mkl 32 and 64 bit) + * ACML + * ACML_MP + * ACML_GPU + * Apple + * NAS + * Generic + +``BLA_F95`` + if ``ON`` tries to find the BLAS95 interfaces + +``BLA_PREFER_PKGCONFIG`` + if set ``pkg-config`` will be used to search for a BLAS library first + and if one is found that is preferred + +Result Variables +^^^^^^^^^^^^^^^^ + +This module defines the following variables: + +``BLAS_FOUND`` + library implementing the BLAS interface is found +``BLAS_LINKER_FLAGS`` + uncached list of required linker flags (excluding ``-l`` and ``-L``). +``BLAS_LIBRARIES`` + uncached list of libraries (using full path name) to link against + to use BLAS (may be empty if compiler implicitly links BLAS) +``BLAS95_LIBRARIES`` + uncached list of libraries (using full path name) to link against + to use BLAS95 interface +``BLAS95_FOUND`` + library implementing the BLAS95 interface is found + +.. note:: + + C or CXX must be enabled to use Intel Math Kernel Library (MKL) + + For example, to use Intel MKL libraries and/or Intel compiler: + + .. code-block:: cmake + + set(BLA_VENDOR Intel10_64lp) + find_package(BLAS) + +Hints +^^^^^ + +Set ``MKLROOT`` environment variable to a directory that contains an MKL +installation. + +#]=======================================================================] + +include(CheckFunctionExists) +include(CheckFortranFunctionExists) +include(CMakePushCheckState) +include(FindPackageHandleStandardArgs) +cmake_push_check_state() +set(CMAKE_REQUIRED_QUIET ${BLAS_FIND_QUIETLY}) + +set(_blas_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES}) + +# Check the language being used +if( NOT (CMAKE_C_COMPILER_LOADED OR CMAKE_CXX_COMPILER_LOADED OR CMAKE_Fortran_COMPILER_LOADED) ) + if(BLAS_FIND_REQUIRED) + message(FATAL_ERROR "FindBLAS requires Fortran, C, or C++ to be enabled.") + else() + message(STATUS "Looking for BLAS... 
- NOT found (Unsupported languages)") + return() + endif() +endif() + +if(BLA_PREFER_PKGCONFIG) + find_package(PkgConfig) + pkg_check_modules(PKGC_BLAS blas) + if(PKGC_BLAS_FOUND) + set(BLAS_FOUND ${PKGC_BLAS_FOUND}) + set(BLAS_LIBRARIES "${PKGC_BLAS_LINK_LIBRARIES}") + return() + endif() +endif() + +macro(Check_Fortran_Libraries LIBRARIES _prefix _name _flags _list _thread) + # This macro checks for the existence of the combination of fortran libraries + # given by _list. If the combination is found, this macro checks (using the + # Check_Fortran_Function_Exists macro) whether can link against that library + # combination using the name of a routine given by _name using the linker + # flags given by _flags. If the combination of libraries is found and passes + # the link test, LIBRARIES is set to the list of complete library paths that + # have been found. Otherwise, LIBRARIES is set to FALSE. + + # N.B. _prefix is the prefix applied to the names of all cached variables that + # are generated internally and marked advanced by this macro. + + set(_libdir ${ARGN}) + + set(_libraries_work TRUE) + set(${LIBRARIES}) + set(_combined_name) + if (NOT _libdir) + if (WIN32) + set(_libdir ENV LIB) + elseif (APPLE) + set(_libdir ENV DYLD_LIBRARY_PATH) + else () + set(_libdir ENV LD_LIBRARY_PATH) + endif () + endif () + + list(APPEND _libdir "${CMAKE_C_IMPLICIT_LINK_DIRECTORIES}") + + foreach(_library ${_list}) + set(_combined_name ${_combined_name}_${_library}) + if(NOT "${_thread}" STREQUAL "") + set(_combined_name ${_combined_name}_thread) + endif() + if(_libraries_work) + if (BLA_STATIC) + if (WIN32) + set(CMAKE_FIND_LIBRARY_SUFFIXES .lib ${CMAKE_FIND_LIBRARY_SUFFIXES}) + endif () + if (APPLE) + set(CMAKE_FIND_LIBRARY_SUFFIXES .lib ${CMAKE_FIND_LIBRARY_SUFFIXES}) + else () + set(CMAKE_FIND_LIBRARY_SUFFIXES .a ${CMAKE_FIND_LIBRARY_SUFFIXES}) + endif () + else () + if (CMAKE_SYSTEM_NAME STREQUAL "Linux") + # for ubuntu's libblas3gf and liblapack3gf packages + set(CMAKE_FIND_LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES} .so.3gf) + endif () + endif () + find_library(${_prefix}_${_library}_LIBRARY + NAMES ${_library} + PATHS ${_libdir} + ) + mark_as_advanced(${_prefix}_${_library}_LIBRARY) + set(${LIBRARIES} ${${LIBRARIES}} ${${_prefix}_${_library}_LIBRARY}) + set(_libraries_work ${${_prefix}_${_library}_LIBRARY}) + endif() + endforeach() + if(_libraries_work) + # Test this combination of libraries. 
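+    # (When no Fortran compiler is enabled, the check below appends an
+    # underscore to the routine name to match common Fortran symbol mangling.)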
+ set(CMAKE_REQUIRED_LIBRARIES ${_flags} ${${LIBRARIES}} ${_thread}) + # message("DEBUG: CMAKE_REQUIRED_LIBRARIES = ${CMAKE_REQUIRED_LIBRARIES}") + if (CMAKE_Fortran_COMPILER_LOADED) + check_fortran_function_exists("${_name}" ${_prefix}${_combined_name}_WORKS) + else() + check_function_exists("${_name}_" ${_prefix}${_combined_name}_WORKS) + endif() + set(CMAKE_REQUIRED_LIBRARIES) + set(_libraries_work ${${_prefix}${_combined_name}_WORKS}) + endif() + if(_libraries_work) + if("${_list}" STREQUAL "") + set(${LIBRARIES} "${LIBRARIES}-PLACEHOLDER-FOR-EMPTY-LIBRARIES") + else() + set(${LIBRARIES} ${${LIBRARIES}} ${_thread}) # for static link + endif() + else() + set(${LIBRARIES} FALSE) + endif() + #message("DEBUG: ${LIBRARIES} = ${${LIBRARIES}}") +endmacro() + +set(BLAS_LINKER_FLAGS) +set(BLAS_LIBRARIES) +set(BLAS95_LIBRARIES) +if (NOT $ENV{BLA_VENDOR} STREQUAL "") + set(BLA_VENDOR $ENV{BLA_VENDOR}) +else () + if(NOT BLA_VENDOR) + set(BLA_VENDOR "All") + endif() +endif () + +if (BLA_VENDOR STREQUAL "All") + if(NOT BLAS_LIBRARIES) + # Implicitly linked BLAS libraries + check_fortran_libraries( + BLAS_LIBRARIES + BLAS + sgemm + "" + "" + "" + ) + endif() +endif () + +#BLAS in intel mkl 10+ library? (em64t 64bit) +if (BLA_VENDOR MATCHES "Intel" OR BLA_VENDOR STREQUAL "All") + if (NOT BLAS_LIBRARIES) + + # System-specific settings + if (WIN32) + if (BLA_STATIC) + set(BLAS_mkl_DLL_SUFFIX "") + else() + set(BLAS_mkl_DLL_SUFFIX "_dll") + endif() + else() + # Switch to GNU Fortran support layer if needed (but not on Apple, where MKL does not provide it) + if(CMAKE_Fortran_COMPILER_LOADED AND CMAKE_Fortran_COMPILER_ID STREQUAL "GNU" AND NOT APPLE) + set(BLAS_mkl_INTFACE "gf") + set(BLAS_mkl_THREADING "gnu") + set(BLAS_mkl_OMP "gomp") + else() + set(BLAS_mkl_INTFACE "intel") + set(BLAS_mkl_THREADING "intel") + set(BLAS_mkl_OMP "iomp5") + endif() + set(BLAS_mkl_LM "-lm") + set(BLAS_mkl_LDL "-ldl") + endif() + + if (BLA_VENDOR MATCHES "_64ilp") + set(BLAS_mkl_ILP_MODE "ilp64") + else () + set(BLAS_mkl_ILP_MODE "lp64") + endif () + + if (CMAKE_C_COMPILER_LOADED OR CMAKE_CXX_COMPILER_LOADED) + if(BLAS_FIND_QUIETLY OR NOT BLAS_FIND_REQUIRED) + find_package(Threads) + else() + find_package(Threads REQUIRED) + endif() + + set(BLAS_SEARCH_LIBS "") + + if(BLA_F95) + set(BLAS_mkl_SEARCH_SYMBOL sgemm_f95) + set(_LIBRARIES BLAS95_LIBRARIES) + if (WIN32) + # Find the main file (32-bit or 64-bit) + set(BLAS_SEARCH_LIBS_WIN_MAIN "") + if (BLA_VENDOR STREQUAL "Intel10_32" OR BLA_VENDOR STREQUAL "All") + list(APPEND BLAS_SEARCH_LIBS_WIN_MAIN + "mkl_blas95${BLAS_mkl_DLL_SUFFIX} mkl_intel_c${BLAS_mkl_DLL_SUFFIX}") + endif() + if (BLA_VENDOR MATCHES "^Intel10_64i?lp" OR BLA_VENDOR STREQUAL "All") + list(APPEND BLAS_SEARCH_LIBS_WIN_MAIN + "mkl_blas95_${BLAS_mkl_ILP_MODE}${BLAS_mkl_DLL_SUFFIX} mkl_intel_${BLAS_mkl_ILP_MODE}${BLAS_mkl_DLL_SUFFIX}") + endif () + + # Add threading/sequential libs + set(BLAS_SEARCH_LIBS_WIN_THREAD "") + if (BLA_VENDOR MATCHES "_seq$" OR BLA_VENDOR STREQUAL "All") + list(APPEND BLAS_SEARCH_LIBS_WIN_THREAD + "mkl_sequential${BLAS_mkl_DLL_SUFFIX}") + endif() + if (NOT BLA_VENDOR MATCHES "_seq$" OR BLA_VENDOR STREQUAL "All") + # old version + list(APPEND BLAS_SEARCH_LIBS_WIN_THREAD + "libguide40 mkl_intel_thread${BLAS_mkl_DLL_SUFFIX}") + # mkl >= 10.3 + list(APPEND BLAS_SEARCH_LIBS_WIN_THREAD + "libiomp5md mkl_intel_thread${BLAS_mkl_DLL_SUFFIX}") + endif() + + # Cartesian product of the above + foreach (MAIN ${BLAS_SEARCH_LIBS_WIN_MAIN}) + foreach (THREAD ${BLAS_SEARCH_LIBS_WIN_THREAD}) + list(APPEND 
BLAS_SEARCH_LIBS + "${MAIN} ${THREAD} mkl_core${BLAS_mkl_DLL_SUFFIX}") + endforeach() + endforeach() + else () + if (BLA_VENDOR STREQUAL "Intel10_32" OR BLA_VENDOR STREQUAL "All") + # old version + list(APPEND BLAS_SEARCH_LIBS + "mkl_blas95 mkl_${BLAS_mkl_INTFACE} mkl_${BLAS_mkl_THREADING}_thread mkl_core guide") + + # mkl >= 10.3 + list(APPEND BLAS_SEARCH_LIBS + "mkl_blas95 mkl_${BLAS_mkl_INTFACE} mkl_${BLAS_mkl_THREADING}_thread mkl_core ${BLAS_mkl_OMP}") + endif () + if (BLA_VENDOR MATCHES "^Intel10_64i?lp$" OR BLA_VENDOR STREQUAL "All") + # old version + list(APPEND BLAS_SEARCH_LIBS + "mkl_blas95 mkl_${BLAS_mkl_INTFACE}_${BLAS_mkl_ILP_MODE} mkl_${BLAS_mkl_THREADING}_thread mkl_core guide") + + # mkl >= 10.3 + list(APPEND BLAS_SEARCH_LIBS + "mkl_blas95_${BLAS_mkl_ILP_MODE} mkl_${BLAS_mkl_INTFACE}_${BLAS_mkl_ILP_MODE} mkl_${BLAS_mkl_THREADING}_thread mkl_core ${BLAS_mkl_OMP}") + endif () + if (BLA_VENDOR MATCHES "^Intel10_64i?lp_seq$" OR BLA_VENDOR STREQUAL "All") + list(APPEND BLAS_SEARCH_LIBS + "mkl_blas95_${BLAS_mkl_ILP_MODE} mkl_${BLAS_mkl_INTFACE}_${BLAS_mkl_ILP_MODE} mkl_sequential mkl_core") + endif () + endif () + else () + set(BLAS_mkl_SEARCH_SYMBOL sgemm) + set(_LIBRARIES BLAS_LIBRARIES) + if (WIN32) + # Find the main file (32-bit or 64-bit) + set(BLAS_SEARCH_LIBS_WIN_MAIN "") + if (BLA_VENDOR STREQUAL "Intel10_32" OR BLA_VENDOR STREQUAL "All") + list(APPEND BLAS_SEARCH_LIBS_WIN_MAIN + "mkl_intel_c${BLAS_mkl_DLL_SUFFIX}") + endif() + if (BLA_VENDOR MATCHES "^Intel10_64i?lp" OR BLA_VENDOR STREQUAL "All") + list(APPEND BLAS_SEARCH_LIBS_WIN_MAIN + "mkl_intel_${BLAS_mkl_ILP_MODE}${BLAS_mkl_DLL_SUFFIX}") + endif () + + # Add threading/sequential libs + set(BLAS_SEARCH_LIBS_WIN_THREAD "") + if (NOT BLA_VENDOR MATCHES "_seq$" OR BLA_VENDOR STREQUAL "All") + # old version + list(APPEND BLAS_SEARCH_LIBS_WIN_THREAD + "libguide40 mkl_intel_thread${BLAS_mkl_DLL_SUFFIX}") + # mkl >= 10.3 + list(APPEND BLAS_SEARCH_LIBS_WIN_THREAD + "libiomp5md mkl_intel_thread${BLAS_mkl_DLL_SUFFIX}") + endif() + if (BLA_VENDOR MATCHES "_seq$" OR BLA_VENDOR STREQUAL "All") + list(APPEND BLAS_SEARCH_LIBS_WIN_THREAD + "mkl_sequential${BLAS_mkl_DLL_SUFFIX}") + endif() + + # Cartesian product of the above + foreach (MAIN ${BLAS_SEARCH_LIBS_WIN_MAIN}) + foreach (THREAD ${BLAS_SEARCH_LIBS_WIN_THREAD}) + list(APPEND BLAS_SEARCH_LIBS + "${MAIN} ${THREAD} mkl_core${BLAS_mkl_DLL_SUFFIX}") + endforeach() + endforeach() + else () + if (BLA_VENDOR STREQUAL "Intel10_32" OR BLA_VENDOR STREQUAL "All") + # old version + list(APPEND BLAS_SEARCH_LIBS + "mkl_${BLAS_mkl_INTFACE} mkl_${BLAS_mkl_THREADING}_thread mkl_core guide") + + # mkl >= 10.3 + list(APPEND BLAS_SEARCH_LIBS + "mkl_${BLAS_mkl_INTFACE} mkl_${BLAS_mkl_THREADING}_thread mkl_core ${BLAS_mkl_OMP}") + endif () + if (BLA_VENDOR MATCHES "^Intel10_64i?lp$" OR BLA_VENDOR STREQUAL "All") + # old version + list(APPEND BLAS_SEARCH_LIBS + "mkl_${BLAS_mkl_INTFACE}_${BLAS_mkl_ILP_MODE} mkl_${BLAS_mkl_THREADING}_thread mkl_core guide") + + # mkl >= 10.3 + list(APPEND BLAS_SEARCH_LIBS + "mkl_${BLAS_mkl_INTFACE}_${BLAS_mkl_ILP_MODE} mkl_${BLAS_mkl_THREADING}_thread mkl_core ${BLAS_mkl_OMP}") + endif () + if (BLA_VENDOR MATCHES "^Intel10_64i?lp_seq$" OR BLA_VENDOR STREQUAL "All") + list(APPEND BLAS_SEARCH_LIBS + "mkl_${BLAS_mkl_INTFACE}_${BLAS_mkl_ILP_MODE} mkl_sequential mkl_core") + endif () + + #older vesions of intel mkl libs + if (BLA_VENDOR STREQUAL "Intel" OR BLA_VENDOR STREQUAL "All") + list(APPEND BLAS_SEARCH_LIBS + "mkl") + list(APPEND BLAS_SEARCH_LIBS + "mkl_ia32") + 
list(APPEND BLAS_SEARCH_LIBS + "mkl_em64t") + endif () + endif () + endif () + + if (DEFINED ENV{MKLROOT}) + if (BLA_VENDOR STREQUAL "Intel10_32") + set(_BLAS_MKLROOT_LIB_DIR "$ENV{MKLROOT}/lib/ia32") + elseif (BLA_VENDOR MATCHES "^Intel10_64i?lp$" OR BLA_VENDOR MATCHES "^Intel10_64i?lp_seq$") + set(_BLAS_MKLROOT_LIB_DIR "$ENV{MKLROOT}/lib/intel64") + endif () + endif () + if (_BLAS_MKLROOT_LIB_DIR) + if (WIN32) + string(APPEND _BLAS_MKLROOT_LIB_DIR "_win") + elseif (APPLE) + string(APPEND _BLAS_MKLROOT_LIB_DIR "_mac") + else () + string(APPEND _BLAS_MKLROOT_LIB_DIR "_lin") + endif () + endif () + + foreach (IT ${BLAS_SEARCH_LIBS}) + string(REPLACE " " ";" SEARCH_LIBS ${IT}) + if (NOT ${_LIBRARIES}) + check_fortran_libraries( + ${_LIBRARIES} + BLAS + ${BLAS_mkl_SEARCH_SYMBOL} + "" + "${SEARCH_LIBS}" + "${CMAKE_THREAD_LIBS_INIT};${BLAS_mkl_LM};${BLAS_mkl_LDL}" + "${_BLAS_MKLROOT_LIB_DIR}" + ) + endif () + endforeach () + + endif () + unset(BLAS_mkl_ILP_MODE) + unset(BLAS_mkl_INTFACE) + unset(BLAS_mkl_THREADING) + unset(BLAS_mkl_OMP) + unset(BLAS_mkl_DLL_SUFFIX) + unset(BLAS_mkl_LM) + unset(BLAS_mkl_LDL) + endif () +endif () + +if(BLA_F95) + find_package_handle_standard_args(BLAS REQUIRED_VARS BLAS95_LIBRARIES) + set(BLAS95_FOUND ${BLAS_FOUND}) + if(BLAS_FOUND) + set(BLAS_LIBRARIES "${BLAS95_LIBRARIES}") + endif() +endif() + +if (BLA_VENDOR STREQUAL "Goto" OR BLA_VENDOR STREQUAL "All") + if(NOT BLAS_LIBRARIES) + # gotoblas (http://www.tacc.utexas.edu/tacc-projects/gotoblas2) + check_fortran_libraries( + BLAS_LIBRARIES + BLAS + sgemm + "" + "goto2" + "" + ) + endif() +endif () + +if (BLA_VENDOR STREQUAL "OpenBLAS" OR BLA_VENDOR STREQUAL "All") + if(NOT BLAS_LIBRARIES) + # OpenBLAS (http://www.openblas.net) + check_fortran_libraries( + BLAS_LIBRARIES + BLAS + sgemm + "" + "openblas" + "" + ) + endif() + if(NOT BLAS_LIBRARIES) + find_package(Threads) + # OpenBLAS (http://www.openblas.net) + check_fortran_libraries( + BLAS_LIBRARIES + BLAS + sgemm + "" + "openblas" + "${CMAKE_THREAD_LIBS_INIT}" + ) + endif() +endif () + +if (BLA_VENDOR STREQUAL "FLAME" OR BLA_VENDOR STREQUAL "All") + if(NOT BLAS_LIBRARIES) + # FLAME's blis library (https://github.com/flame/blis) + check_fortran_libraries( + BLAS_LIBRARIES + BLAS + sgemm + "" + "blis" + "" + ) + endif() +endif () + +if (BLA_VENDOR STREQUAL "ATLAS" OR BLA_VENDOR STREQUAL "All") + if(NOT BLAS_LIBRARIES) + # BLAS in ATLAS library? (http://math-atlas.sourceforge.net/) + check_fortran_libraries( + BLAS_LIBRARIES + BLAS + dgemm + "" + "f77blas;atlas" + "" + ) + endif() +endif () + +# BLAS in PhiPACK libraries? (requires generic BLAS lib, too) +if (BLA_VENDOR STREQUAL "PhiPACK" OR BLA_VENDOR STREQUAL "All") + if(NOT BLAS_LIBRARIES) + check_fortran_libraries( + BLAS_LIBRARIES + BLAS + sgemm + "" + "sgemm;dgemm;blas" + "" + ) + endif() +endif () + +# BLAS in Alpha CXML library? +if (BLA_VENDOR STREQUAL "CXML" OR BLA_VENDOR STREQUAL "All") + if(NOT BLAS_LIBRARIES) + check_fortran_libraries( + BLAS_LIBRARIES + BLAS + sgemm + "" + "cxml" + "" + ) + endif() +endif () + +# BLAS in Alpha DXML library? (now called CXML, see above) +if (BLA_VENDOR STREQUAL "DXML" OR BLA_VENDOR STREQUAL "All") + if(NOT BLAS_LIBRARIES) + check_fortran_libraries( + BLAS_LIBRARIES + BLAS + sgemm + "" + "dxml" + "" + ) + endif() +endif () + +# BLAS in Sun Performance library? 
+if (BLA_VENDOR STREQUAL "SunPerf" OR BLA_VENDOR STREQUAL "All") + if(NOT BLAS_LIBRARIES) + check_fortran_libraries( + BLAS_LIBRARIES + BLAS + sgemm + "-xlic_lib=sunperf" + "sunperf;sunmath" + "" + ) + if(BLAS_LIBRARIES) + set(BLAS_LINKER_FLAGS "-xlic_lib=sunperf") + endif() + endif() +endif () + +# BLAS in SCSL library? (SGI/Cray Scientific Library) +if (BLA_VENDOR STREQUAL "SCSL" OR BLA_VENDOR STREQUAL "All") + if(NOT BLAS_LIBRARIES) + check_fortran_libraries( + BLAS_LIBRARIES + BLAS + sgemm + "" + "scsl" + "" + ) + endif() +endif () + +# BLAS in SGIMATH library? +if (BLA_VENDOR STREQUAL "SGIMATH" OR BLA_VENDOR STREQUAL "All") + if(NOT BLAS_LIBRARIES) + check_fortran_libraries( + BLAS_LIBRARIES + BLAS + sgemm + "" + "complib.sgimath" + "" + ) + endif() +endif () + +# BLAS in IBM ESSL library? (requires generic BLAS lib, too) +if (BLA_VENDOR STREQUAL "IBMESSL" OR BLA_VENDOR STREQUAL "All") + if(NOT BLAS_LIBRARIES) + check_fortran_libraries( + BLAS_LIBRARIES + BLAS + sgemm + "" + "essl;blas" + "" + ) + endif() +endif () + +#BLAS in acml library? +if (BLA_VENDOR MATCHES "ACML" OR BLA_VENDOR STREQUAL "All") + if( ((BLA_VENDOR STREQUAL "ACML") AND (NOT BLAS_ACML_LIB_DIRS)) OR + ((BLA_VENDOR STREQUAL "ACML_MP") AND (NOT BLAS_ACML_MP_LIB_DIRS)) OR + ((BLA_VENDOR STREQUAL "ACML_GPU") AND (NOT BLAS_ACML_GPU_LIB_DIRS)) + ) + # try to find acml in "standard" paths + if( WIN32 ) + file( GLOB _ACML_ROOT "C:/AMD/acml*/ACML-EULA.txt" ) + else() + file( GLOB _ACML_ROOT "/opt/acml*/ACML-EULA.txt" ) + endif() + if( WIN32 ) + file( GLOB _ACML_GPU_ROOT "C:/AMD/acml*/GPGPUexamples" ) + else() + file( GLOB _ACML_GPU_ROOT "/opt/acml*/GPGPUexamples" ) + endif() + list(GET _ACML_ROOT 0 _ACML_ROOT) + list(GET _ACML_GPU_ROOT 0 _ACML_GPU_ROOT) + if( _ACML_ROOT ) + get_filename_component( _ACML_ROOT ${_ACML_ROOT} PATH ) + if( SIZEOF_INTEGER EQUAL 8 ) + set( _ACML_PATH_SUFFIX "_int64" ) + else() + set( _ACML_PATH_SUFFIX "" ) + endif() + if( CMAKE_Fortran_COMPILER_ID STREQUAL "Intel" ) + set( _ACML_COMPILER32 "ifort32" ) + set( _ACML_COMPILER64 "ifort64" ) + elseif( CMAKE_Fortran_COMPILER_ID STREQUAL "SunPro" ) + set( _ACML_COMPILER32 "sun32" ) + set( _ACML_COMPILER64 "sun64" ) + elseif( CMAKE_Fortran_COMPILER_ID STREQUAL "PGI" ) + set( _ACML_COMPILER32 "pgi32" ) + if( WIN32 ) + set( _ACML_COMPILER64 "win64" ) + else() + set( _ACML_COMPILER64 "pgi64" ) + endif() + elseif( CMAKE_Fortran_COMPILER_ID STREQUAL "Open64" ) + # 32 bit builds not supported on Open64 but for code simplicity + # We'll just use the same directory twice + set( _ACML_COMPILER32 "open64_64" ) + set( _ACML_COMPILER64 "open64_64" ) + elseif( CMAKE_Fortran_COMPILER_ID STREQUAL "NAG" ) + set( _ACML_COMPILER32 "nag32" ) + set( _ACML_COMPILER64 "nag64" ) + else() + set( _ACML_COMPILER32 "gfortran32" ) + set( _ACML_COMPILER64 "gfortran64" ) + endif() + + if( BLA_VENDOR STREQUAL "ACML_MP" ) + set(_ACML_MP_LIB_DIRS + "${_ACML_ROOT}/${_ACML_COMPILER32}_mp${_ACML_PATH_SUFFIX}/lib" + "${_ACML_ROOT}/${_ACML_COMPILER64}_mp${_ACML_PATH_SUFFIX}/lib" ) + else() + set(_ACML_LIB_DIRS + "${_ACML_ROOT}/${_ACML_COMPILER32}${_ACML_PATH_SUFFIX}/lib" + "${_ACML_ROOT}/${_ACML_COMPILER64}${_ACML_PATH_SUFFIX}/lib" ) + endif() + endif() +elseif(BLAS_${BLA_VENDOR}_LIB_DIRS) + set(_${BLA_VENDOR}_LIB_DIRS ${BLAS_${BLA_VENDOR}_LIB_DIRS}) +endif() + +if( BLA_VENDOR STREQUAL "ACML_MP" ) + foreach( BLAS_ACML_MP_LIB_DIRS ${_ACML_MP_LIB_DIRS}) + check_fortran_libraries ( + BLAS_LIBRARIES + BLAS + sgemm + "" "acml_mp;acml_mv" "" ${BLAS_ACML_MP_LIB_DIRS} + ) + if( BLAS_LIBRARIES ) + 
break() + endif() + endforeach() +elseif( BLA_VENDOR STREQUAL "ACML_GPU" ) + foreach( BLAS_ACML_GPU_LIB_DIRS ${_ACML_GPU_LIB_DIRS}) + check_fortran_libraries ( + BLAS_LIBRARIES + BLAS + sgemm + "" "acml;acml_mv;CALBLAS" "" ${BLAS_ACML_GPU_LIB_DIRS} + ) + if( BLAS_LIBRARIES ) + break() + endif() + endforeach() +else() + foreach( BLAS_ACML_LIB_DIRS ${_ACML_LIB_DIRS} ) + check_fortran_libraries ( + BLAS_LIBRARIES + BLAS + sgemm + "" "acml;acml_mv" "" ${BLAS_ACML_LIB_DIRS} + ) + if( BLAS_LIBRARIES ) + break() + endif() + endforeach() +endif() + +# Either acml or acml_mp should be in LD_LIBRARY_PATH but not both +if(NOT BLAS_LIBRARIES) + check_fortran_libraries( + BLAS_LIBRARIES + BLAS + sgemm + "" + "acml;acml_mv" + "" + ) +endif() +if(NOT BLAS_LIBRARIES) + check_fortran_libraries( + BLAS_LIBRARIES + BLAS + sgemm + "" + "acml_mp;acml_mv" + "" + ) +endif() +if(NOT BLAS_LIBRARIES) + check_fortran_libraries( + BLAS_LIBRARIES + BLAS + sgemm + "" + "acml;acml_mv;CALBLAS" + "" + ) +endif() +endif () # ACML + +# Apple BLAS library? +if (BLA_VENDOR STREQUAL "Apple" OR BLA_VENDOR STREQUAL "All") + if(NOT BLAS_LIBRARIES) + check_fortran_libraries( + BLAS_LIBRARIES + BLAS + dgemm + "" + "Accelerate" + "" + ) + endif() +endif () + +if (BLA_VENDOR STREQUAL "NAS" OR BLA_VENDOR STREQUAL "All") + if ( NOT BLAS_LIBRARIES ) + check_fortran_libraries( + BLAS_LIBRARIES + BLAS + dgemm + "" + "vecLib" + "" + ) + endif () +endif () + +# Generic BLAS library? +if (BLA_VENDOR STREQUAL "Generic" OR BLA_VENDOR STREQUAL "All") + if(NOT BLAS_LIBRARIES) + check_fortran_libraries( + BLAS_LIBRARIES + BLAS + sgemm + "" + "blas" + "" + ) + endif() +endif () + +if(NOT BLA_F95) + find_package_handle_standard_args(BLAS REQUIRED_VARS BLAS_LIBRARIES) +endif() + +# On compilers that implicitly link BLAS (such as ftn, cc, and CC on Cray HPC machines) +# we used a placeholder for empty BLAS_LIBRARIES to get through our logic above. +if (BLAS_LIBRARIES STREQUAL "BLAS_LIBRARIES-PLACEHOLDER-FOR-EMPTY-LIBRARIES") + set(BLAS_LIBRARIES "") +endif() + +cmake_pop_check_state() +set(CMAKE_FIND_LIBRARY_SUFFIXES ${_blas_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES}) diff --git a/cmake/FindCUB.cmake b/cmake/FindCUB.cmake new file mode 100644 index 00000000000..33c8a926f97 --- /dev/null +++ b/cmake/FindCUB.cmake @@ -0,0 +1,25 @@ +# Try to find the CUB library and headers. +# CUB_ROOT_DIR - where to find + +# CUB_FOUND - system has CUB +# CUB_INCLUDE_DIRS - the CUB include directory + + +find_path(CUB_INCLUDE_DIR + NAMES cub/cub.cuh + HINTS ${CUB_ROOT_DIR} + DOC "The directory where CUB includes reside" +) + +set(CUB_INCLUDE_DIRS ${CUB_INCLUDE_DIR}) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(CUB + FOUND_VAR CUB_FOUND + REQUIRED_VARS CUB_INCLUDE_DIR +) + +mark_as_advanced(CUB_FOUND) + +add_library(CUB INTERFACE) +target_include_directories(CUB INTERFACE ${CUB_INCLUDE_DIR}) diff --git a/cmake/FindICU.cmake b/cmake/FindICU.cmake new file mode 100644 index 00000000000..8c460082c36 --- /dev/null +++ b/cmake/FindICU.cmake @@ -0,0 +1,428 @@ +# Distributed under the OSI-approved BSD 3-Clause License. See accompanying +# file Copyright.txt or https://cmake.org/licensing for details. + +#[=======================================================================[.rst: +FindICU +------- + +Find the International Components for Unicode (ICU) libraries and +programs. + +This module supports multiple components. +Components can include any of: ``data``, ``i18n``, ``io``, ``le``, +``lx``, ``test``, ``tu`` and ``uc``. 
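+
+For example, a typical request for the ``uc`` and ``i18n`` components, linking
+the imported targets described below (a minimal sketch; ``my_target`` is a
+placeholder):
+
+.. code-block:: cmake
+
+  find_package(ICU REQUIRED COMPONENTS uc i18n)
+  target_link_libraries(my_target PRIVATE ICU::uc ICU::i18n)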
+ +Note that on Windows ``data`` is named ``dt`` and ``i18n`` is named +``in``; any of the names may be used, and the appropriate +platform-specific library name will be automatically selected. + +This module reports information about the ICU installation in +several variables. General variables:: + + ICU_VERSION - ICU release version + ICU_FOUND - true if the main programs and libraries were found + ICU_LIBRARIES - component libraries to be linked + ICU_INCLUDE_DIRS - the directories containing the ICU headers + +Imported targets:: + + ICU:: + +Where ```` is the name of an ICU component, for example +``ICU::i18n``. + +ICU programs are reported in:: + + ICU_GENCNVAL_EXECUTABLE - path to gencnval executable + ICU_ICUINFO_EXECUTABLE - path to icuinfo executable + ICU_GENBRK_EXECUTABLE - path to genbrk executable + ICU_ICU-CONFIG_EXECUTABLE - path to icu-config executable + ICU_GENRB_EXECUTABLE - path to genrb executable + ICU_GENDICT_EXECUTABLE - path to gendict executable + ICU_DERB_EXECUTABLE - path to derb executable + ICU_PKGDATA_EXECUTABLE - path to pkgdata executable + ICU_UCONV_EXECUTABLE - path to uconv executable + ICU_GENCFU_EXECUTABLE - path to gencfu executable + ICU_MAKECONV_EXECUTABLE - path to makeconv executable + ICU_GENNORM2_EXECUTABLE - path to gennorm2 executable + ICU_GENCCODE_EXECUTABLE - path to genccode executable + ICU_GENSPREP_EXECUTABLE - path to gensprep executable + ICU_ICUPKG_EXECUTABLE - path to icupkg executable + ICU_GENCMN_EXECUTABLE - path to gencmn executable + +ICU component libraries are reported in:: + + ICU__FOUND - ON if component was found + ICU__LIBRARIES - libraries for component + +ICU datafiles are reported in:: + + ICU_MAKEFILE_INC - Makefile.inc + ICU_PKGDATA_INC - pkgdata.inc + +Note that ```` is the uppercased name of the component. + +This module reads hints about search results from:: + + ICU_ROOT - the root of the ICU installation + +The environment variable ``ICU_ROOT`` may also be used; the +ICU_ROOT variable takes precedence. + +The following cache variables may also be set:: + + ICU_

<P>_EXECUTABLE - the path to executable <P>

+ ICU_INCLUDE_DIR - the directory containing the ICU headers + ICU__LIBRARY - the library for component + +.. note:: + + In most cases none of the above variables will require setting, + unless multiple ICU versions are available and a specific version + is required. + +Other variables one may set to control this module are:: + + ICU_DEBUG - Set to ON to enable debug output from FindICU. +#]=======================================================================] + +# Written by Roger Leigh + +set(icu_programs + gencnval + icuinfo + genbrk + icu-config + genrb + gendict + derb + pkgdata + uconv + gencfu + makeconv + gennorm2 + genccode + gensprep + icupkg + gencmn) + +set(icu_data + Makefile.inc + pkgdata.inc) + +# The ICU checks are contained in a function due to the large number +# of temporary variables needed. +function(_ICU_FIND) + # Set up search paths, taking compiler into account. Search ICU_ROOT, + # with ICU_ROOT in the environment as a fallback if unset. + if(ICU_ROOT) + list(APPEND icu_roots "${ICU_ROOT}") + else() + if(NOT "$ENV{ICU_ROOT}" STREQUAL "") + file(TO_CMAKE_PATH "$ENV{ICU_ROOT}" NATIVE_PATH) + list(APPEND icu_roots "${NATIVE_PATH}") + set(ICU_ROOT "${NATIVE_PATH}" + CACHE PATH "Location of the ICU installation" FORCE) + endif() + endif() + + # Find include directory + list(APPEND icu_include_suffixes "include") + find_path(ICU_INCLUDE_DIR + NAMES "unicode/utypes.h" + HINTS ${icu_roots} + PATH_SUFFIXES ${icu_include_suffixes} + DOC "ICU include directory") + set(ICU_INCLUDE_DIR "${ICU_INCLUDE_DIR}" PARENT_SCOPE) + + # Get version + if(ICU_INCLUDE_DIR AND EXISTS "${ICU_INCLUDE_DIR}/unicode/uvernum.h") + file(STRINGS "${ICU_INCLUDE_DIR}/unicode/uvernum.h" icu_header_str + REGEX "^#define[\t ]+U_ICU_VERSION[\t ]+\".*\".*") + + string(REGEX REPLACE "^#define[\t ]+U_ICU_VERSION[\t ]+\"([^ \\n]*)\".*" + "\\1" icu_version_string "${icu_header_str}") + set(ICU_VERSION "${icu_version_string}") + set(ICU_VERSION "${icu_version_string}" PARENT_SCOPE) + unset(icu_header_str) + unset(icu_version_string) + endif() + + if(CMAKE_SIZEOF_VOID_P EQUAL 8) + # 64-bit binary directory + set(_bin64 "bin64") + # 64-bit library directory + set(_lib64 "lib64") + endif() + + + # Find all ICU programs + list(APPEND icu_binary_suffixes "${_bin64}" "bin" "sbin") + foreach(program ${icu_programs}) + string(TOUPPER "${program}" program_upcase) + set(cache_var "ICU_${program_upcase}_EXECUTABLE") + set(program_var "ICU_${program_upcase}_EXECUTABLE") + find_program("${cache_var}" + NAMES "${program}" + HINTS ${icu_roots} + PATH_SUFFIXES ${icu_binary_suffixes} + DOC "ICU ${program} executable" + NO_PACKAGE_ROOT_PATH + ) + mark_as_advanced(cache_var) + set("${program_var}" "${${cache_var}}" PARENT_SCOPE) + endforeach() + + # Find all ICU libraries + list(APPEND icu_library_suffixes "${_lib64}" "lib") + set(ICU_REQUIRED_LIBS_FOUND ON) + set(static_prefix ) + # static icu libraries compiled with MSVC have the prefix 's' + if(MSVC) + set(static_prefix "s") + endif() + foreach(component ${ICU_FIND_COMPONENTS}) + string(TOUPPER "${component}" component_upcase) + set(component_cache "ICU_${component_upcase}_LIBRARY") + set(component_cache_release "${component_cache}_RELEASE") + set(component_cache_debug "${component_cache}_DEBUG") + set(component_found "${component_upcase}_FOUND") + set(component_libnames "icu${component}") + set(component_debug_libnames "icu${component}d") + + # Special case deliberate library naming mismatches between Unix + # and Windows builds + unset(component_libnames) + 
unset(component_debug_libnames) + list(APPEND component_libnames "icu${component}") + list(APPEND component_debug_libnames "icu${component}d") + if(component STREQUAL "data") + list(APPEND component_libnames "icudt") + # Note there is no debug variant at present + list(APPEND component_debug_libnames "icudtd") + endif() + if(component STREQUAL "dt") + list(APPEND component_libnames "icudata") + # Note there is no debug variant at present + list(APPEND component_debug_libnames "icudatad") + endif() + if(component STREQUAL "i18n") + list(APPEND component_libnames "icuin") + list(APPEND component_debug_libnames "icuind") + endif() + if(component STREQUAL "in") + list(APPEND component_libnames "icui18n") + list(APPEND component_debug_libnames "icui18nd") + endif() + + if(static_prefix) + unset(static_component_libnames) + unset(static_component_debug_libnames) + foreach(component_libname ${component_libnames}) + list(APPEND static_component_libnames + ${static_prefix}${component_libname}) + endforeach() + foreach(component_libname ${component_debug_libnames}) + list(APPEND static_component_debug_libnames + ${static_prefix}${component_libname}) + endforeach() + list(APPEND component_libnames ${static_component_libnames}) + list(APPEND component_debug_libnames ${static_component_debug_libnames}) + endif() + find_library("${component_cache_release}" + NAMES ${component_libnames} + HINTS ${icu_roots} + PATH_SUFFIXES ${icu_library_suffixes} + DOC "ICU ${component} library (release)" + NO_PACKAGE_ROOT_PATH + ) + find_library("${component_cache_debug}" + NAMES ${component_debug_libnames} + HINTS ${icu_roots} + PATH_SUFFIXES ${icu_library_suffixes} + DOC "ICU ${component} library (debug)" + NO_PACKAGE_ROOT_PATH + ) + include(SelectLibraryConfigurations) + select_library_configurations(ICU_${component_upcase}) + mark_as_advanced("${component_cache_release}" "${component_cache_debug}") + if(${component_cache}) + set("${component_found}" ON) + list(APPEND ICU_LIBRARY "${${component_cache}}") + endif() + mark_as_advanced("${component_found}") + set("${component_cache}" "${${component_cache}}" PARENT_SCOPE) + set("${component_found}" "${${component_found}}" PARENT_SCOPE) + if(${component_found}) + if (ICU_FIND_REQUIRED_${component}) + list(APPEND ICU_LIBS_FOUND "${component} (required)") + else() + list(APPEND ICU_LIBS_FOUND "${component} (optional)") + endif() + else() + if (ICU_FIND_REQUIRED_${component}) + set(ICU_REQUIRED_LIBS_FOUND OFF) + list(APPEND ICU_LIBS_NOTFOUND "${component} (required)") + else() + list(APPEND ICU_LIBS_NOTFOUND "${component} (optional)") + endif() + endif() + endforeach() + set(_ICU_REQUIRED_LIBS_FOUND "${ICU_REQUIRED_LIBS_FOUND}" PARENT_SCOPE) + set(ICU_LIBRARY "${ICU_LIBRARY}" PARENT_SCOPE) + + # Find all ICU data files + if(CMAKE_LIBRARY_ARCHITECTURE) + list(APPEND icu_data_suffixes + "${_lib64}/${CMAKE_LIBRARY_ARCHITECTURE}/icu/${ICU_VERSION}" + "lib/${CMAKE_LIBRARY_ARCHITECTURE}/icu/${ICU_VERSION}" + "${_lib64}/${CMAKE_LIBRARY_ARCHITECTURE}/icu" + "lib/${CMAKE_LIBRARY_ARCHITECTURE}/icu") + endif() + list(APPEND icu_data_suffixes + "${_lib64}/icu/${ICU_VERSION}" + "lib/icu/${ICU_VERSION}" + "${_lib64}/icu" + "lib/icu") + foreach(data ${icu_data}) + string(TOUPPER "${data}" data_upcase) + string(REPLACE "." 
"_" data_upcase "${data_upcase}") + set(cache_var "ICU_${data_upcase}") + set(data_var "ICU_${data_upcase}") + find_file("${cache_var}" + NAMES "${data}" + HINTS ${icu_roots} + PATH_SUFFIXES ${icu_data_suffixes} + DOC "ICU ${data} data file") + mark_as_advanced(cache_var) + set("${data_var}" "${${cache_var}}" PARENT_SCOPE) + endforeach() + + if(NOT ICU_FIND_QUIETLY) + if(ICU_LIBS_FOUND) + message(STATUS "Found the following ICU libraries:") + foreach(found ${ICU_LIBS_FOUND}) + message(STATUS " ${found}") + endforeach() + endif() + if(ICU_LIBS_NOTFOUND) + message(STATUS "The following ICU libraries were not found:") + foreach(notfound ${ICU_LIBS_NOTFOUND}) + message(STATUS " ${notfound}") + endforeach() + endif() + endif() + + if(ICU_DEBUG) + message(STATUS "--------FindICU.cmake search debug--------") + message(STATUS "ICU binary path search order: ${icu_roots}") + message(STATUS "ICU include path search order: ${icu_roots}") + message(STATUS "ICU library path search order: ${icu_roots}") + message(STATUS "----------------") + endif() +endfunction() + +_ICU_FIND() + +include(FindPackageHandleStandardArgs) +FIND_PACKAGE_HANDLE_STANDARD_ARGS(ICU + FOUND_VAR ICU_FOUND + REQUIRED_VARS ICU_INCLUDE_DIR + ICU_LIBRARY + _ICU_REQUIRED_LIBS_FOUND + VERSION_VAR ICU_VERSION + FAIL_MESSAGE "Failed to find all ICU components") + +unset(_ICU_REQUIRED_LIBS_FOUND) + +if(ICU_FOUND) + set(ICU_INCLUDE_DIRS "${ICU_INCLUDE_DIR}") + set(ICU_LIBRARIES "${ICU_LIBRARY}") + foreach(_ICU_component ${ICU_FIND_COMPONENTS}) + string(TOUPPER "${_ICU_component}" _ICU_component_upcase) + set(_ICU_component_cache "ICU_${_ICU_component_upcase}_LIBRARY") + set(_ICU_component_cache_release "ICU_${_ICU_component_upcase}_LIBRARY_RELEASE") + set(_ICU_component_cache_debug "ICU_${_ICU_component_upcase}_LIBRARY_DEBUG") + set(_ICU_component_lib "ICU_${_ICU_component_upcase}_LIBRARIES") + set(_ICU_component_found "${_ICU_component_upcase}_FOUND") + set(_ICU_imported_target "ICU::${_ICU_component}") + if(${_ICU_component_found}) + set("${_ICU_component_lib}" "${${_ICU_component_cache}}") + if(NOT TARGET ${_ICU_imported_target}) + add_library(${_ICU_imported_target} UNKNOWN IMPORTED) + if(ICU_INCLUDE_DIR) + set_target_properties(${_ICU_imported_target} PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES "${ICU_INCLUDE_DIR}") + endif() + if(EXISTS "${${_ICU_component_cache}}") + set_target_properties(${_ICU_imported_target} PROPERTIES + IMPORTED_LINK_INTERFACE_LANGUAGES "CXX" + IMPORTED_LOCATION "${${_ICU_component_cache}}") + endif() + if(EXISTS "${${_ICU_component_cache_release}}") + set_property(TARGET ${_ICU_imported_target} APPEND PROPERTY + IMPORTED_CONFIGURATIONS RELEASE) + set_target_properties(${_ICU_imported_target} PROPERTIES + IMPORTED_LINK_INTERFACE_LANGUAGES_RELEASE "CXX" + IMPORTED_LOCATION_RELEASE "${${_ICU_component_cache_release}}") + endif() + if(EXISTS "${${_ICU_component_cache_debug}}") + set_property(TARGET ${_ICU_imported_target} APPEND PROPERTY + IMPORTED_CONFIGURATIONS DEBUG) + set_target_properties(${_ICU_imported_target} PROPERTIES + IMPORTED_LINK_INTERFACE_LANGUAGES_DEBUG "CXX" + IMPORTED_LOCATION_DEBUG "${${_ICU_component_cache_debug}}") + endif() + if(CMAKE_DL_LIBS AND _ICU_component STREQUAL "uc") + set_target_properties(${_ICU_imported_target} PROPERTIES + INTERFACE_LINK_LIBRARIES "${CMAKE_DL_LIBS}") + endif() + endif() + endif() + unset(_ICU_component_upcase) + unset(_ICU_component_cache) + unset(_ICU_component_lib) + unset(_ICU_component_found) + unset(_ICU_imported_target) + endforeach() +endif() + 
+if(ICU_DEBUG) + message(STATUS "--------FindICU.cmake results debug--------") + message(STATUS "ICU found: ${ICU_FOUND}") + message(STATUS "ICU_VERSION number: ${ICU_VERSION}") + message(STATUS "ICU_ROOT directory: ${ICU_ROOT}") + message(STATUS "ICU_INCLUDE_DIR directory: ${ICU_INCLUDE_DIR}") + message(STATUS "ICU_LIBRARIES: ${ICU_LIBRARIES}") + + foreach(program IN LISTS icu_programs) + string(TOUPPER "${program}" program_upcase) + set(program_lib "ICU_${program_upcase}_EXECUTABLE") + message(STATUS "${program} program: ${${program_lib}}") + unset(program_upcase) + unset(program_lib) + endforeach() + + foreach(data IN LISTS icu_data) + string(TOUPPER "${data}" data_upcase) + string(REPLACE "." "_" data_upcase "${data_upcase}") + set(data_lib "ICU_${data_upcase}") + message(STATUS "${data} data: ${${data_lib}}") + unset(data_upcase) + unset(data_lib) + endforeach() + + foreach(component IN LISTS ICU_FIND_COMPONENTS) + string(TOUPPER "${component}" component_upcase) + set(component_lib "ICU_${component_upcase}_LIBRARIES") + set(component_found "${component_upcase}_FOUND") + message(STATUS "${component} library found: ${${component_found}}") + message(STATUS "${component} library: ${${component_lib}}") + unset(component_upcase) + unset(component_lib) + unset(component_found) + endforeach() + message(STATUS "----------------") +endif() + +unset(icu_programs) diff --git a/cmake/FindLAPACK.cmake b/cmake/FindLAPACK.cmake new file mode 100644 index 00000000000..60fbf0726a0 --- /dev/null +++ b/cmake/FindLAPACK.cmake @@ -0,0 +1,430 @@ +# Distributed under the OSI-approved BSD 3-Clause License. See accompanying +# file Copyright.txt or https://cmake.org/licensing for details. + +#[=======================================================================[.rst: +FindLAPACK +---------- + +Find Linear Algebra PACKage (LAPACK) library + +This module finds an installed fortran library that implements the +LAPACK linear-algebra interface (see http://www.netlib.org/lapack/). + +The approach follows that taken for the autoconf macro file, +``acx_lapack.m4`` (distributed at +http://ac-archive.sourceforge.net/ac-archive/acx_lapack.html). + +Input Variables +^^^^^^^^^^^^^^^ + +The following variables may be set to influence this module's behavior: + +``BLA_STATIC`` + if ``ON`` use static linkage + +``BLA_VENDOR`` + If set, checks only the specified vendor, if not set checks all the + possibilities. List of vendors valid in this module: + + * ``Intel10_32`` (intel mkl v10 32 bit) + * ``Intel10_64lp`` (intel mkl v10+ 64 bit, threaded code, lp64 model) + * ``Intel10_64lp_seq`` (intel mkl v10+ 64 bit, sequential code, lp64 model) + * ``Intel10_64ilp`` (intel mkl v10+ 64 bit, threaded code, ilp64 model) + * ``Intel10_64ilp_seq`` (intel mkl v10+ 64 bit, sequential code, ilp64 model) + * ``Intel`` (obsolete versions of mkl 32 and 64 bit) + * ``OpenBLAS`` + * ``FLAME`` + * ``ACML`` + * ``Apple`` + * ``NAS`` + * ``Generic`` + +``BLA_F95`` + if ``ON`` tries to find BLAS95/LAPACK95 + +Result Variables +^^^^^^^^^^^^^^^^ + +This module defines the following variables: + +``LAPACK_FOUND`` + library implementing the LAPACK interface is found +``LAPACK_LINKER_FLAGS`` + uncached list of required linker flags (excluding -l and -L). +``LAPACK_LIBRARIES`` + uncached list of libraries (using full path name) to link against + to use LAPACK +``LAPACK95_LIBRARIES`` + uncached list of libraries (using full path name) to link against + to use LAPACK95 +``LAPACK95_FOUND`` + library implementing the LAPACK95 interface is found + +.. 
note:: + + C or CXX must be enabled to use Intel MKL + + For example, to use Intel MKL libraries and/or Intel compiler: + + .. code-block:: cmake + + set(BLA_VENDOR Intel10_64lp) + find_package(LAPACK) +#]=======================================================================] + +set(_lapack_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES}) + +# Check the language being used +if( NOT (CMAKE_C_COMPILER_LOADED OR CMAKE_CXX_COMPILER_LOADED OR CMAKE_Fortran_COMPILER_LOADED) ) + if(LAPACK_FIND_REQUIRED) + message(FATAL_ERROR "FindLAPACK requires Fortran, C, or C++ to be enabled.") + else() + message(STATUS "Looking for LAPACK... - NOT found (Unsupported languages)") + return() + endif() +endif() + +if (CMAKE_Fortran_COMPILER_LOADED) +include(CheckFortranFunctionExists) +else () +include(CheckFunctionExists) +endif () +include(CMakePushCheckState) + +cmake_push_check_state() +set(CMAKE_REQUIRED_QUIET ${LAPACK_FIND_QUIETLY}) + +set(LAPACK_FOUND FALSE) +set(LAPACK95_FOUND FALSE) + +# TODO: move this stuff to separate module + +macro(Check_Lapack_Libraries LIBRARIES _prefix _name _flags _list _blas _threads) +# This macro checks for the existence of the combination of fortran libraries +# given by _list. If the combination is found, this macro checks (using the +# Check_Fortran_Function_Exists macro) whether can link against that library +# combination using the name of a routine given by _name using the linker +# flags given by _flags. If the combination of libraries is found and passes +# the link test, LIBRARIES is set to the list of complete library paths that +# have been found. Otherwise, LIBRARIES is set to FALSE. + +# N.B. _prefix is the prefix applied to the names of all cached variables that +# are generated internally and marked advanced by this macro. + +set(_libraries_work TRUE) +set(${LIBRARIES}) +set(_combined_name) +if (NOT _libdir) + if (WIN32) + set(_libdir ENV LIB) + elseif (APPLE) + set(_libdir ENV DYLD_LIBRARY_PATH) + else () + set(_libdir ENV LD_LIBRARY_PATH) + endif () +endif () + +list(APPEND _libdir "${CMAKE_C_IMPLICIT_LINK_DIRECTORIES}") + +foreach(_library ${_list}) + set(_combined_name ${_combined_name}_${_library}) + + if(_libraries_work) + if (BLA_STATIC) + if (WIN32) + set(CMAKE_FIND_LIBRARY_SUFFIXES .lib ${CMAKE_FIND_LIBRARY_SUFFIXES}) + endif () + if (APPLE) + set(CMAKE_FIND_LIBRARY_SUFFIXES .lib ${CMAKE_FIND_LIBRARY_SUFFIXES}) + else () + set(CMAKE_FIND_LIBRARY_SUFFIXES .a ${CMAKE_FIND_LIBRARY_SUFFIXES}) + endif () + else () + if (CMAKE_SYSTEM_NAME STREQUAL "Linux") + # for ubuntu's libblas3gf and liblapack3gf packages + set(CMAKE_FIND_LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES} .so.3gf) + endif () + endif () + find_library(${_prefix}_${_library}_LIBRARY + NAMES ${_library} + PATHS ${_libdir} + ) + mark_as_advanced(${_prefix}_${_library}_LIBRARY) + set(${LIBRARIES} ${${LIBRARIES}} ${${_prefix}_${_library}_LIBRARY}) + set(_libraries_work ${${_prefix}_${_library}_LIBRARY}) + endif() +endforeach() + +if(_libraries_work) + # Test this combination of libraries. 
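+  # (For static builds on UNIX, --start-group/--end-group below lets the linker
+  # resolve circular symbol references between the LAPACK and BLAS archives.)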
+ if(UNIX AND BLA_STATIC) + set(CMAKE_REQUIRED_LIBRARIES ${_flags} "-Wl,--start-group" ${${LIBRARIES}} ${_blas} "-Wl,--end-group" ${_threads}) + else() + set(CMAKE_REQUIRED_LIBRARIES ${_flags} ${${LIBRARIES}} ${_blas} ${_threads}) + endif() +# message("DEBUG: CMAKE_REQUIRED_LIBRARIES = ${CMAKE_REQUIRED_LIBRARIES}") + if (NOT CMAKE_Fortran_COMPILER_LOADED) + check_function_exists("${_name}_" ${_prefix}${_combined_name}_WORKS) + else () + check_fortran_function_exists(${_name} ${_prefix}${_combined_name}_WORKS) + endif () + set(CMAKE_REQUIRED_LIBRARIES) + set(_libraries_work ${${_prefix}${_combined_name}_WORKS}) + #message("DEBUG: ${LIBRARIES} = ${${LIBRARIES}}") +endif() + +if(_libraries_work) + set(${LIBRARIES} ${${LIBRARIES}} ${_blas} ${_threads}) +else() + set(${LIBRARIES} FALSE) +endif() + +endmacro() + + +set(LAPACK_LINKER_FLAGS) +set(LAPACK_LIBRARIES) +set(LAPACK95_LIBRARIES) + + +if(LAPACK_FIND_QUIETLY OR NOT LAPACK_FIND_REQUIRED) + find_package(BLAS) +else() + find_package(BLAS REQUIRED) +endif() + + +if(BLAS_FOUND) + set(LAPACK_LINKER_FLAGS ${BLAS_LINKER_FLAGS}) + if (NOT $ENV{BLA_VENDOR} STREQUAL "") + set(BLA_VENDOR $ENV{BLA_VENDOR}) + else () + if(NOT BLA_VENDOR) + set(BLA_VENDOR "All") + endif() + endif () + +#intel lapack +if (BLA_VENDOR MATCHES "Intel" OR BLA_VENDOR STREQUAL "All") + if (NOT WIN32) + set(LAPACK_mkl_LM "-lm") + set(LAPACK_mkl_LDL "-ldl") + endif () + if (CMAKE_C_COMPILER_LOADED OR CMAKE_CXX_COMPILER_LOADED) + if(LAPACK_FIND_QUIETLY OR NOT LAPACK_FIND_REQUIRED) + find_PACKAGE(Threads) + else() + find_package(Threads REQUIRED) + endif() + + if (BLA_VENDOR MATCHES "_64ilp") + set(LAPACK_mkl_ILP_MODE "ilp64") + else () + set(LAPACK_mkl_ILP_MODE "lp64") + endif () + + set(LAPACK_SEARCH_LIBS "") + + if (BLA_F95) + set(LAPACK_mkl_SEARCH_SYMBOL "cheev_f95") + set(_LIBRARIES LAPACK95_LIBRARIES) + set(_BLAS_LIBRARIES ${BLAS95_LIBRARIES}) + + # old + list(APPEND LAPACK_SEARCH_LIBS + "mkl_lapack95") + # new >= 10.3 + list(APPEND LAPACK_SEARCH_LIBS + "mkl_intel_c") + list(APPEND LAPACK_SEARCH_LIBS + "mkl_lapack95_${LAPACK_mkl_ILP_MODE}") + else() + set(LAPACK_mkl_SEARCH_SYMBOL "cheev") + set(_LIBRARIES LAPACK_LIBRARIES) + set(_BLAS_LIBRARIES ${BLAS_LIBRARIES}) + + # old + list(APPEND LAPACK_SEARCH_LIBS + "mkl_lapack") + endif() + + # First try empty lapack libs + if (NOT ${_LIBRARIES}) + check_lapack_libraries( + ${_LIBRARIES} + LAPACK + ${LAPACK_mkl_SEARCH_SYMBOL} + "" + "" + "${_BLAS_LIBRARIES}" + "" + ) + endif () + # Then try the search libs + foreach (IT ${LAPACK_SEARCH_LIBS}) + if (NOT ${_LIBRARIES}) + check_lapack_libraries( + ${_LIBRARIES} + LAPACK + ${LAPACK_mkl_SEARCH_SYMBOL} + "" + "${IT}" + "${_BLAS_LIBRARIES}" + "${CMAKE_THREAD_LIBS_INIT};${LAPACK_mkl_LM};${LAPACK_mkl_LDL}" + ) + endif () + endforeach () + + unset(LAPACK_mkl_ILP_MODE) + unset(LAPACK_mkl_SEARCH_SYMBOL) + unset(LAPACK_mkl_LM) + unset(LAPACK_mkl_LDL) + endif () +endif() + +if (BLA_VENDOR STREQUAL "Goto" OR BLA_VENDOR STREQUAL "All") + if(NOT LAPACK_LIBRARIES) + check_lapack_libraries( + LAPACK_LIBRARIES + LAPACK + cheev + "" + "goto2" + "${BLAS_LIBRARIES}" + "" + ) + endif() +endif () + +if (BLA_VENDOR STREQUAL "OpenBLAS" OR BLA_VENDOR STREQUAL "All") + if(NOT LAPACK_LIBRARIES) + check_lapack_libraries( + LAPACK_LIBRARIES + LAPACK + cheev + "" + "openblas" + "${BLAS_LIBRARIES}" + "" + ) + endif() +endif () + +if (BLA_VENDOR STREQUAL "FLAME" OR BLA_VENDOR STREQUAL "All") + if(NOT LAPACK_LIBRARIES) + check_lapack_libraries( + LAPACK_LIBRARIES + LAPACK + cheev + "" + "flame" + "${BLAS_LIBRARIES}" + 
"" + ) + endif() +endif () + +#acml lapack +if (BLA_VENDOR MATCHES "ACML" OR BLA_VENDOR STREQUAL "All") + if (BLAS_LIBRARIES MATCHES ".+acml.+") + set (LAPACK_LIBRARIES ${BLAS_LIBRARIES}) + endif () +endif () + +# Apple LAPACK library? +if (BLA_VENDOR STREQUAL "Apple" OR BLA_VENDOR STREQUAL "All") + if(NOT LAPACK_LIBRARIES) + check_lapack_libraries( + LAPACK_LIBRARIES + LAPACK + cheev + "" + "Accelerate" + "${BLAS_LIBRARIES}" + "" + ) + endif() +endif () +if (BLA_VENDOR STREQUAL "NAS" OR BLA_VENDOR STREQUAL "All") + if ( NOT LAPACK_LIBRARIES ) + check_lapack_libraries( + LAPACK_LIBRARIES + LAPACK + cheev + "" + "vecLib" + "${BLAS_LIBRARIES}" + "" + ) + endif () +endif () +# Generic LAPACK library? +if (BLA_VENDOR STREQUAL "Generic" OR + BLA_VENDOR STREQUAL "ATLAS" OR + BLA_VENDOR STREQUAL "All") + if ( NOT LAPACK_LIBRARIES ) + check_lapack_libraries( + LAPACK_LIBRARIES + LAPACK + cheev + "" + "lapack" + "${BLAS_LIBRARIES}" + "" + ) + endif () +endif () + +else() + message(STATUS "LAPACK requires BLAS") +endif() + +if(BLA_F95) + if(LAPACK95_LIBRARIES) + set(LAPACK95_FOUND TRUE) + else() + set(LAPACK95_FOUND FALSE) + endif() + if(NOT LAPACK_FIND_QUIETLY) + if(LAPACK95_FOUND) + message(STATUS "A library with LAPACK95 API found.") + else() + if(LAPACK_FIND_REQUIRED) + message(FATAL_ERROR + "A required library with LAPACK95 API not found. Please specify library location." + ) + else() + message(STATUS + "A library with LAPACK95 API not found. Please specify library location." + ) + endif() + endif() + endif() + set(LAPACK_FOUND "${LAPACK95_FOUND}") + set(LAPACK_LIBRARIES "${LAPACK95_LIBRARIES}") +else() + if(LAPACK_LIBRARIES) + set(LAPACK_FOUND TRUE) + else() + set(LAPACK_FOUND FALSE) + endif() + + if(NOT LAPACK_FIND_QUIETLY) + if(LAPACK_FOUND) + message(STATUS "A library with LAPACK API found.") + else() + if(LAPACK_FIND_REQUIRED) + message(FATAL_ERROR + "A required library with LAPACK API not found. Please specify library location." + ) + else() + message(STATUS + "A library with LAPACK API not found. Please specify library location." 
+ ) + endif() + endif() + endif() +endif() + +cmake_pop_check_state() +set(CMAKE_FIND_LIBRARY_SUFFIXES ${_lapack_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES}) diff --git a/cmake/FindNvToolExt.cmake b/cmake/FindNvToolExt.cmake new file mode 100644 index 00000000000..5f2998e442a --- /dev/null +++ b/cmake/FindNvToolExt.cmake @@ -0,0 +1,35 @@ +# The following variables are optionally searched for defaults +# NvToolExt_ROOT_DIR: +# +# The following are set after configuration is done: +# NvToolExt_FOUND +# NvToolExt_INCLUDE_DIR +# NvToolExt_LIBRARIES +# NvToolExt_LIBRARY_DIR +# NvToolExt: a target + +include(FindPackageHandleStandardArgs) + +set(NvToolExt_SEARCH_DIRS ${CUDA_TOOLKIT_ROOT_DIR}) +if(WIN32) + list(APPEND NvToolExt_SEARCH_DIRS "C:/Program Files/NVIDIA Corporation/NvToolsExt") +endif() +set(NvToolExt_SEARCH_DIRS ${NvToolExt_ROOT_DIR} ${NvToolExt_SEARCH_DIRS}) + + +find_path(NvToolExt_INCLUDE_DIR nvToolsExt.h HINTS ${NvToolExt_SEARCH_DIRS} PATH_SUFFIXES include) + +# 32bit not considered +set(NvToolExt_LIBNAME nvToolsExt libnvToolsExt.so libnvToolsExt.a libnvToolsExt.so nvToolsExt64_1.lib) +find_library(NvToolExt_LIBRARIES NAMES ${NvToolExt_LIBNAME} HINTS ${NvToolExt_SEARCH_DIRS} + PATH_SUFFIXES lib lib64 cuda/lib cuda/lib64 lib/x64) + +find_package_handle_standard_args(NvToolExt REQUIRED_VARS NvToolExt_INCLUDE_DIR NvToolExt_LIBRARIES) + +add_library(NvToolExt INTERFACE) +target_include_directories(NvToolExt INTERFACE ${NvToolExt_INCLUDE_DIR}) +# target_link_directories(NvToolExt INTERFACE ${NvToolExt_INCLUDE_DIR}) +target_link_libraries(NvToolExt INTERFACE ${NvToolExt_LIBRARIES}) + +unset(NvToolExt_SEARCH_DIRS) +unset(NvToolExt_LIBNAME) diff --git a/cmake/INSTALL.md b/cmake/INSTALL.md new file mode 100644 index 00000000000..0082212eb9b --- /dev/null +++ b/cmake/INSTALL.md @@ -0,0 +1,49 @@ +# Install Instruction + +Execute following commands in the repo root. + +## Build with Old Style Make Generator +```bash +mkdir -p build && cd build +cmake -DCMAKE_INSTALL_PREFIX=../dist .. # configure +cmake --build . --target install -- -j8 # build && install, substitude -j8 with /m:8 if you are on Windows +``` + +## Build with Ninja Generator +``` bash +mkdir -p build && cd build +cmake -GNinja -DCMAKE_INSTALL_PREFIX=../dist .. +cmake --build . --target install +``` + +After built, you can find all installed files in /dist + +# For Advance Configuration + +Follow options are currently available: + +| Variable | Available Options | Default | +| ---------------------- | ------------------------- | -------- | +| MATHLIB | OpenBLAS, MKL, Accelerate | OpenBLAS | +| KALDI_BUILD_EXE | ON,OFF | ON | +| KALDI_BUILD_TEST | ON,OFF | ON | +| KALDI_USE_PATCH_NUMBER | ON,OFF | OFF | +| BUILD_SHARED_LIBS | ON,OFF | OFF | + +Append `-D=` to the configure command to use it, e.g., +`-DKALDI_BUILD_TEST=OFF` will disable building of test executables. For more +information, please refers to +[CMake Documentation](https://cmake.org/cmake/help/latest/manual/cmake.1.html). +For quick learning CMake usage, LLVM's short introuction will do the trick: +[Basic CMake usage](https://llvm.org/docs/CMake.html#usage), +[Options and variables](https://llvm.org/docs/CMake.html#options-and-variables), +[Frequently-used CMake variables](https://llvm.org/docs/CMake.html#frequently-used-cmake-variables). + +NOTE 1: Currently, BUILD_SHARED_LIBS does not work on Windows due to some symbols + (variables) are not properly exported. 
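+
+For example, on Linux or macOS a configure step that selects MKL, builds shared
+libraries and skips the test executables could look like the following (a sketch
+combining the options listed above; adjust the install prefix to your setup):
+
+```bash
+cmake -DCMAKE_INSTALL_PREFIX=../dist \
+      -DMATHLIB=MKL \
+      -DBUILD_SHARED_LIBS=ON \
+      -DKALDI_BUILD_TEST=OFF ..
+```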
+ +NOTE 2: For scripts users, since you are doing an out of source build, and the + install destination is at your disposal, the `$PATH` is not configured + properly in this case. Scripts will not work out of box. See how `$PATH` + is modified in [path.sh](../egs/wsj/s5/path.sh). You should add + `/bin` to your `$PATH` before running any scripts. diff --git a/cmake/Utils.cmake b/cmake/Utils.cmake new file mode 100644 index 00000000000..88dbefdacc9 --- /dev/null +++ b/cmake/Utils.cmake @@ -0,0 +1,46 @@ +if(NOT CMAKE_VERSION VERSION_LESS "3.10") + include_guard() +endif() + +# For Windows, some env or vars are using backward slash for pathes, convert +# them to forward slashes will fix some nasty problem in CMake. +macro(normalize_path in_path) + file(TO_CMAKE_PATH "${${in_path}}" normalize_path_out_path) + set(${in_path} "${normalize_path_out_path}") + unset(normalize_path_out_path) +endmacro() + +macro(normalize_env_path in_path) + file(TO_CMAKE_PATH "$${in_path}" normalize_env_path_out_path) + set(${in_path} "${normalize_env_path_out_path}") + unset(normalize_env_path_out_path) +endmacro() + + +macro(add_kaldi_executable) + if(${KALDI_BUILD_EXE}) + cmake_parse_arguments(kaldi_exe "" "NAME" "SOURCES;DEPENDS" ${ARGN}) + add_executable(${kaldi_exe_NAME} ${kaldi_exe_SOURCES}) + target_link_libraries(${kaldi_exe_NAME} PRIVATE ${kaldi_exe_DEPENDS}) + # list(APPEND KALDI_EXECUTABLES ${kaldi_exe_NAME}) + install(TARGETS ${kaldi_exe_NAME} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) + + unset(kaldi_exe_NAME) + unset(kaldi_exe_SOURCES) + unset(kaldi_exe_DEPENDS) + endif() +endmacro() + +macro(add_kaldi_test_executable) + if(${KALDI_BUILD_TEST}) + cmake_parse_arguments(kaldi_test_exe "" "NAME" "SOURCES;DEPENDS" ${ARGN}) + add_executable(${kaldi_test_exe_NAME} ${kaldi_test_exe_SOURCES}) + target_link_libraries(${kaldi_test_exe_NAME} PRIVATE ${kaldi_test_exe_DEPENDS}) + # list(APPEND KALDI_TEST_EXECUTABLES ${kaldi_test_exe_NAME}) + install(TARGETS ${kaldi_test_exe_NAME} RUNTIME DESTINATION testbin) + + unset(kaldi_test_exe_NAME) + unset(kaldi_test_exe_SOURCES) + unset(kaldi_test_exe_DEPENDS) + endif() +endmacro() diff --git a/cmake/VersionHelper.cmake b/cmake/VersionHelper.cmake new file mode 100644 index 00000000000..e494a255663 --- /dev/null +++ b/cmake/VersionHelper.cmake @@ -0,0 +1,14 @@ +function(get_version) + file(READ ${CMAKE_CURRENT_SOURCE_DIR}/src/.version version) + string(STRIP ${version} version) + execute_process(COMMAND git log -n1 --format=%H src/.version + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + OUTPUT_VARIABLE version_commit + OUTPUT_STRIP_TRAILING_WHITESPACE) + execute_process(COMMAND git rev-list --count "${version_commit}..HEAD" + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + OUTPUT_VARIABLE patch_number) + + set(KALDI_VERSION ${version} PARENT_SCOPE) + set(KALDI_PATCH_NUMBER ${patch_number} PARENT_SCOPE) +endfunction() diff --git a/cmake/gen_cmake_skeleton.py b/cmake/gen_cmake_skeleton.py new file mode 100644 index 00000000000..fa506943662 --- /dev/null +++ b/cmake/gen_cmake_skeleton.py @@ -0,0 +1,310 @@ +import os +import sys +import re +import argparse + +# earily parse, will refernece args globally +parser = argparse.ArgumentParser() +parser.add_argument("working_dir") +parser.add_argument("--quiet", default=False, action="store_true") +args = parser.parse_args() + +def print_wrapper(*args_, **kwargs): + if not args.quiet: + print(*args_, **kwargs) + +def get_subdirectories(d): + return [name for name in os.listdir(d) if os.path.isdir(os.path.join(d, name))] + +def 
is_bin_dir(d): + return d.endswith("bin") + +def get_files(d): + return [name for name in os.listdir(d) if os.path.isfile(os.path.join(d, name))] + +def is_header(f): + return f.endswith(".h") + +def is_cu_source(f): + return f.endswith(".cu") + +def is_test_source(f): + return f.endswith("-test.cc") + +def is_source(f): + return f.endswith(".cc") and not is_test_source(f) + +def dir_name_to_lib_target(dir_name): + return "kaldi-" + dir_name + +def wrap_notwin32_condition(should_wrap, lines): + if isinstance(lines, str): + lines = [lines] + if should_wrap: + return ["if(NOT WIN32)"] + list(map(lambda l: " " + l, lines)) + ["endif()"] + else: + return lines + + +def get_exe_additional_depends(t): + additional = { + "transform-feats" : ["transform"], + "interpolate-pitch" : ["transform"], + "post-to-feats" : ["hmm"], + "append-post-to-feats" : ["hmm"], + "gmm-est-fmllr-gpost": ["sgmm2", "hmm"], + "gmm-est-fmllr": ["hmm", "transform"], + "gmm-latgen-faster": ["decoder"], + "gmm-transform-means": ["hmm"], + "gmm-post-to-gpost": ["hmm"], + "gmm-init-lvtln": ["transform"], + "gmm-rescore-lattice": ["hmm", "lat"], + "gmm-est-fmllr-global": ["transform"], + "gmm-copy": ["hmm"], + "gmm-train-lvtln-special": ["transform", "hmm"], + "gmm-est-map": ["hmm"], + "gmm-acc-stats2": ["hmm"], + "gmm-decode-faster-regtree-mllr": ["decoder"], + "gmm-global-est-fmllr": ["transform"], + "gmm-est-basis-fmllr": ["hmm", "transform"], + "gmm-init-model": ["hmm"], + "gmm-est-weights-ebw": ["hmm"], + "gmm-init-biphone": ["hmm"], + "gmm-compute-likes": ["hmm"], + "gmm-est-fmllr-raw-gpost": ["hmm", "transform"], + # gmm-* is a bottom case, it will add link dependencies to all other + # target whose names start with gmm-, it is harmless, but will increase + # link time. Better to avoid it at best. 
+ "gmm-*": ["hmm", "transform", "lat", "decoder"], + } + if t in additional: + return list(map(lambda name: dir_name_to_lib_target(name), additional[t])) + elif (t.split("-", 1)[0] + "-*") in additional: + wildcard = (t.split("-", 1)[0] + "-*") + return list(map(lambda name: dir_name_to_lib_target(name), additional[wildcard])) + else: + return [] + +def disable_for_win32(t): + disabled = [ + "online-audio-client", + "online-net-client", + "online2-tcp-nnet3-decode-faster", + "online-server-gmm-decode-faster", + "online-audio-server-decode-faster" + ] + return t in disabled + +class CMakeListsHeaderLibrary(object): + def __init__(self, dir_name): + self.dir_name = dir_name + self.target_name = dir_name_to_lib_target(self.dir_name) + self.header_list = [] + + def add_header(self, filename): + self.header_list.append(filename) + + def add_source(self, filename): + pass + + def add_cuda_source(self, filename): + pass + + def add_test_source(self, filename): + pass + + def gen_code(self): + ret = [] + if len(self.header_list) > 0: + ret.append("set(PUBLIC_HEADERS") + for f in self.header_list: + ret.append(" " + f) + ret.append(")\n") + + ret.append("add_library(" + self.target_name + " INTERFACE)") + ret.append("target_include_directories(" + self.target_name + " INTERFACE ") + ret.append(" $") + ret.append(" $") + ret.append(")\n") + + ret.append(""" +install(TARGETS {tgt} EXPORT kaldi-targets) + +install(FILES ${{PUBLIC_HEADERS}} DESTINATION include/kaldi/{dir}) +""".format(tgt=self.target_name, dir=self.dir_name)) + + return "\n".join(ret) + +class CMakeListsLibrary(object): + + def __init__(self, dir_name): + self.dir_name = dir_name + self.target_name = dir_name_to_lib_target(self.dir_name) + self.header_list = [] + self.source_list = [] + self.cuda_source_list = [] + self.test_source_list = [] + self.depends = [] + + def add_header(self, filename): + self.header_list.append(filename) + + def add_source(self, filename): + self.source_list.append(filename) + + def add_cuda_source(self, filename): + self.cuda_source_list.append(filename) + + def add_test_source(self, filename): + self.test_source_list.append(filename) + + def load_dependency_from_makefile(self, filename): + with open(filename) as f: + makefile = f.read() + if "ADDLIBS" not in makefile: + print_wrapper("WARNING: non-standard", filename) + return + libs = makefile.split("ADDLIBS")[-1].split("\n\n")[0] + libs = re.findall("[^\s\\\\=]+", libs) + for l in libs: + self.depends.append(os.path.splitext(os.path.basename(l))[0]) + + def gen_code(self): + ret = [] + + if len(self.header_list) > 0: + ret.append("set(PUBLIC_HEADERS") + for f in self.header_list: + ret.append(" " + f) + ret.append(")\n") + + if len(self.cuda_source_list) > 0: + self.source_list.append("${CUDA_OBJS}") + ret.append("cuda_include_directories(${CMAKE_CURRENT_SOURCE_DIR}/..)") + ret.append("cuda_compile(CUDA_OBJS") + for f in self.cuda_source_list: + ret.append(" " + f) + ret.append(")\n") + + ret.append("add_library(" + self.target_name) + for f in self.source_list: + ret.append(" " + f) + ret.append(")\n") + ret.append("target_include_directories(" + self.target_name + " PUBLIC ") + ret.append(" $") + ret.append(" $") + ret.append(")\n") + + if len(self.depends) > 0: + ret.append("target_link_libraries(" + self.target_name + " PUBLIC") + for d in self.depends: + ret.append(" " + d) + ret.append(")\n") + + def get_test_exe_name(filename): + exe_name = os.path.splitext(f)[0] + if self.dir_name.startswith("nnet") and exe_name.startswith("nnet"): + return 
self.dir_name + "-" + exe_name.split("-", 1)[1] + else: + return exe_name + + if len(self.test_source_list) > 0: + ret.append("if(KALDI_BUILD_TEST)") + for f in self.test_source_list: + exe_target = get_test_exe_name(f) + depends = (self.target_name + " " + " ".join(get_exe_additional_depends(exe_target))).strip() + ret.extend(wrap_notwin32_condition(disable_for_win32(self.target_name), + " add_kaldi_test_executable(NAME " + exe_target + " SOURCES " + f + " DEPENDS " + depends + ")")) + ret.append("endif()") + + ret.append(""" +install(TARGETS {tgt} + EXPORT kaldi-targets + ARCHIVE DESTINATION ${{CMAKE_INSTALL_LIBDIR}} + LIBRARY DESTINATION ${{CMAKE_INSTALL_LIBDIR}} + RUNTIME DESTINATION ${{CMAKE_INSTALL_BINDIR}} +) + +install(FILES ${{PUBLIC_HEADERS}} DESTINATION include/kaldi/{dir}) +""".format(tgt=self.target_name, dir=self.dir_name)) + + return "\n".join(ret) + + + +class CMakeListsExecutable(object): + + def __init__(self, dir_name, filename): + assert(dir_name.endswith("bin")) + self.list = [] + exe_name = os.path.splitext(os.path.basename(filename))[0] + file_name = filename + depend = dir_name_to_lib_target(dir_name[:-3]) + self.list.append((exe_name, file_name, depend)) + + def gen_code(self): + ret = [] + for exe_name, file_name, depend in self.list: + depends = (depend + " " + " ".join(get_exe_additional_depends(exe_name))).strip() + ret.extend(wrap_notwin32_condition(disable_for_win32(exe_name), + "add_kaldi_executable(NAME " + exe_name + " SOURCES " + file_name + " DEPENDS " + depends + ")")) + + return "\n".join(ret) + +class CMakeListsFile(object): + + GEN_CMAKE_HEADER = "# generated with cmake/gen_cmake_skeleton.py, DO NOT MODIFY.\n" + + def __init__(self, directory): + self.path = os.path.realpath(os.path.join(directory, "CMakeLists.txt")) + self.sections = [] + + def add_section(self, section): + self.sections.append(section) + + def write_file(self): + with open(self.path, "w", newline='\n') as f: # good luck for python2 + f.write(CMakeListsFile.GEN_CMAKE_HEADER) + for s in self.sections: + code = s.gen_code() + f.write(code) + f.write("\n") + print_wrapper(" Writed", self.path) + + +if __name__ == "__main__": + os.chdir(args.working_dir) + print_wrapper("Working in ", args.working_dir) + + subdirs = get_subdirectories(".") + for d in subdirs: + cmakelists = CMakeListsFile(d) + if is_bin_dir(d): + for f in get_files(d): + if is_source(f): + dir_name = os.path.basename(d) + filename = os.path.basename(f) + exe = CMakeListsExecutable(dir_name, filename) + cmakelists.add_section(exe) + else: + dir_name = os.path.basename(d) + lib = None + makefile = os.path.join(d, "Makefile") + if not os.path.exists(makefile): + lib = CMakeListsHeaderLibrary(dir_name) + else: + lib = CMakeListsLibrary(dir_name) + lib.load_dependency_from_makefile(makefile) + cmakelists.add_section(lib) + for f in sorted(get_files(d)): + filename = os.path.basename(f) + if is_source(filename): + lib.add_source(filename) + elif is_cu_source(filename): + lib.add_cuda_source(filename) + elif is_test_source(filename): + lib.add_test_source(filename) + elif is_header(filename): + lib.add_header(filename) + + cmakelists.write_file() diff --git a/cmake/kaldi-config.cmake.in b/cmake/kaldi-config.cmake.in new file mode 100644 index 00000000000..123f58c5699 --- /dev/null +++ b/cmake/kaldi-config.cmake.in @@ -0,0 +1,7 @@ +@PACKAGE_INIT@ + +find_package(Threads) + +if(NOT TARGET kaldi-base) + include(${CMAKE_CURRENT_LIST_DIR}/kaldi-targets.cmake) +endif() diff --git a/cmake/third_party/get_third_party.cmake 
b/cmake/third_party/get_third_party.cmake new file mode 100644 index 00000000000..8e24dc9f643 --- /dev/null +++ b/cmake/third_party/get_third_party.cmake @@ -0,0 +1,20 @@ +# Download and unpack a third-party library at configure time +# The original code is at the README of google-test: +# https://github.com/google/googletest/tree/master/googletest +function(get_third_party name) + configure_file( + "${PROJECT_SOURCE_DIR}/cmake/third_party/${name}.cmake" + "${CMAKE_CURRENT_BINARY_DIR}/${name}-download/CMakeLists.txt") + execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" . + RESULT_VARIABLE result + WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/${name}-download") + if(result) + message(FATAL_ERROR "CMake step for ${name} failed: ${result}") + endif() + execute_process(COMMAND ${CMAKE_COMMAND} --build . + RESULT_VARIABLE result + WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/${name}-download") + if(result) + message(FATAL_ERROR "Build step for ${name} failed: ${result}") + endif() +endfunction() diff --git a/cmake/third_party/openfst.cmake b/cmake/third_party/openfst.cmake new file mode 100644 index 00000000000..19a7f527f8f --- /dev/null +++ b/cmake/third_party/openfst.cmake @@ -0,0 +1,14 @@ +cmake_minimum_required(VERSION 2.8.2) +project(openfst-download NONE) + +include(ExternalProject) +ExternalProject_Add(openfst + GIT_REPOSITORY https://github.com/kkm000/openfst + GIT_TAG 0bca6e76d24647427356dc242b0adbf3b5f1a8d9 # tag win/1.7.2.1 + SOURCE_DIR "${CMAKE_BINARY_DIR}/openfst" + BINARY_DIR "" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "" +) diff --git a/cmake/third_party/openfst_lib_target.cmake b/cmake/third_party/openfst_lib_target.cmake new file mode 100644 index 00000000000..dde5efc402a --- /dev/null +++ b/cmake/third_party/openfst_lib_target.cmake @@ -0,0 +1,31 @@ +if(NOT OPENFST_ROOT_DIR) + message(FATAL_ERROR) +endif() + +set(fst_source_dir ${OPENFST_ROOT_DIR}/src/lib) +set(fst_include_dir ${OPENFST_ROOT_DIR}/src/include) + +include_directories(${fst_include_dir}) +file(GLOB fst_sources "${fst_source_dir}/*.cc") + +add_library(fst ${fst_sources}) +target_include_directories(fst PUBLIC + $ + $ +) + +install(TARGETS fst + EXPORT kaldi-targets + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} +) + +install(DIRECTORY ${fst_include_dir}/fst + DESTINATION include/openfst + PATTERN "test/*.h" EXCLUDE +) + +unset(fst_source_dir) +unset(fst_include_dir) +unset(fst_sources) diff --git a/docker/README.md b/docker/README.md new file mode 100644 index 00000000000..852e9531bd6 --- /dev/null +++ b/docker/README.md @@ -0,0 +1,30 @@ +# Kaldi Docker images + +Kaldi offers two set of images: CPU-based images and GPU-based images. Daily builds of the latest version of the master branch (both CPU and GPU images) are pushed daily to [DockerHub](https://hub.docker.com/r/kaldiasr/kaldi). + +## Using pre-built images +Sample usage of the CPU based images: +```bash +docker run -it kaldiasr/kaldi:latest bash +``` + +Sample usage of the GPU based images: + +Note: use [nvidia-docker](https://github.com/NVIDIA/nvidia-docker) to run the GPU images. + +```bash +docker run -it --runtime=nvidia kaldiasr/kaldi:gpu-latest bash +``` + +## Building images locally +For building the CPU-based image: +```bash +cd docker/debian9.8-cpu +docker build --tag kaldiasr/kaldi:latest . 
+``` + +and for GPU-based image: +```bash +cd docker/ubuntu16.04-gpu +docker build --tag kaldiasr/kaldi:gpu-latest . +``` diff --git a/docker/debian9.8-cpu/Dockerfile b/docker/debian9.8-cpu/Dockerfile new file mode 100644 index 00000000000..db0b9c47a73 --- /dev/null +++ b/docker/debian9.8-cpu/Dockerfile @@ -0,0 +1,41 @@ + +FROM debian:9.8 +LABEL maintainer="mdoulaty@gmail.com" + +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + g++ \ + make \ + automake \ + autoconf \ + bzip2 \ + unzip \ + wget \ + sox \ + libtool \ + git \ + subversion \ + python2.7 \ + python3 \ + zlib1g-dev \ + ca-certificates \ + gfortran \ + patch \ + ffmpeg \ + vim && \ + rm -rf /var/lib/apt/lists/* + +RUN ln -s /usr/bin/python2.7 /usr/bin/python + +RUN git clone --depth 1 https://github.com/kaldi-asr/kaldi.git /opt/kaldi && \ + cd /opt/kaldi && \ + cd /opt/kaldi/tools && \ + ./extras/install_mkl.sh && \ + make -j $(nproc) && \ + cd /opt/kaldi/src && \ + ./configure --shared && \ + make depend -j $(nproc) && \ + make -j $(nproc) + +WORKDIR /opt/kaldi/ + diff --git a/docker/ubuntu16.04-gpu/Dockerfile b/docker/ubuntu16.04-gpu/Dockerfile new file mode 100644 index 00000000000..d705a5c1689 --- /dev/null +++ b/docker/ubuntu16.04-gpu/Dockerfile @@ -0,0 +1,41 @@ + +FROM nvidia/cuda:10.0-cudnn7-devel-ubuntu16.04 +LABEL maintainer="mdoulaty@gmail.com" + +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + g++ \ + make \ + automake \ + autoconf \ + bzip2 \ + unzip \ + wget \ + sox \ + libtool \ + git \ + subversion \ + python2.7 \ + python3 \ + zlib1g-dev \ + gfortran \ + ca-certificates \ + patch \ + ffmpeg \ + vim && \ + rm -rf /var/lib/apt/lists/* + +RUN ln -s /usr/bin/python2.7 /usr/bin/python + +RUN git clone --depth 1 https://github.com/kaldi-asr/kaldi.git /opt/kaldi && \ + cd /opt/kaldi && \ + cd /opt/kaldi/tools && \ + ./extras/install_mkl.sh && \ + make -j $(nproc) && \ + cd /opt/kaldi/src && \ + ./configure --shared --use-cuda && \ + make depend -j $(nproc) && \ + make -j $(nproc) + +WORKDIR /opt/kaldi/ + diff --git a/egs/aidatatang_200zh/README.md b/egs/aidatatang_200zh/README.md new file mode 100644 index 00000000000..097454d84ce --- /dev/null +++ b/egs/aidatatang_200zh/README.md @@ -0,0 +1,21 @@ +Aidatatang_200zh is a free Chinese Mandarin speech corpus provided by Beijing DataTang Technology Co., Ltd under Creative Commons Attribution-NonCommercial-NoDerivatives 4.0 International Public License. + +**About the aidatatang_200zh corpus:** + +- The corpus contains 200 hours of acoustic data, which is mostly mobile recorded data. +- 600 speakers from different accent areas in China are invited to participate in the recording. +- The transcription accuracy for each sentence is larger than 98%. +- Recordings are conducted in a quiet indoor environment. +- The database is divided into training set, validation set, and testing set in a ratio of 7: 1: 2. +- Detail information such as speech data coding and speaker information is preserved in the metadata file. +- Segmented transcripts are also provided. + +You can get the corpus from [here](https://www.datatang.com/webfront/opensource.html). + +DataTang is a community of creators-of world-changers and future-builders. We're invested in collaborating with a diverse set of voices in the AI world, and are excited about working on large-scale projects. Beyond speech, we're providing multiple resources in image, and text. For more details, please visit [datatang](). 
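+
+If you prefer to fetch the corpus from the command line, the recipe also ships a
+download helper. Run from the `s5` directory, something like the following should
+download and unpack the data from OpenSLR (a sketch based on the helper's own usage
+example; point the first argument at an existing directory with enough free space):
+
+```bash
+local/download_and_untar.sh /export/a05/xna/data www.openslr.org/resources/62 aidatatang_200zh
+```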
+ +**About the recipe:** + +To demonstrate that this corpus is a reasonable data resource for Chinese Mandarin speech recognition research, a baseline recipe is provided here for everyone to explore their own systems easily and quickly. + +In this directory, each subdirectory contains the scripts for a sequence of experiments. The recipe in subdirectory "s5" is based on the hkust s5 recipe and aishell s5 recipe. It generates an integrated phonetic lexicon with CMU dictionary and cedit dictionary. This recipe follows the Mono+Triphone+SAT+fMLLR+DNN pipeline. In addition, this directory will be extended as scripts for speaker diarization and so on are created. diff --git a/egs/aidatatang_200zh/s5/RESULTS b/egs/aidatatang_200zh/s5/RESULTS new file mode 100644 index 00000000000..8c458e8015e --- /dev/null +++ b/egs/aidatatang_200zh/s5/RESULTS @@ -0,0 +1,17 @@ +%WER 37.09 [ 173936 / 468933, 4868 ins, 31143 del, 137925 sub ] exp/mono/decode_test/cer_10_0.0 +%WER 17.98 [ 84305 / 468933, 4724 ins, 12637 del, 66944 sub ] exp/tri1/decode_test/cer_13_0.0 +%WER 17.94 [ 84149 / 468933, 5025 ins, 12427 del, 66697 sub ] exp/tri2/decode_test/cer_13_0.0 +%WER 17.26 [ 80945 / 468933, 4421 ins, 12958 del, 63566 sub ] exp/tri3a/decode_test/cer_14_0.0 +%WER 14.16 [ 66424 / 468933, 4567 ins, 10224 del, 51633 sub ] exp/tri4a/decode_test/cer_14_0.0 +%WER 12.22 [ 57304 / 468933, 4799 ins, 8197 del, 44308 sub ] exp/tri5a/decode_test/cer_14_0.0 +%WER 5.59 [ 26232 / 468933, 1701 ins, 4377 del, 20154 sub ] exp/chain/tdnn_1a_sp/decode_test/cer_10_0.0 + +# nnet3 tdnn with online pitch, local/nnet3/tuning/run_tdnn_2a.sh +%WER 7.21 [ 33797 / 468933, 2141 ins, 6117 del, 25539 sub ] exp/nnet3/tdnn_sp/decode_test/cer_13_0.0 +%WER 7.44 [ 34878 / 468933, 2252 ins, 5854 del, 26772 sub ] exp/nnet3/tdnn_sp_online/decode_test/cer_12_0.0 +%WER 7.79 [ 36542 / 468933, 2527 ins, 5674 del, 28341 sub ] exp/nnet3/tdnn_sp_online/decode_test_per_utt/cer_12_0.0 + +# chain with online pitch, local/chain/tuning/run_tdnn_2a.sh +%WER 5.61 [ 26311 / 468933, 1773 ins, 4789 del, 19749 sub ] exp/chain/tdnn_2a_sp/decode_test/cer_11_0.0 +%WER 5.69 [ 26661 / 468933, 1723 ins, 4724 del, 20214 sub ] exp/chain/tdnn_2a_sp_online/decode_test/cer_11_0.0 +%WER 5.98 [ 28046 / 468933, 2031 ins, 4527 del, 21488 sub ] exp/chain/tdnn_2a_sp_online/decode_test_per_utt/cer_11_0.0 diff --git a/egs/aidatatang_200zh/s5/cmd.sh b/egs/aidatatang_200zh/s5/cmd.sh new file mode 100644 index 00000000000..811adcde474 --- /dev/null +++ b/egs/aidatatang_200zh/s5/cmd.sh @@ -0,0 +1,14 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
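+#
+# For example, to run everything locally with run.pl instead of a grid engine,
+# you could use the following (a sketch; only advisable on a machine with
+# plenty of memory, as warned above):
+#   export train_cmd=run.pl
+#   export decode_cmd=run.pl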
+ +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" diff --git a/egs/aidatatang_200zh/s5/conf/cmu2pinyin b/egs/aidatatang_200zh/s5/conf/cmu2pinyin new file mode 100644 index 00000000000..c02eb600fcc --- /dev/null +++ b/egs/aidatatang_200zh/s5/conf/cmu2pinyin @@ -0,0 +1,39 @@ +AA A +AE A +AH A +AO UO +AW U +AY AI +B B +CH CH +D D +DH S I +EH AI +ER E +EY AI +F F +G G +HH H +IH I +IY I +JH ZH +K K +L L +M M +N N +NG N +OW UO +OY UO +P P +R R +S S +SH SH +T T +TH S +UH U +UW U +V W +W W +Y Y +Z Z +ZH X diff --git a/egs/aidatatang_200zh/s5/conf/decode.config b/egs/aidatatang_200zh/s5/conf/decode.config new file mode 100644 index 00000000000..d91f86183af --- /dev/null +++ b/egs/aidatatang_200zh/s5/conf/decode.config @@ -0,0 +1,5 @@ +beam=11.0 # beam for decoding. Was 13.0 in the scripts. +first_beam=8.0 # beam for 1st-pass decoding in SAT. + + + diff --git a/egs/aidatatang_200zh/s5/conf/mfcc.conf b/egs/aidatatang_200zh/s5/conf/mfcc.conf new file mode 100644 index 00000000000..a1aa3d6c158 --- /dev/null +++ b/egs/aidatatang_200zh/s5/conf/mfcc.conf @@ -0,0 +1,2 @@ +--use-energy=false # only non-default option. +--sample-frequency=16000 diff --git a/egs/aidatatang_200zh/s5/conf/mfcc_hires.conf b/egs/aidatatang_200zh/s5/conf/mfcc_hires.conf new file mode 100644 index 00000000000..ca067e77b37 --- /dev/null +++ b/egs/aidatatang_200zh/s5/conf/mfcc_hires.conf @@ -0,0 +1,10 @@ +# config for high-resolution MFCC features, intended for neural network training. +# Note: we keep all cepstra, so it has the same info as filterbank features, +# but MFCC is more easily compressible (because less correlated) which is why +# we prefer this method. +--use-energy=false # use average of log energy, not energy. +--sample-frequency=16000 # Switchboard is sampled at 8kHz +--num-mel-bins=40 # similar to Google's setup. +--num-ceps=40 # there is no dimensionality reduction. +--low-freq=40 # low cutoff frequency for mel bins +--high-freq=-200 # high cutoff frequently, relative to Nyquist of 8000 (=3800) diff --git a/egs/aidatatang_200zh/s5/conf/online_cmvn.conf b/egs/aidatatang_200zh/s5/conf/online_cmvn.conf new file mode 100644 index 00000000000..591367e7ae9 --- /dev/null +++ b/egs/aidatatang_200zh/s5/conf/online_cmvn.conf @@ -0,0 +1 @@ +# configuration file for apply-cmvn-online, used when invoking online2-wav-nnet3-latgen-faster. 
diff --git a/egs/aidatatang_200zh/s5/conf/online_pitch.conf b/egs/aidatatang_200zh/s5/conf/online_pitch.conf new file mode 100644 index 00000000000..c0f1342160d --- /dev/null +++ b/egs/aidatatang_200zh/s5/conf/online_pitch.conf @@ -0,0 +1,4 @@ +--sample-frequency=16000 +--simulate-first-pass-online=true +--normalization-right-context=25 +--frames-per-chunk=10 diff --git a/egs/aidatatang_200zh/s5/conf/pinyin2cmu b/egs/aidatatang_200zh/s5/conf/pinyin2cmu new file mode 100644 index 00000000000..a6e53620479 --- /dev/null +++ b/egs/aidatatang_200zh/s5/conf/pinyin2cmu @@ -0,0 +1,58 @@ +A AA +AI AY +AN AE N +ANG AE NG +AO AW +B B +CH CH +C T S +D D +E ER +EI EY +EN AH N +ENG AH NG +ER AA R +F F +G G +H HH +IA IY AA +IANG IY AE NG +IAN IY AE N +IAO IY AW +IE IY EH +I IY +ING IY NG +IN IY N +IONG IY UH NG +IU IY UH +J J +K K +L L +M M +N N +O AO +ONG UH NG +OU OW +P P +Q Q +R R +SH SH +S S +T T +UAI UW AY +UANG UW AE NG +UAN UW AE N +UA UW AA +UI UW IY +UN UW AH N +UO UW AO +U UW +UE IY EH +VE IY EH +V IY UW +VN IY N +W W +X X +Y Y +ZH JH +Z Z diff --git a/egs/aidatatang_200zh/s5/conf/pinyin_initial b/egs/aidatatang_200zh/s5/conf/pinyin_initial new file mode 100644 index 00000000000..e263ad07e2a --- /dev/null +++ b/egs/aidatatang_200zh/s5/conf/pinyin_initial @@ -0,0 +1,23 @@ +B +C +CH +D +F +G +H +J +K +L +M +N +P +Q +R +S +SH +T +W +X +Y +Z +ZH diff --git a/egs/aidatatang_200zh/s5/conf/pitch.conf b/egs/aidatatang_200zh/s5/conf/pitch.conf new file mode 100644 index 00000000000..e959a19d5b8 --- /dev/null +++ b/egs/aidatatang_200zh/s5/conf/pitch.conf @@ -0,0 +1 @@ +--sample-frequency=16000 diff --git a/egs/aidatatang_200zh/s5/local/chain/compare_wer.sh b/egs/aidatatang_200zh/s5/local/chain/compare_wer.sh new file mode 100755 index 00000000000..71e6fbe106d --- /dev/null +++ b/egs/aidatatang_200zh/s5/local/chain/compare_wer.sh @@ -0,0 +1,97 @@ +#!/bin/bash +# Copyright 2018 Emotech LTD (Author: Xuechen Liu) + +# compare wer between diff. models in aidatatang_200zh chain directory +# exemplar usage: local/chain/compare_wer.sh --online exp/chain/tdnn_2a_sp +# note: this script is made quite general since we kinda wanna give more flexibility to +# users on adding affix for their own use when training models. + +set -e +. ./cmd.sh +. ./path.sh + +if [ $# == 0 ]; then + echo "Usage: $0: [--online] [ ... ]" + echo "e.g.: $0 --online exp/chain/tdnn_2a_sp" + exit 1 +fi + +echo "# $0 $*" + +include_online=false +if [ "$1" == "--online" ]; then + include_online=true + shift +fi + +set_names() { + if [ $# != 1 ]; then + echo "compare_wer.sh: internal error" + exit 1 # exit the program + fi + dirname=$(echo $1 | cut -d: -f1) +} + +# print model names +echo -n "# Model " +for x in $*; do + printf "% 10s" " $(basename $x)" +done +echo + +# print decode WER results +echo -n "# WER(%) " +for x in $*; do + set_names $x + wer=$([ -d $x ] && grep WER $x/decode_test/cer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer +done +echo + +# so how about online WER? 
+if $include_online; then + echo -n "# WER(%)[online] " + for x in $*; do + set_names $x + wer=$(cat ${x}_online/decode_test/cer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + echo -n "# WER(%)[per-utt] " + for x in $*; do + set_names $x + wer_per_utt=$(cat ${x}_online/decode_test_per_utt/cer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer_per_utt + done + echo +fi + +# print final log prob for train & validation +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf($8)}' | cut -c1-7) + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf($8)}' | cut -c1-7) + printf "% 10s" $prob +done +echo + +# do the same for xent objective +echo -n "# Final train prob (xent)" +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob (xent)" +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo diff --git a/egs/aidatatang_200zh/s5/local/chain/run_tdnn.sh b/egs/aidatatang_200zh/s5/local/chain/run_tdnn.sh new file mode 120000 index 00000000000..34499362831 --- /dev/null +++ b/egs/aidatatang_200zh/s5/local/chain/run_tdnn.sh @@ -0,0 +1 @@ +tuning/run_tdnn_1a.sh \ No newline at end of file diff --git a/egs/aidatatang_200zh/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/aidatatang_200zh/s5/local/chain/tuning/run_tdnn_1a.sh new file mode 100644 index 00000000000..0be0e2c79c6 --- /dev/null +++ b/egs/aidatatang_200zh/s5/local/chain/tuning/run_tdnn_1a.sh @@ -0,0 +1,193 @@ +#!/bin/bash + +# This script is based on run_tdnn_7h.sh in swbd chain recipe. + +# results +# local/chain/compare_wer.sh exp/chain/tdnn_1a_sp/ +# Model tdnn_1a_sp +# WER(%) 5.59 +# Final train prob -0.0488 +# Final valid prob -0.0925 +# Final train prob (xent) -0.8001 +# Final valid prob (xent) -1.0398 + +set -e + +# configs for 'chain' +affix= +stage=0 +train_stage=-10 +get_egs_stage=-10 +dir=exp/chain/tdnn_1a # Note: _sp will get added to this +decode_iter= + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=2 +num_jobs_final=12 +minibatch_size=128 +frames_per_eg=150,110,90 +remove_egs=true +common_egs_dir= +xent_regularize=0.1 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 9 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 5000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 10 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=43 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=625 + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=625 + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=625 + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=625 + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=625 + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=625 + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=625 target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=625 target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 11 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aidatatang-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri5a_sp_lats \ + --dir $dir || exit 1; +fi + +if [ $stage -le 12 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph +fi + +graph_dir=$dir/graph +if [ $stage -le 13 ]; then + for test_set in dev test; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 10 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_$test_set \ + $graph_dir data/${test_set}_hires $dir/decode_${test_set} || exit 1; + done +fi + +exit; diff --git a/egs/aidatatang_200zh/s5/local/chain/tuning/run_tdnn_2a.sh b/egs/aidatatang_200zh/s5/local/chain/tuning/run_tdnn_2a.sh new file mode 100644 index 00000000000..78dd4000e58 --- /dev/null +++ b/egs/aidatatang_200zh/s5/local/chain/tuning/run_tdnn_2a.sh @@ -0,0 +1,238 @@ +#!/bin/bash + +# This script is based on run_tdnn_1a.sh. +# This setup used online pitch to train the neural network. +# It requires a online_pitch.conf in the conf dir. 
+ +# results +# local/chain/compare_wer.sh exp/chain/tdnn_2a_sp +# Model tdnn_2a_sp +# WER(%) 5.61 +# Final train prob -0.0502 +# Final valid prob -0.0913 +# Final train prob (xent) -0.8047 +# Final valid prob (xent) -1.0292 + +# local/chain/compare_wer.sh --online exp/chain/tdnn_2a_sp +# Model tdnn_2a_sp +# WER(%) 5.61 +# WER(%)[online] 5.69 +# WER(%)[per-utt] 5.98 +# Final train prob -0.0502 +# Final valid prob -0.0913 +# Final train prob (xent) -0.8047 +# Final valid prob (xent) -1.0292 + +# local/chain/compare_wer.sh exp/chain/tdnn_1a_sp exp/chain/tdnn_2a_sp +# Model tdnn_1a_sp tdnn_2a_sp +# WER(%) 5.59 5.61 +# Final train prob -0.0488 -0.0502 +# Final valid prob -0.0925 -0.0913 +# Final train prob (xent) -0.8001 -0.8047 +# Final valid prob (xent) -1.0398 -1.0292 + +set -e + +# configs for 'chain' +affix= +stage=0 +train_stage=-10 +get_egs_stage=-10 +dir=exp/chain/tdnn_2a # Note: _sp will get added to this +decode_iter= + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=2 +num_jobs_final=12 +minibatch_size=128 +frames_per_eg=150,110,90 +remove_egs=true +common_egs_dir= +xent_regularize=0.1 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 9 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 5000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 10 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=43 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=625 + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=625 + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=625 + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=625 + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=625 + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=625 + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=625 target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=625 target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 11 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aidatatang-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires_online \ + --tree-dir $treedir \ + --lat-dir exp/tri5a_sp_lats \ + --dir $dir || exit 1; +fi + +if [ $stage -le 12 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph +fi + +graph_dir=$dir/graph +if [ $stage -le 13 ]; then + for test_set in dev test; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 10 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_$test_set \ + $graph_dir data/${test_set}_hires_online $dir/decode_${test_set} || exit 1; + done +fi + +if [ $stage -le 14 ]; then + steps/online/nnet3/prepare_online_decoding.sh --mfcc-config conf/mfcc_hires.conf \ + --add-pitch true \ + $lang exp/nnet3/extractor "$dir" ${dir}_online || exit 1; +fi + +dir=${dir}_online +if [ $stage -le 15 ]; then + for test_set in dev test; do + steps/online/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 10 --cmd "$decode_cmd" \ + --config conf/decode.config \ + $graph_dir data/${test_set}_hires_online $dir/decode_${test_set} || exit 1; + done +fi + +if [ $stage -le 16 ]; then + for test_set in dev test; do + steps/online/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 10 --cmd "$decode_cmd" --per-utt true \ + --config conf/decode.config \ + $graph_dir data/${test_set}_hires_online $dir/decode_${test_set}_per_utt || exit 1; + done +fi + +exit; diff --git a/egs/aidatatang_200zh/s5/local/create_oov_char_lexicon.pl b/egs/aidatatang_200zh/s5/local/create_oov_char_lexicon.pl new file mode 100644 index 00000000000..33e2e8061c3 --- /dev/null +++ b/egs/aidatatang_200zh/s5/local/create_oov_char_lexicon.pl @@ -0,0 +1,48 @@ +#!/usr/bin/env perl +# Copyright 2016 Alibaba Robotics Corp. (Author: Xingyu Na) +# +# A script for char-based Chinese OOV lexicon generation. +# +# Input 1: char-based dictionary, example +# CHAR1 ph1 ph2 +# CHAR2 ph3 +# CHAR3 ph2 ph4 +# +# Input 2: OOV word list, example +# WORD1 +# WORD2 +# WORD3 +# +# where WORD1 is in the format of "CHAR1CHAR2". +# +# Output: OOV lexicon, in the format of normal lexicon + +if($#ARGV != 1) { + print STDERR "usage: perl create_oov_char_lexicon.pl chardict oovwordlist > oovlex\n\n"; + print STDERR "### chardict: a dict in which each line contains the pronunciation of one Chinese char\n"; + print STDERR "### oovwordlist: OOV word list\n"; + print STDERR "### oovlex: output OOV lexicon\n"; + exit; +} + +use utf8; +my %prons; +open(DICT, $ARGV[0]) || die("Can't open dict ".$ARGV[0]."\n"); +binmode(DICT,":encoding(utf8)"); +foreach () { + chomp; @A = split(" ", $_); $prons{$A[0]} = $A[1]; +} +close DICT; + +open(WORDS, $ARGV[1]) || die("Can't open oov word list ".$ARGV[1]."\n"); +binmode(WORDS,":encoding(utf8)"); +while () { + chomp; + print $_; + @A = split("", $_); + foreach (@A) { + print " $prons{$_}"; + } + print "\n"; +} +close WORDS; diff --git a/egs/aidatatang_200zh/s5/local/data_prep.sh b/egs/aidatatang_200zh/s5/local/data_prep.sh new file mode 100644 index 00000000000..bb278a7d904 --- /dev/null +++ b/egs/aidatatang_200zh/s5/local/data_prep.sh @@ -0,0 +1,68 @@ +#!/bin/bash + +# Copyright 2017 Xingyu Na +# Apache 2.0 + +. ./path.sh || exit 1; + +if [ $# != 2 ]; then + echo "Usage: $0 " + echo " $0 /export/a05/xna/data/data_aidatatang_200zh/corpus /export/a05/xna/data/data_aidatatang_200zh/transcript" + exit 1; +fi + +aidatatang_audio_dir=$1 +aidatatang_text=$2/aidatatang_200_zh_transcript.txt + +train_dir=data/local/train +dev_dir=data/local/dev +test_dir=data/local/test +tmp_dir=data/local/tmp + +mkdir -p $train_dir +mkdir -p $dev_dir +mkdir -p $test_dir +mkdir -p $tmp_dir + +# data directory check +if [ ! -d $aidatatang_audio_dir ] || [ ! 
-f $aidatatang_text ]; then + echo "Error: $0 requires two directory arguments" + exit 1; +fi + +# find wav audio file for train, dev and test resp. +find $aidatatang_audio_dir -iname "*.wav" > $tmp_dir/wav.flist +n=`cat $tmp_dir/wav.flist | wc -l` +[ $n -ne 237265 ] && \ + echo Warning: expected 237265 data files, found $n + +grep -i "corpus/train" $tmp_dir/wav.flist > $train_dir/wav.flist || exit 1; +grep -i "corpus/dev" $tmp_dir/wav.flist > $dev_dir/wav.flist || exit 1; +grep -i "corpus/test" $tmp_dir/wav.flist > $test_dir/wav.flist || exit 1; + +rm -r $tmp_dir + +# Transcriptions preparation +for dir in $train_dir $dev_dir $test_dir; do + echo Preparing $dir transcriptions + sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{print $NF}' > $dir/utt.list + sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{i=NF-1;printf("%s %s\n",$NF,$i)}' > $dir/utt2spk_all + paste -d' ' $dir/utt.list $dir/wav.flist > $dir/wav.scp_all + utils/filter_scp.pl -f 1 $dir/utt.list $aidatatang_text > $dir/transcripts.txt + awk '{print $1}' $dir/transcripts.txt > $dir/utt.list + utils/filter_scp.pl -f 1 $dir/utt.list $dir/utt2spk_all | sort -u > $dir/utt2spk + utils/filter_scp.pl -f 1 $dir/utt.list $dir/wav.scp_all | sort -u > $dir/wav.scp + sort -u $dir/transcripts.txt > $dir/text + utils/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt +done + +mkdir -p data/train data/dev data/test + +for f in spk2utt utt2spk wav.scp text; do + cp $train_dir/$f data/train/$f || exit 1; + cp $dev_dir/$f data/dev/$f || exit 1; + cp $test_dir/$f data/test/$f || exit 1; +done + +echo "$0: aidatatang_200zh data preparation succeeded" +exit 0; diff --git a/egs/aidatatang_200zh/s5/local/download_and_untar.sh b/egs/aidatatang_200zh/s5/local/download_and_untar.sh new file mode 100644 index 00000000000..39f9ac01ff7 --- /dev/null +++ b/egs/aidatatang_200zh/s5/local/download_and_untar.sh @@ -0,0 +1,110 @@ +#!/bin/bash + +# Copyright 2014 Johns Hopkins University (author: Daniel Povey) +# 2017 Xingyu Na +# Apache 2.0 + +remove_archive=false + +if [ "$1" == --remove-archive ]; then + remove_archive=true + shift +fi + +if [ $# -ne 3 ]; then + echo "Usage: $0 [--remove-archive] " + echo "e.g.: $0 /export/a05/xna/data www.openslr.org/resources/62 aidatatang_200zh" + echo "With --remove-archive it will remove the archive after successfully un-tarring it." + echo " can be one of: aidatatang_200zh." +fi + +data=$1 +url=$2 +part=$3 + +if [ ! -d "$data" ]; then + echo "$0: no such directory $data" + exit 1; +fi + +part_ok=false +list="aidatatang_200zh" +for x in $list; do + if [ "$part" == $x ]; then part_ok=true; fi +done +if ! $part_ok; then + echo "$0: expected to be one of $list, but got '$part'" + exit 1; +fi + +if [ -z "$url" ]; then + echo "$0: empty URL base." + exit 1; +fi + +if [ -f $data/$part/.complete ]; then + echo "$0: data part $part was already successfully extracted, nothing to do." + exit 0; +fi + +# sizes of the archive files in bytes. +sizes="18756983399" + +if [ -f $data/$part.tgz ]; then + size=$(/bin/ls -l $data/$part.tgz | awk '{print $5}') + size_ok=false + for s in $sizes; do if [ $s == $size ]; then size_ok=true; fi; done + if ! $size_ok; then + echo "$0: removing existing file $data/$part.tgz because its size in bytes $size" + echo "does not equal the size of one of the archives." + rm $data/$part.gz + else + echo "$data/$part.tgz exists and appears to be complete." + fi +fi + +if [ ! -f $data/$part.tgz ]; then + if ! which wget >/dev/null; then + echo "$0: wget is not installed." 
+ exit 1; + fi + full_url=$url/$part.tgz + echo "$0: downloading data from $full_url. This may take some time, please be patient." + + cd $data + if ! wget --no-check-certificate $full_url; then + echo "$0: error executing wget $full_url" + exit 1; + fi +fi + +cd $data + +if ! tar -xvzf $part.tgz; then + echo "$0: error un-tarring archive $data/$part.tgz" + exit 1; +fi + +touch $data/$part/.complete + +dev_dir=$data/$part/corpus/dev +test_dir=$data/$part/corpus/test +train_dir=$data/$part/corpus/train +if [ $part == "aidatatang_200zh" ]; then + for set in $dev_dir $test_dir $train_dir;do + cd $set + for wav in ./*.tar.gz; do + echo "Extracting wav from $wav" + tar -zxf $wav && rm $wav + done + done +fi + +echo "$0: Successfully downloaded and un-tarred $data/$part.tgz" + +if $remove_archive; then + echo "$0: removing $data/$part.tgz file since --remove-archive option was supplied." + rm $data/$part.tgz +fi + +exit 0; diff --git a/egs/gale_arabic/s5b/local/gale_format_data.sh b/egs/aidatatang_200zh/s5/local/format_data.sh old mode 100755 new mode 100644 similarity index 73% rename from egs/gale_arabic/s5b/local/gale_format_data.sh rename to egs/aidatatang_200zh/s5/local/format_data.sh index b69c34e68b9..47af9dd9dfd --- a/egs/gale_arabic/s5b/local/gale_format_data.sh +++ b/egs/aidatatang_200zh/s5/local/format_data.sh @@ -1,23 +1,25 @@ #!/bin/bash +# -# Copyright 2014 QCRI (author: Ahmed Ali) -# Apache 2.0 +. ./path.sh -if [ -f path.sh ]; then - . ./path.sh; else - echo "$0: missing path.sh"; exit 1; -fi +silprob=0.5 +mkdir -p data/lang_test data/train data/dev -for dir in test train; do - cp -pr data/local/$dir data/$dir -done - - -mkdir -p data/lang_test arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz [ ! -f $arpa_lm ] && echo No such file $arpa_lm && exit 1; +# Copy stuff into its final locations... + +for f in spk2utt utt2spk wav.scp text; do + cp data/local/train/$f data/train/$f || exit 1; +done + +for f in spk2utt utt2spk wav.scp text; do + cp data/local/dev/$f data/dev/$f || exit 1; +done + rm -r data/lang_test cp -r data/lang data/lang_test @@ -26,15 +28,15 @@ gunzip -c "$arpa_lm" | \ --read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst -echo "$0: Checking how stochastic G is (the first of these numbers should be small):" +echo "Checking how stochastic G is (the first of these numbers should be small):" fstisstochastic data/lang_test/G.fst ## Check lexicon. ## just have a look and make sure it seems sane. -echo "$0: First few lines of lexicon FST:" +echo "First few lines of lexicon FST:" fstprint --isymbols=data/lang/phones.txt --osymbols=data/lang/words.txt data/lang/L.fst | head -echo "$0: Performing further checks" +echo Performing further checks # Checking that G.fst is determinizable. fstdeterminize data/lang_test/G.fst /dev/null || echo Error determinizing G. @@ -55,6 +57,4 @@ fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \ fstisstochastic || echo LG is not stochastic -echo gale_format_data succeeded. - -exit 0 +echo format_data succeeded. diff --git a/egs/aidatatang_200zh/s5/local/nnet3/compare_wer.sh b/egs/aidatatang_200zh/s5/local/nnet3/compare_wer.sh new file mode 100755 index 00000000000..2d85626c356 --- /dev/null +++ b/egs/aidatatang_200zh/s5/local/nnet3/compare_wer.sh @@ -0,0 +1,82 @@ +#!/bin/bash +# Copyright 2018 Emotech LTD (Author: Xuechen Liu) + +# compare wer between diff. 
models in aidatatang_200zh nnet3 directory +# exemplar usage: local/nnet3/compare_wer.sh exp/nnet3/tdnn_sp +# note: this script is made quite general since we kinda wanna give more flexibility to +# users on adding affix for their own use when training models. + +set -e +. ./cmd.sh +. ./path.sh + +if [ $# == 0 ]; then + echo "Usage: $0: [--online] [ ... ]" + echo "e.g.: $0 exp/nnet3/tdnn_sp exp/nnet3/tdnn_sp_pr" + exit 1 +fi + +echo "# $0 $*" + +include_online=false +if [ "$1" == "--online" ]; then + include_online=true + shift +fi + +set_names() { + if [ $# != 1 ]; then + echo "compare_wer.sh: internal error" + exit 1 # exit the program + fi + dirname=$(echo $1 | cut -d: -f1) +} + +# print model names +echo -n "# Model " +for x in $*; do + printf "% 10s" " $(basename $x)" +done +echo + +# print decode WER results +echo -n "# WER(%) " +for x in $*; do + set_names $x + wer=$([ -d $x ] && grep WER $x/decode_test/cer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer +done +echo + +# so how about online WER? +if $include_online; then + echo -n "# WER(%)[online] " + for x in $*; do + set_names $x + wer=$(cat ${x}_online/decode_test/cer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + echo -n "# WER(%)[per-utt] " + for x in $*; do + set_names $x + wer_per_utt=$(cat ${x}_online/decode_test_per_utt/cer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer_per_utt + done + echo +fi + +# print log for train & validation +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.combined.log | grep log-like | awk '{printf($8)}' | cut -c1-7) + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.combined.log | grep log-like | awk '{printf($8)}' | cut -c1-7) + printf "% 10s" $prob +done +echo diff --git a/egs/aidatatang_200zh/s5/local/nnet3/run_ivector_common.sh b/egs/aidatatang_200zh/s5/local/nnet3/run_ivector_common.sh new file mode 100644 index 00000000000..0fe55ecf000 --- /dev/null +++ b/egs/aidatatang_200zh/s5/local/nnet3/run_ivector_common.sh @@ -0,0 +1,160 @@ +#!/bin/bash + +set -euo pipefail + +# This script is modified based on mini_librispeech/s5/local/nnet3/run_ivector_common.sh + +# This script is called from local/nnet3/run_tdnn.sh and +# local/chain/run_tdnn.sh (and may eventually be called by more +# scripts). It contains the common feature preparation and +# iVector-related parts of the script. See those scripts for examples +# of usage. + +stage=0 +train_set=train +test_sets="dev test" +gmm=tri5a +online=false +nnet3_affix= + +. ./cmd.sh +. ./path.sh +. utils/parse_options.sh + +gmm_dir=exp/${gmm} +ali_dir=exp/${gmm}_sp_ali + +for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do + if [ ! 
-f $f ]; then + echo "$0: expected file $f to exist" + exit 1 + fi +done + +online_affix= +if [ $online = true ]; then + online_affix=_online +fi + +if [ $stage -le 1 ]; then + # Although the nnet will be trained by high resolution data, we still have to + # perturb the normal data to get the alignment _sp stands for speed-perturbed + echo "$0: preparing directory for low-resolution speed-perturbed data (for alignment)" + utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp + echo "$0: making MFCC features for low-resolution speed-perturbed data" + steps/make_mfcc_pitch.sh --cmd "$train_cmd" --nj 70 data/${train_set}_sp \ + exp/make_mfcc/train_sp mfcc_perturbed || exit 1; + steps/compute_cmvn_stats.sh data/${train_set}_sp \ + exp/make_mfcc/train_sp mfcc_perturbed || exit 1; + utils/fix_data_dir.sh data/${train_set}_sp +fi + +if [ $stage -le 2 ]; then + echo "$0: aligning with the perturbed low-resolution data" + steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ + data/${train_set}_sp data/lang $gmm_dir $ali_dir || exit 1 +fi + +if [ $stage -le 3 ]; then + # Create high-resolution MFCC features (with 40 cepstra instead of 13). + # this shows how you can split across multiple file-systems. + echo "$0: creating high-resolution MFCC features" + mfccdir=mfcc_perturbed_hires$online_affix + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + utils/create_split_dir.pl /export/b0{5,6,7,8}/$USER/kaldi-data/mfcc/aidatatang-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage + fi + + for datadir in ${train_set}_sp ${test_sets}; do + utils/copy_data_dir.sh data/$datadir data/${datadir}_hires$online_affix + done + + # do volume-perturbation on the training data prior to extracting hires + # features; this helps make trained nnets more invariant to test data volume. + utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires$online_affix || exit 1; + + for datadir in ${train_set}_sp ${test_sets}; do + steps/make_mfcc_pitch$online_affix.sh --nj 10 --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${datadir}_hires$online_affix exp/make_hires/$datadir $mfccdir || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_hires$online_affix exp/make_hires/$datadir $mfccdir || exit 1; + utils/fix_data_dir.sh data/${datadir}_hires$online_affix || exit 1; + # create MFCC data dir without pitch to extract iVector + utils/data/limit_feature_dim.sh 0:39 data/${datadir}_hires$online_affix data/${datadir}_hires_nopitch || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_hires_nopitch exp/make_hires/$datadir $mfccdir || exit 1; + done +fi + +if [ $stage -le 4 ]; then + echo "$0: computing a subset of data to train the diagonal UBM." + # We'll use about a quarter of the data. 
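As a reference, a minimal sketch of how such a quarter subset is typically taken with the standard subset_data_dir.sh helper; the directory names below are illustrative rather than a verbatim copy of what this script uses:
```
# Sketch only: take roughly a quarter of the speed-perturbed utterances
# to train the diagonal UBM. Data-dir and destination names are illustrative.
temp_data_root=exp/nnet3/diag_ubm
num_utts_total=$(wc -l < data/train_sp_hires_nopitch/utt2spk)
num_utts=$((num_utts_total / 4))
utils/data/subset_data_dir.sh data/train_sp_hires_nopitch \
  $num_utts ${temp_data_root}/train_sp_hires_nopitch_subset
```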
+ mkdir -p exp/nnet3${nnet3_affix}/diag_ubm + temp_data_root=exp/nnet3${nnet3_affix}/diag_ubm + + num_utts_total=$(wc -l $dir/configs/network.xconfig + input dim=$ivector_dim name=ivector + input dim=$feat_dim name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=850 + relu-batchnorm-layer name=tdnn2 dim=850 input=Append(-1,0,2) + relu-batchnorm-layer name=tdnn3 dim=850 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn4 dim=850 input=Append(-7,0,2) + relu-batchnorm-layer name=tdnn5 dim=850 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn6 dim=850 + output-layer name=output input=tdnn6 dim=$num_targets max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 8 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aidatatang-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/train_dnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --cleanup.preserve-model-interval 500 \ + --feat-dir=data/${train_set}_hires_online \ + --ali-dir $ali_dir \ + --lang data/lang \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 9 ]; then + # this version of the decoding treats each utterance separately + # without carrying forward speaker information. + for decode_set in dev test; do + num_jobs=`cat data/${decode_set}_hires_online/utt2spk|cut -d' ' -f2|sort -u|wc -l` + decode_dir=${dir}/decode_$decode_set + steps/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires_online $decode_dir || exit 1; + done +fi + +if [ $stage -le 10 ]; then + steps/online/nnet3/prepare_online_decoding.sh --mfcc-config conf/mfcc_hires.conf \ + --add-pitch true \ + data/lang exp/nnet3/extractor "$dir" ${dir}_online || exit 1; +fi + +if [ $stage -le 11 ]; then + # do the actual online decoding with iVectors, carrying info forward from + # previous utterances of the same speaker. + for decode_set in dev test; do + num_jobs=`cat data/${decode_set}_hires_online/utt2spk|cut -d' ' -f2|sort -u|wc -l` + decode_dir=${dir}_online/decode_$decode_set + steps/online/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" \ + --config conf/decode.config \ + $graph_dir data/${decode_set}_hires_online $decode_dir || exit 1; + done +fi + +if [ $stage -le 12 ]; then + # this version of the decoding treats each utterance separately + # without carrying forward speaker information. 
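Once the online decodes above and the per-utterance decodes below have finished, the results can be summarized side by side with the compare_wer.sh helper added earlier in this patch; a typical invocation, using the model directory from that script's own usage example:
```
# Prints one column per model dir: WER, WER[online], WER[per-utt],
# plus final train/valid probs when available.
local/nnet3/compare_wer.sh --online exp/nnet3/tdnn_sp
```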
+ for decode_set in dev test; do + num_jobs=`cat data/${decode_set}_hires_online/utt2spk|cut -d' ' -f2|sort -u|wc -l` + decode_dir=${dir}_online/decode_${decode_set}_per_utt + steps/online/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" \ + --config conf/decode.config --per-utt true \ + $graph_dir data/${decode_set}_hires_online $decode_dir || exit 1; + done +fi + +wait; +exit 0; diff --git a/egs/aidatatang_200zh/s5/local/prepare_dict.sh b/egs/aidatatang_200zh/s5/local/prepare_dict.sh new file mode 100644 index 00000000000..aa72bcd48d2 --- /dev/null +++ b/egs/aidatatang_200zh/s5/local/prepare_dict.sh @@ -0,0 +1,320 @@ +#!/bin/bash +#Copyright 2016 LeSpeech (Author: Xingyu Na) + +# prepare dictionary for aidatatang +# it is done for English and Chinese separately, +# For English, we use CMU dictionary, and Sequitur G2P +# for OOVs, while all englist phone set will concert to Chinese +# phone set at the end. For Chinese, we use an online dictionary, +# for OOV, we just produce pronunciation using Charactrt Mapping. + +. ./path.sh + +[ $# != 0 ] && echo "Usage: $0" && exit 1; + +train_dir=data/local/train +dev_dir=data/local/dev +test_dir=data/local/test +dict_dir=data/local/dict +mkdir -p $dict_dir +mkdir -p $dict_dir/lexicon-{en,ch} + +# extract full vocabulary +cat $train_dir/text $dev_dir/text $test_dir/text | awk '{for (i = 2; i <= NF; i++) print $i}' |\ + perl -ape 's/ /\n/g;' | sort -u | grep -v '\[LAUGHTER\]' | grep -v '\[NOISE\]' |\ + grep -v '\[VOCALIZED-NOISE\]' > $dict_dir/words.txt || exit 1; + +# split into English and Chinese +cat $dict_dir/words.txt | grep '[a-zA-Z]' > $dict_dir/lexicon-en/words-en.txt || exit 1; +cat $dict_dir/words.txt | grep -v '[a-zA-Z]' > $dict_dir/lexicon-ch/words-ch.txt || exit 1; + + +##### produce pronunciations for english +if [ ! -f $dict_dir/cmudict/cmudict.0.7a ]; then + echo "--- Downloading CMU dictionary ..." + svn co -r 13068 https://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict \ + $dict_dir/cmudict || exit 1; +fi + +# format cmudict +echo "--- Striping stress and pronunciation variant markers from cmudict ..." +perl $dict_dir/cmudict/scripts/make_baseform.pl \ + $dict_dir/cmudict/cmudict.0.7a /dev/stdout |\ + sed -e 's:^\([^\s(]\+\)([0-9]\+)\(\s\+\)\(.*\):\1\2\3:' > $dict_dir/cmudict/cmudict-plain.txt || exit 1; + +# extract in-vocab lexicon and oov words +echo "--- Searching for English OOV words ..." +awk 'NR==FNR{words[$1]; next;} !($1 in words)' \ + $dict_dir/cmudict/cmudict-plain.txt $dict_dir/lexicon-en/words-en.txt |\ + egrep -v '<.?s>' > $dict_dir/lexicon-en/words-en-oov.txt || exit 1; + +awk 'NR==FNR{words[$1]; next;} ($1 in words)' \ + $dict_dir/lexicon-en/words-en.txt $dict_dir/cmudict/cmudict-plain.txt |\ + egrep -v '<.?s>' > $dict_dir/lexicon-en/lexicon-en-iv.txt || exit 1; + +wc -l $dict_dir/lexicon-en/words-en-oov.txt +wc -l $dict_dir/lexicon-en/lexicon-en-iv.txt + +# setup g2p and generate oov lexicon +if [ ! -f conf/g2p_model ]; then + echo "--- Downloading a pre-trained Sequitur G2P model ..." + wget http://sourceforge.net/projects/kaldi/files/sequitur-model4 -O conf/g2p_model + if [ ! -f conf/g2p_model ]; then + echo "Failed to download the g2p model!" + exit 1 + fi +fi + +echo "--- Preparing pronunciations for OOV words ..." +g2p=`which g2p.py` +if [ ! -x $g2p ]; then + echo "g2p.py is not found. Checkout tools/extras/install_sequitur.sh." 
+ exit 1 +fi +g2p.py --model=conf/g2p_model --apply $dict_dir/lexicon-en/words-en-oov.txt \ + > $dict_dir/lexicon-en/lexicon-en-oov.txt || exit 1; + +# merge in-vocab and oov lexicon +cat $dict_dir/lexicon-en/lexicon-en-oov.txt $dict_dir/lexicon-en/lexicon-en-iv.txt |\ + sort > $dict_dir/lexicon-en/lexicon-en-phn.txt || exit 1; + +# convert cmu phoneme to pinyin phonenme +mkdir -p $dict_dir/map +cat conf/cmu2pinyin | awk '{print $1;}' | sort -u > $dict_dir/map/cmu || exit 1; +cat conf/pinyin2cmu | awk -v cmu=$dict_dir/map/cmu \ + 'BEGIN{while((getline $dict_dir/map/cmu-used || exit 1; +cat $dict_dir/map/cmu | awk -v cmu=$dict_dir/map/cmu-used \ + 'BEGIN{while((getline $dict_dir/map/cmu-not-used || exit 1; + +awk 'NR==FNR{words[$1]; next;} ($1 in words)' \ + $dict_dir/map/cmu-not-used conf/cmu2pinyin |\ + egrep -v '<.?s>' > $dict_dir/map/cmu-py || exit 1; + +cat $dict_dir/map/cmu-py | \ + perl -e ' + open(MAPS, $ARGV[0]) or die("could not open map file"); + my %py2ph; + foreach $line () { + @A = split(" ", $line); + $py = shift(@A); + $py2ph{$py} = [@A]; + } + my @entry; + while () { + @A = split(" ", $_); + @entry = (); + $W = shift(@A); + push(@entry, $W); + for($i = 0; $i < @A; $i++) { push(@entry, @{$py2ph{$A[$i]}}); } + print "@entry"; + print "\n"; + } +' conf/pinyin2cmu > $dict_dir/map/cmu-cmu || exit 1; + +cat $dict_dir/lexicon-en/lexicon-en-phn.txt | \ + perl -e ' + open(MAPS, $ARGV[0]) or die("could not open map file"); + my %py2ph; + foreach $line () { + @A = split(" ", $line); + $py = shift(@A); + $py2ph{$py} = [@A]; + } + my @entry; + while () { + @A = split(" ", $_); + @entry = (); + $W = shift(@A); + push(@entry, $W); + for($i = 0; $i < @A; $i++) { + if (exists $py2ph{$A[$i]}) { push(@entry, @{$py2ph{$A[$i]}}); } + else {push(@entry, $A[$i])}; + } + print "@entry"; + print "\n"; + } +' $dict_dir/map/cmu-cmu > $dict_dir/lexicon-en/lexicon-en.txt || exit 1; + + +##### produce pronunciations for chinese +if [ ! -f $dict_dir/cedict/cedict_1_0_ts_utf-8_mdbg.txt ]; then + echo "------------- Downloading cedit dictionary ---------------" + mkdir -p $dict_dir/cedict + wget -P $dict_dir/cedict http://www.mdbg.net/chindict/export/cedict/cedict_1_0_ts_utf-8_mdbg.txt.gz + gunzip $dict_dir/cedict/cedict_1_0_ts_utf-8_mdbg.txt.gz +fi + +cat $dict_dir/cedict/cedict_1_0_ts_utf-8_mdbg.txt | grep -v '#' | awk -F '/' '{print $1}' |\ + perl -e ' + while () { + @A = split(" ", $_); + print $A[1]; + for($n = 2; $n < @A; $n++) { + $A[$n] =~ s:\[?([a-zA-Z0-9\:]+)\]?:$1:; + $tmp = uc($A[$n]); + print " $tmp"; + } + print "\n"; + } + ' | sort -k1 > $dict_dir/cedict/ch-dict.txt || exit 1; + +echo "--- Searching for Chinese OOV words ..." 
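The in-vocabulary/OOV split used just below for Chinese (and earlier for English) relies on awk's two-file NR==FNR idiom; a self-contained illustration with hypothetical file names:
```
# First pass (NR==FNR): remember every headword of lexicon.txt.
# Second pass: print only the words of wordlist.txt that were never seen, i.e. the OOVs.
awk 'NR==FNR {seen[$1]; next} !($1 in seen)' lexicon.txt wordlist.txt > words-oov.txt
```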
+awk 'NR==FNR{words[$1]; next;} !($1 in words)' \ + $dict_dir/cedict/ch-dict.txt $dict_dir/lexicon-ch/words-ch.txt |\ + egrep -v '<.?s>' > $dict_dir/lexicon-ch/words-ch-oov.txt || exit 1; + +awk 'NR==FNR{words[$1]; next;} ($1 in words)' \ + $dict_dir/lexicon-ch/words-ch.txt $dict_dir/cedict/ch-dict.txt |\ + egrep -v '<.?s>' > $dict_dir/lexicon-ch/lexicon-ch-iv.txt || exit 1; + +wc -l $dict_dir/lexicon-ch/words-ch-oov.txt +wc -l $dict_dir/lexicon-ch/lexicon-ch-iv.txt + + +# validate Chinese dictionary and compose a char-based +# dictionary in order to get OOV pronunciations +cat $dict_dir/cedict/ch-dict.txt |\ + perl -e ' + use utf8; + binmode(STDIN,":encoding(utf8)"); + binmode(STDOUT,":encoding(utf8)"); + while () { + @A = split(" ", $_); + $word_len = length($A[0]); + $proun_len = @A - 1 ; + if ($word_len == $proun_len) {print $_;} + } + ' > $dict_dir/cedict/ch-dict-1.txt || exit 1; + +# extract chars +cat $dict_dir/cedict/ch-dict-1.txt | awk '{print $1}' |\ + perl -e ' + use utf8; + binmode(STDIN,":encoding(utf8)"); + binmode(STDOUT,":encoding(utf8)"); + while () { + @A = split(" ", $_); + @chars = split("", $A[0]); + foreach (@chars) { + print "$_\n"; + } + } + ' | grep -v '^$' > $dict_dir/lexicon-ch/ch-char.txt || exit 1; + +# extract individual pinyins +cat $dict_dir/cedict/ch-dict-1.txt |\ + awk '{for(i=2; i<=NF; i++) print $i}' |\ + perl -ape 's/ /\n/g;' > $dict_dir/lexicon-ch/ch-char-pinyin.txt || exit 1; + +# first make sure number of characters and pinyins +# are equal, so that a char-based dictionary can +# be composed. +nchars=`wc -l < $dict_dir/lexicon-ch/ch-char.txt` +npinyin=`wc -l < $dict_dir/lexicon-ch/ch-char-pinyin.txt` +if [ $nchars -ne $npinyin ]; then + echo "Found $nchars chars and $npinyin pinyin. Please check!" + exit 1 +fi + +paste $dict_dir/lexicon-ch/ch-char.txt $dict_dir/lexicon-ch/ch-char-pinyin.txt |\ + sort -u > $dict_dir/lexicon-ch/ch-char-dict.txt || exit 1; + +# create a multiple pronunciation dictionary +cat $dict_dir/lexicon-ch/ch-char-dict.txt |\ + perl -e ' + my $prev = ""; + my $out_line = ""; + while () { + @A = split(" ", $_); + $cur = $A[0]; + $cur_py = $A[1]; + #print length($prev); + if (length($prev) == 0) { $out_line = $_; chomp($out_line);} + if (length($prev)>0 && $cur ne $prev) { print $out_line; print "\n"; $out_line = $_; chomp($out_line);} + if (length($prev)>0 && $cur eq $prev) { $out_line = $out_line."/"."$cur_py";} + $prev = $cur; + } + print $out_line; + ' > $dict_dir/lexicon-ch/ch-char-dict-mp.txt || exit 1; + +# get lexicon for Chinese OOV words +local/create_oov_char_lexicon.pl $dict_dir/lexicon-ch/ch-char-dict-mp.txt \ + $dict_dir/lexicon-ch/words-ch-oov.txt > $dict_dir/lexicon-ch/lexicon-ch-oov.txt || exit 1; + +# seperate multiple prons for Chinese OOV lexicon +cat $dict_dir/lexicon-ch/lexicon-ch-oov.txt |\ + perl -e ' + my @entry; + my @entry1; + while () { + @A = split(" ", $_); + @entry = (); + push(@entry, $A[0]); + for($i = 1; $i < @A; $i++ ) { + @py = split("/", $A[$i]); + @entry1 = @entry; + @entry = (); + for ($j = 0; $j < @entry1; $j++) { + for ($k = 0; $k < @py; $k++) { + $tmp = $entry1[$j]." 
".$py[$k]; + push(@entry, $tmp); + } + } + } + for ($i = 0; $i < @entry; $i++) { + print $entry[$i]; + print "\n"; + } + } + ' > $dict_dir/lexicon-ch/lexicon-ch-oov-mp.txt || exit 1; + +# compose IV and OOV lexicons for Chinese +cat $dict_dir/lexicon-ch/lexicon-ch-oov-mp.txt $dict_dir/lexicon-ch/lexicon-ch-iv.txt |\ + awk '{if (NF > 1 && $2 ~ /[A-Za-z0-9]+/) print $0;}' > $dict_dir/lexicon-ch/lexicon-ch.txt || exit 1; + +# convert Chinese pinyin to CMU format +cat $dict_dir/lexicon-ch/lexicon-ch.txt | sed -e 's/U:/V/g' | sed -e 's/ R\([0-9]\)/ ER\1/g'|\ + utils/pinyin_map.pl conf/pinyin2cmu > $dict_dir/lexicon-ch/lexicon-ch-cmu.txt || exit 1; + +# combine English and Chinese lexicons +cat $dict_dir/lexicon-en/lexicon-en.txt $dict_dir/lexicon-ch/lexicon-ch-cmu.txt |\ + sort -u > $dict_dir/lexicon1.txt || exit 1; + +cat $dict_dir/lexicon1.txt | awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{for (p in phones) print p;}'| \ + sort -u |\ + perl -e ' + my %ph_cl; + while () { + $phone = $_; + chomp($phone); + chomp($_); + $phone =~ s:([A-Z]+)[0-9]:$1:; + if (exists $ph_cl{$phone}) { push(@{$ph_cl{$phone}}, $_) } + else { $ph_cl{$phone} = [$_]; } + } + foreach $key ( keys %ph_cl ) { + print "@{ $ph_cl{$key} }\n" + } + ' | sort -k1 > $dict_dir/nonsilence_phones.txt || exit 1; + +( echo SIL; echo SPN; echo NSN; echo LAU ) > $dict_dir/silence_phones.txt + +echo SIL > $dict_dir/optional_silence.txt + +# No "extra questions" in the input to this setup, as we don't +# have stress or tone + +cat $dict_dir/silence_phones.txt| awk '{printf("%s ", $1);} END{printf "\n";}' > $dict_dir/extra_questions.txt || exit 1; +cat $dict_dir/nonsilence_phones.txt | perl -e 'while(<>){ foreach $p (split(" ", $_)) { + $p =~ m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$2} .= "$p "; } } foreach $l (values %q) {print "$l\n";}' \ + >> $dict_dir/extra_questions.txt || exit 1; + +# Add to the lexicon the silences, noises etc. +(echo '!SIL SIL'; echo '[VOCALIZED-NOISE] SPN'; echo '[NOISE] NSN'; echo '[LAUGHTER] LAU'; + echo ' SPN' ) | \ + cat - $dict_dir/lexicon1.txt > $dict_dir/lexicon.txt || exit 1; + +echo "$0: aidatatang_200zh dict preparation succeeded" +exit 0; diff --git a/egs/aidatatang_200zh/s5/local/score.sh b/egs/aidatatang_200zh/s5/local/score.sh new file mode 100644 index 00000000000..a9786169973 --- /dev/null +++ b/egs/aidatatang_200zh/s5/local/score.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +set -e -o pipefail +set -x +steps/score_kaldi.sh "$@" +steps/scoring/score_kaldi_cer.sh --stage 2 "$@" + +echo "$0: Done" diff --git a/egs/aidatatang_200zh/s5/local/train_lms.sh b/egs/aidatatang_200zh/s5/local/train_lms.sh new file mode 100644 index 00000000000..bc52f8acb20 --- /dev/null +++ b/egs/aidatatang_200zh/s5/local/train_lms.sh @@ -0,0 +1,92 @@ +#!/bin/bash + + +# To be run from one directory above this script. + + +text=data/local/train/text +lexicon=data/local/dict/lexicon.txt + +for f in "$text" "$lexicon"; do + [ ! -f $x ] && echo "$0: No such file $f" && exit 1; +done + +# This script takes no arguments. It assumes you have already run +# aidatatang_data_prep.sh. +# It takes as input the files +#data/local/train/text +#data/local/dict/lexicon.txt +dir=data/local/lm +mkdir -p $dir + +export LC_ALL=C # You'll get errors about things being not sorted, if you + # have a different locale. +kaldi_lm=`which train_lm.sh` +if [ ! -x $kaldi_lm ]; then + echo "$0: train_lm.sh is not found. 
That might mean it's not installed" + echo "$0: or it is not added to PATH" + echo "$0: Use the script tools/extra/install_kaldi_lm.sh to install it" + exit 1 +fi + +cleantext=$dir/text.no_oov + +cat $text | awk -v lex=$lexicon 'BEGIN{while((getline0){ seen[$1]=1; } } + {for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf(" ");} } printf("\n");}' \ + > $cleantext || exit 1; + + +cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \ + sort -nr > $dir/word.counts || exit 1; + + +# Get counts from acoustic training transcripts, and add one-count +# for each word in the lexicon (but not silence, we don't want it +# in the LM-- we'll add it optionally later). +cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \ + cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \ + sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1; + +# note: we probably won't really make use of as there aren't any OOVs +cat $dir/unigram.counts | awk '{print $2}' | get_word_map.pl "" "" "" > $dir/word_map \ + || exit 1; + +# note: ignore 1st field of train.txt, it's the utterance-id. +cat $cleantext | awk -v wmap=$dir/word_map 'BEGIN{while((getline0)map[$1]=$2;} + { for(n=2;n<=NF;n++) { printf map[$n]; if(n$dir/train.gz \ + || exit 1; + +train_lm.sh --arpa --lmtype 3gram-mincount $dir || exit 1; + +# LM is small enough that we don't need to prune it (only about 0.7M N-grams). +# Perplexity over 128254.000000 words is 90.446690 + +# note: output is +# data/local/lm/3gram-mincount/lm_unpruned.gz + +exit 0 + + +# From here is some commands to do a baseline with SRILM (assuming +# you have it installed). +heldout_sent=10000 # Don't change this if you want result to be comparable with + # kaldi_lm results +sdir=$dir/srilm # in case we want to use SRILM to double-check perplexities. +mkdir -p $sdir +cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n $sdir/heldout +cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n $sdir/train + +cat $dir/word_map | awk '{print $1}' | cat - <(echo ""; echo "" ) > $sdir/wordlist + + +ngram-count -text $sdir/train -order 3 -limit-vocab -vocab $sdir/wordlist -unk \ + -map-unk "" -kndiscount -interpolate -lm $sdir/srilm.o3g.kn.gz +ngram -lm $sdir/srilm.o3g.kn.gz -ppl $sdir/heldout +# 0 zeroprobs, logprob= -250954 ppl= 90.5091 ppl1= 132.482 + +# Note: perplexity SRILM gives to Kaldi-LM model is same as kaldi-lm reports above. +# Difference in WSJ must have been due to different treatment of . 
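For context, the ARPA LM written to data/local/lm/3gram-mincount/lm_unpruned.gz is compiled into G.fst by local/format_data.sh earlier in this patch; a minimal sketch of that conversion with standard Kaldi tools (treat the exact options as an assumption, since format_data.sh is only partially shown here):
```
# Compile the ARPA LM into G.fst over the lang_test word symbol table,
# then check that the result is close to stochastic.
gunzip -c data/local/lm/3gram-mincount/lm_unpruned.gz | \
  arpa2fst --disambig-symbol='#0' \
           --read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst
fstisstochastic data/lang_test/G.fst   # the first number printed should be close to zero
```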
+ngram -lm $dir/3gram-mincount/lm_unpruned.gz -ppl $sdir/heldout +# 0 zeroprobs, logprob= -250913 ppl= 90.4439 ppl1= 132.379 diff --git a/egs/aidatatang_200zh/s5/local/wer_hyp_filter b/egs/aidatatang_200zh/s5/local/wer_hyp_filter new file mode 100644 index 00000000000..a1bfdb57efc --- /dev/null +++ b/egs/aidatatang_200zh/s5/local/wer_hyp_filter @@ -0,0 +1,19 @@ +#!/usr/bin/env perl + +@filters=('[NOISE]','[LAUGHTER]','[VOCALIZED-NOISE]','','%HESITATION'); + +foreach $w (@filters) { + $bad{$w} = 1; +} + +while() { + @A = split(" ", $_); + $id = shift @A; + print "$id "; + foreach $a (@A) { + if (!defined $bad{$a}) { + print "$a "; + } + } + print "\n"; +} diff --git a/egs/aidatatang_200zh/s5/local/wer_output_filter b/egs/aidatatang_200zh/s5/local/wer_output_filter new file mode 100644 index 00000000000..aceeeec41b4 --- /dev/null +++ b/egs/aidatatang_200zh/s5/local/wer_output_filter @@ -0,0 +1,25 @@ +#!/usr/bin/env perl +# Copyright 2012-2014 Johns Hopkins University (Author: Yenda Trmal) +# Apache 2.0 +use utf8; + +use open qw(:encoding(utf8)); +binmode STDIN, ":utf8"; +binmode STDOUT, ":utf8"; +binmode STDERR, ":utf8"; + +while (<>) { + @F = split " "; + print $F[0] . " "; + foreach $s (@F[1..$#F]) { + if (($s =~ /\[.*\]/) || ($s =~ /\<.*\>/) || ($s =~ "!SIL")) { + print ""; + } else { + print "$s" + } + print " "; + } + print "\n"; +} + + diff --git a/egs/aidatatang_200zh/s5/local/wer_ref_filter b/egs/aidatatang_200zh/s5/local/wer_ref_filter new file mode 100644 index 00000000000..a1bfdb57efc --- /dev/null +++ b/egs/aidatatang_200zh/s5/local/wer_ref_filter @@ -0,0 +1,19 @@ +#!/usr/bin/env perl + +@filters=('[NOISE]','[LAUGHTER]','[VOCALIZED-NOISE]','','%HESITATION'); + +foreach $w (@filters) { + $bad{$w} = 1; +} + +while() { + @A = split(" ", $_); + $id = shift @A; + print "$id "; + foreach $a (@A) { + if (!defined $bad{$a}) { + print "$a "; + } + } + print "\n"; +} diff --git a/egs/aidatatang_200zh/s5/path.sh b/egs/aidatatang_200zh/s5/path.sh new file mode 100644 index 00000000000..2d17b17a84a --- /dev/null +++ b/egs/aidatatang_200zh/s5/path.sh @@ -0,0 +1,6 @@ +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C diff --git a/egs/aidatatang_200zh/s5/run.sh b/egs/aidatatang_200zh/s5/run.sh new file mode 100644 index 00000000000..47e46a660cd --- /dev/null +++ b/egs/aidatatang_200zh/s5/run.sh @@ -0,0 +1,150 @@ +#!/bin/bash + +# Copyright 2019 Beijing DataTang Tech. Co. Ltd. (Author: Liyuan Wang) +# 2017 Hui Bu +# 2017 Jiayu Du +# 2017 Xingyu Na +# 2017 Bengu Wu +# 2017 Hao Zheng +# Apache 2.0 + +# This is a shell script, but it's recommended that you run the commands one by +# one by copying and pasting into the shell. +# Caution: some of the graph creation steps use quite a bit of memory, so you +# should run this on a machine that has sufficient memory. + + +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. +. 
./path.sh + + +# corpus directory and download URL +data=/export/a05/xna/data +data_url=www.openslr.org/resources/62 + +# Obtain the database +#[ -d $data ] || mkdir -p $data || exit 1; +local/download_and_untar.sh $data $data_url aidatatang_200zh || exit 1; + +# Data Preparation: generate text, wav.scp, utt2spk, spk2utt +local/data_prep.sh $data/aidatatang_200zh/corpus $data/aidatatang_200zh/transcript || exit 1; + +# Lexicon Preparation: build a large lexicon that invovles words in both the training and decoding +local/prepare_dict.sh || exit 1; + +# Prepare Language Stuff +# Phone Sets, questions, L compilation +utils/prepare_lang.sh --position-dependent-phones false data/local/dict "" data/local/lang data/lang || exit 1; + +# LM training +local/train_lms.sh || exit 1; + +# G compilation, check LG composition +local/format_data.sh + +# Now make MFCC plus pitch features. +# mfccdir should be some place with a largish disk where you want to store MFCC features. +mfccdir=mfcc +for x in train dev test; do + steps/make_mfcc_pitch.sh --write_utt2dur false --write_utt2num_frames false --cmd "$train_cmd" --nj 10 data/$x exp/make_mfcc/$x $mfccdir || exit 1; + steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir || exit 1; + utils/fix_data_dir.sh data/$x || exit 1; +done + +steps/train_mono.sh --cmd "$train_cmd" --nj 10 \ + data/train data/lang exp/mono || exit 1; + +# Monophone decoding +utils/mkgraph.sh data/lang_test exp/mono exp/mono/graph || exit 1; +steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj 10 \ + exp/mono/graph data/dev exp/mono/decode_dev + +steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj 10 \ + exp/mono/graph data/test exp/mono/decode_test + +# Get alignments from monophone system. +steps/align_si.sh --cmd "$train_cmd" --nj 10 \ + data/train data/lang exp/mono exp/mono_ali || exit 1; + +# train tri1 [first triphone pass] +steps/train_deltas.sh --cmd "$train_cmd" \ + 2500 20000 data/train data/lang exp/mono_ali exp/tri1 || exit 1; + +# decode tri1 +utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph || exit 1; +steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj 10 \ + exp/tri1/graph data/dev exp/tri1/decode_dev +steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj 10 \ + exp/tri1/graph data/test exp/tri1/decode_test + +# align tri1 +steps/align_si.sh --cmd "$train_cmd" --nj 10 \ + data/train data/lang exp/tri1 exp/tri1_ali || exit 1; + +# train tri2 [delta+delta-deltas] +steps/train_deltas.sh --cmd "$train_cmd" \ + 2500 20000 data/train data/lang exp/tri1_ali exp/tri2 || exit 1; + +# decode tri2 +utils/mkgraph.sh data/lang_test exp/tri2 exp/tri2/graph +steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj 10 \ + exp/tri2/graph data/dev exp/tri2/decode_dev +steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj 10 \ + exp/tri2/graph data/test exp/tri2/decode_test + +#align tri2 +steps/align_si.sh --cmd "$train_cmd" --nj 10 \ + data/train data/lang exp/tri2 exp/tri2_ali || exit 1; + +# Train tri3a, which is LDA+MLLT, +steps/train_lda_mllt.sh --cmd "$train_cmd" \ + 2500 20000 data/train data/lang exp/tri2_ali exp/tri3a || exit 1; + +utils/mkgraph.sh data/lang_test exp/tri3a exp/tri3a/graph || exit 1; +steps/decode.sh --cmd "$decode_cmd" --nj 10 --config conf/decode.config \ + exp/tri3a/graph data/dev exp/tri3a/decode_dev +steps/decode.sh --cmd "$decode_cmd" --nj 10 --config conf/decode.config \ + exp/tri3a/graph data/test exp/tri3a/decode_test + +# From now, we start 
building a more serious system (with SAT), and we'll +# do the alignment with fMLLR. +steps/align_fmllr.sh --cmd "$train_cmd" --nj 10 \ + data/train data/lang exp/tri3a exp/tri3a_ali || exit 1; + +steps/train_sat.sh --cmd "$train_cmd" \ + 2500 20000 data/train data/lang exp/tri3a_ali exp/tri4a || exit 1; + +utils/mkgraph.sh data/lang_test exp/tri4a exp/tri4a/graph +steps/decode_fmllr.sh --cmd "$decode_cmd" --nj 10 --config conf/decode.config \ + exp/tri4a/graph data/dev exp/tri4a/decode_dev +steps/decode_fmllr.sh --cmd "$decode_cmd" --nj 10 --config conf/decode.config \ + exp/tri4a/graph data/test exp/tri4a/decode_test + +steps/align_fmllr.sh --cmd "$train_cmd" --nj 10 \ + data/train data/lang exp/tri4a exp/tri4a_ali + +# Building a larger SAT system. + +steps/train_sat.sh --cmd "$train_cmd" \ + 3500 100000 data/train data/lang exp/tri4a_ali exp/tri5a || exit 1; + +utils/mkgraph.sh data/lang_test exp/tri5a exp/tri5a/graph || exit 1; +steps/decode_fmllr.sh --cmd "$decode_cmd" --nj 10 --config conf/decode.config \ + exp/tri5a/graph data/dev exp/tri5a/decode_dev || exit 1; +steps/decode_fmllr.sh --cmd "$decode_cmd" --nj 10 --config conf/decode.config \ + exp/tri5a/graph data/test exp/tri5a/decode_test || exit 1; + +steps/align_fmllr.sh --cmd "$train_cmd" --nj 10 \ + data/train data/lang exp/tri5a exp/tri5a_ali || exit 1; + +# nnet3 +local/nnet3/run_tdnn.sh + +# chain +local/chain/run_tdnn.sh + +# getting results (see RESULTS file) +for x in exp/*/decode_test; do [ -d $x ] && grep WER $x/cer_* | utils/best_wer.sh; done 2>/dev/null + +exit 0; diff --git a/egs/aidatatang_200zh/s5/steps b/egs/aidatatang_200zh/s5/steps new file mode 120000 index 00000000000..6e99bf5b5ad --- /dev/null +++ b/egs/aidatatang_200zh/s5/steps @@ -0,0 +1 @@ +../../wsj/s5/steps \ No newline at end of file diff --git a/egs/aidatatang_200zh/s5/utils b/egs/aidatatang_200zh/s5/utils new file mode 120000 index 00000000000..b240885218f --- /dev/null +++ b/egs/aidatatang_200zh/s5/utils @@ -0,0 +1 @@ +../../wsj/s5/utils \ No newline at end of file diff --git a/egs/aishell/s5/RESULTS b/egs/aishell/s5/RESULTS index b58ede148c4..b6155cb62d4 100644 --- a/egs/aishell/s5/RESULTS +++ b/egs/aishell/s5/RESULTS @@ -1,8 +1,18 @@ -%WER 33.82 [ 35432 / 104765, 743 ins, 3991 del, 30698 sub ] exp/mono/decode_test/cer_12_0.0 -%WER 19.39 [ 20310 / 104765, 903 ins, 1452 del, 17955 sub ] exp/tri1/decode_test/cer_13_0.5 -%WER 19.23 [ 20147 / 104765, 910 ins, 1287 del, 17950 sub ] exp/tri2/decode_test/cer_14_0.5 -%WER 17.14 [ 17961 / 104765, 812 ins, 1024 del, 16125 sub ] exp/tri3a/decode_test/cer_14_0.0 -%WER 13.64 [ 14294 / 104765, 669 ins, 736 del, 12889 sub ] exp/tri4a/decode_test/cer_14_0.5 -%WER 12.23 [ 12809 / 104765, 656 ins, 580 del, 11573 sub ] exp/tri5a/decode_test/cer_13_1.0 -%WER 8.45 [ 8849 / 104765, 312 ins, 538 del, 7999 sub ] exp/nnet3/tdnn_sp/decode_test/cer_13_1.0 -%WER 7.46 [ 7813 / 104765, 287 ins, 472 del, 7054 sub ] exp/chain/tdnn_1a_sp/decode_test/cer_10_1.0 +%WER 36.41 [ 38146 / 104765, 837 ins, 3114 del, 34195 sub ] exp/mono/decode_test/cer_10_0.0 +%WER 18.76 [ 19654 / 104765, 949 ins, 1152 del, 17553 sub ] exp/tri1/decode_test/cer_13_0.5 +%WER 18.64 [ 19531 / 104765, 941 ins, 1159 del, 17431 sub ] exp/tri2/decode_test/cer_14_0.5 +%WER 17.04 [ 17849 / 104765, 810 ins, 1021 del, 16018 sub ] exp/tri3a/decode_test/cer_14_0.5 +%WER 13.82 [ 14482 / 104765, 764 ins, 670 del, 13048 sub ] exp/tri4a/decode_test/cer_13_0.5 +%WER 12.12 [ 12694 / 104765, 751 ins, 523 del, 11420 sub ] exp/tri5a/decode_test/cer_13_0.5 +%WER 8.65 [ 9064 / 
104765, 367 ins, 455 del, 8242 sub ] exp/nnet3/tdnn_sp/decode_test/cer_14_0.5 +%WER 7.48 [ 7839 / 104765, 285 ins, 454 del, 7100 sub ] exp/chain/tdnn_1a_sp/decode_test/cer_10_1.0 + +# nnet3 tdnn with online pitch, local/nnet3/tuning/tun_tdnn_2a.sh +%WER 8.64 [ 9050 / 104765, 349 ins, 521 del, 8180 sub ] exp/nnet3/tdnn_sp/decode_test/cer_15_0.5 +%WER 8.72 [ 9135 / 104765, 367 ins, 422 del, 8346 sub ] exp/nnet3/tdnn_sp_online/decode_test/cer_12_1.0 +%WER 9.36 [ 9807 / 104765, 386 ins, 441 del, 8980 sub ] exp/nnet3/tdnn_sp_online/decode_test_per_utt/cer_13_1.0 + +# chain with online pitch, local/chain/tuning/run_tdnn_2a.sh +%WER 7.45 [ 7807 / 104765, 340 ins, 497 del, 6970 sub ] exp/chain/tdnn_2a_sp/decode_test/cer_11_0.5 +%WER 7.43 [ 7780 / 104765, 341 ins, 469 del, 6970 sub ] exp/chain/tdnn_2a_sp_online/decode_test/cer_11_0.5 +%WER 7.92 [ 8296 / 104765, 384 ins, 472 del, 7440 sub ] exp/chain/tdnn_2a_sp_online/decode_test_per_utt/cer_11_0.5 diff --git a/egs/aishell/s5/conf/online_pitch.conf b/egs/aishell/s5/conf/online_pitch.conf new file mode 100644 index 00000000000..c0f1342160d --- /dev/null +++ b/egs/aishell/s5/conf/online_pitch.conf @@ -0,0 +1,4 @@ +--sample-frequency=16000 +--simulate-first-pass-online=true +--normalization-right-context=25 +--frames-per-chunk=10 diff --git a/egs/aishell/s5/local/aishell_prepare_dict.sh b/egs/aishell/s5/local/aishell_prepare_dict.sh index 3763622a3e7..c4cabb24de4 100755 --- a/egs/aishell/s5/local/aishell_prepare_dict.sh +++ b/egs/aishell/s5/local/aishell_prepare_dict.sh @@ -15,21 +15,9 @@ mkdir -p $dict_dir cp $res_dir/lexicon.txt $dict_dir cat $dict_dir/lexicon.txt | awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{for (p in phones) print p;}'| \ - sort -u |\ - perl -e ' - my %ph_cl; - while () { - $phone = $_; - chomp($phone); - chomp($_); - $phone = $_; - next if ($phone eq "sil"); - if (exists $ph_cl{$phone}) { push(@{$ph_cl{$phone}}, $_) } - else { $ph_cl{$phone} = [$_]; } - } - foreach $key ( keys %ph_cl ) { - print "@{ $ph_cl{$key} }\n" - } + perl -e 'while(<>){ chomp($_); $phone = $_; next if ($phone eq "sil"); + m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$1} .= "$phone "; } + foreach $l (values %q) {print "$l\n";} ' | sort -k1 > $dict_dir/nonsilence_phones.txt || exit 1; echo sil > $dict_dir/silence_phones.txt diff --git a/egs/aishell/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/aishell/s5/local/chain/tuning/run_tdnn_1a.sh index a0b183e3c5a..b38fa4d9c7a 100755 --- a/egs/aishell/s5/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/aishell/s5/local/chain/tuning/run_tdnn_1a.sh @@ -90,7 +90,7 @@ if [ $stage -le 10 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/aishell/s5/local/chain/tuning/run_tdnn_2a.sh b/egs/aishell/s5/local/chain/tuning/run_tdnn_2a.sh new file mode 100755 index 00000000000..6b7223785d9 --- /dev/null +++ b/egs/aishell/s5/local/chain/tuning/run_tdnn_2a.sh @@ -0,0 +1,211 @@ +#!/bin/bash + +# This script is based on run_tdnn_1a.sh. +# This setup used online pitch to train the neural network. +# It requires a online_pitch.conf in the conf dir. 
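That configuration file is added for aishell by this same patch (egs/aishell/s5/conf/online_pitch.conf); for reference, a sketch that recreates it with the committed settings:
```
# Write conf/online_pitch.conf with the same settings this patch adds for aishell.
cat > conf/online_pitch.conf <<'EOF'
--sample-frequency=16000
--simulate-first-pass-online=true
--normalization-right-context=25
--frames-per-chunk=10
EOF
```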
+ +set -e + +# configs for 'chain' +affix= +stage=0 +train_stage=-10 +get_egs_stage=-10 +dir=exp/chain/tdnn_2a # Note: _sp will get added to this +decode_iter= + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=2 +num_jobs_final=12 +minibatch_size=128 +frames_per_eg=150,110,90 +remove_egs=true +common_egs_dir= +xent_regularize=0.1 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 9 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 5000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 10 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=43 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=625 + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=625 + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=625 + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=625 + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=625 + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=625 + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=625 target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=625 target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 11 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aishell-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires_online \ + --tree-dir $treedir \ + --lat-dir exp/tri5a_sp_lats \ + --dir $dir || exit 1; +fi + +if [ $stage -le 12 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph +fi + +graph_dir=$dir/graph +if [ $stage -le 13 ]; then + for test_set in dev test; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 10 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_$test_set \ + $graph_dir data/${test_set}_hires_online $dir/decode_${test_set} || exit 1; + done +fi + +if [ $stage -le 14 ]; then + steps/online/nnet3/prepare_online_decoding.sh --mfcc-config conf/mfcc_hires.conf \ + --add-pitch true \ + $lang exp/nnet3/extractor "$dir" ${dir}_online || exit 1; +fi + +dir=${dir}_online +if [ $stage -le 15 ]; then + for test_set in dev test; do + steps/online/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 10 --cmd "$decode_cmd" \ + --config conf/decode.config \ + $graph_dir data/${test_set}_hires_online $dir/decode_${test_set} || exit 1; + done +fi + +if [ $stage -le 16 ]; then + for test_set in dev test; do + steps/online/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 10 --cmd "$decode_cmd" --per-utt true \ + --config conf/decode.config \ + $graph_dir data/${test_set}_hires_online $dir/decode_${test_set}_per_utt || exit 1; + done +fi + +exit; diff --git a/egs/aishell/s5/local/download_and_untar.sh b/egs/aishell/s5/local/download_and_untar.sh index 3578a1c0835..58a278241d7 100755 --- a/egs/aishell/s5/local/download_and_untar.sh +++ b/egs/aishell/s5/local/download_and_untar.sh @@ -57,7 +57,7 @@ if [ -f $data/$part.tgz ]; then if ! $size_ok; then echo "$0: removing existing file $data/$part.tgz because its size in bytes $size" echo "does not equal the size of one of the archives." - rm $data/$part.gz + rm $data/$part.tgz else echo "$data/$part.tgz exists and appears to be complete." 
fi diff --git a/egs/aishell/s5/local/nnet3/run_ivector_common.sh b/egs/aishell/s5/local/nnet3/run_ivector_common.sh index 1643e6381b1..af0ae122372 100755 --- a/egs/aishell/s5/local/nnet3/run_ivector_common.sh +++ b/egs/aishell/s5/local/nnet3/run_ivector_common.sh @@ -14,7 +14,7 @@ stage=0 train_set=train test_sets="dev test" gmm=tri5a - +online=false nnet3_affix= . ./cmd.sh @@ -31,6 +31,11 @@ for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do fi done +online_affix= +if [ $online = true ]; then + online_affix=_online +fi + if [ $stage -le 1 ]; then # Although the nnet will be trained by high resolution data, we still have to # perturb the normal data to get the alignment _sp stands for speed-perturbed @@ -54,26 +59,26 @@ if [ $stage -le 3 ]; then # Create high-resolution MFCC features (with 40 cepstra instead of 13). # this shows how you can split across multiple file-systems. echo "$0: creating high-resolution MFCC features" - mfccdir=mfcc_perturbed_hires + mfccdir=mfcc_perturbed_hires$online_affix if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then utils/create_split_dir.pl /export/b0{5,6,7,8}/$USER/kaldi-data/mfcc/aishell-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage fi for datadir in ${train_set}_sp ${test_sets}; do - utils/copy_data_dir.sh data/$datadir data/${datadir}_hires + utils/copy_data_dir.sh data/$datadir data/${datadir}_hires$online_affix done # do volume-perturbation on the training data prior to extracting hires # features; this helps make trained nnets more invariant to test data volume. - utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires || exit 1; + utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires$online_affix || exit 1; for datadir in ${train_set}_sp ${test_sets}; do - steps/make_mfcc_pitch.sh --nj 10 --mfcc-config conf/mfcc_hires.conf \ - --cmd "$train_cmd" data/${datadir}_hires exp/make_hires/$datadir $mfccdir || exit 1; - steps/compute_cmvn_stats.sh data/${datadir}_hires exp/make_hires/$datadir $mfccdir || exit 1; - utils/fix_data_dir.sh data/${datadir}_hires || exit 1; + steps/make_mfcc_pitch$online_affix.sh --nj 10 --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${datadir}_hires$online_affix exp/make_hires/$datadir $mfccdir || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_hires$online_affix exp/make_hires/$datadir $mfccdir || exit 1; + utils/fix_data_dir.sh data/${datadir}_hires$online_affix || exit 1; # create MFCC data dir without pitch to extract iVector - utils/data/limit_feature_dim.sh 0:39 data/${datadir}_hires data/${datadir}_hires_nopitch || exit 1; + utils/data/limit_feature_dim.sh 0:39 data/${datadir}_hires$online_affix data/${datadir}_hires_nopitch || exit 1; steps/compute_cmvn_stats.sh data/${datadir}_hires_nopitch exp/make_hires/$datadir $mfccdir || exit 1; done fi diff --git a/egs/aishell/s5/local/nnet3/run_tdnn.sh b/egs/aishell/s5/local/nnet3/run_tdnn.sh deleted file mode 100755 index 3cb8cd861a3..00000000000 --- a/egs/aishell/s5/local/nnet3/run_tdnn.sh +++ /dev/null @@ -1,117 +0,0 @@ -#!/bin/bash - -# This script is based on swbd/s5c/local/nnet3/run_tdnn.sh - -# this is the standard "tdnn" system, built in nnet3; it's what we use to -# call multi-splice. - -# At this script level we don't support not running on GPU, as it would be painfully slow. -# If you want to run without GPU you'd have to call train_tdnn.sh with --gpu false, -# --num-threads 16 and --minibatch-size 128. 
-set -e - -stage=0 -train_stage=-10 -affix= -common_egs_dir= - -# training options -initial_effective_lrate=0.0015 -final_effective_lrate=0.00015 -num_epochs=4 -num_jobs_initial=2 -num_jobs_final=12 -remove_egs=true - -# feature options -use_ivectors=true - -# End configuration section. - -. ./cmd.sh -. ./path.sh -. ./utils/parse_options.sh - -if ! cuda-compiled; then - cat < $dir/configs/network.xconfig - input dim=100 name=ivector - input dim=43 name=input - - # please note that it is important to have input layer with the name=input - # as the layer immediately preceding the fixed-affine-layer to enable - # the use of short notation for the descriptor - fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat - - # the first splicing is moved before the lda layer, so no splicing here - relu-batchnorm-layer name=tdnn1 dim=850 - relu-batchnorm-layer name=tdnn2 dim=850 input=Append(-1,0,2) - relu-batchnorm-layer name=tdnn3 dim=850 input=Append(-3,0,3) - relu-batchnorm-layer name=tdnn4 dim=850 input=Append(-7,0,2) - relu-batchnorm-layer name=tdnn5 dim=850 input=Append(-3,0,3) - relu-batchnorm-layer name=tdnn6 dim=850 - output-layer name=output input=tdnn6 dim=$num_targets max-change=1.5 -EOF - steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ -fi - -if [ $stage -le 8 ]; then - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then - utils/create_split_dir.pl \ - /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aishell-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage - fi - - steps/nnet3/train_dnn.py --stage=$train_stage \ - --cmd="$decode_cmd" \ - --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ - --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ - --trainer.num-epochs $num_epochs \ - --trainer.optimization.num-jobs-initial $num_jobs_initial \ - --trainer.optimization.num-jobs-final $num_jobs_final \ - --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ - --trainer.optimization.final-effective-lrate $final_effective_lrate \ - --egs.dir "$common_egs_dir" \ - --cleanup.remove-egs $remove_egs \ - --cleanup.preserve-model-interval 500 \ - --use-gpu true \ - --feat-dir=data/${train_set}_hires \ - --ali-dir $ali_dir \ - --lang data/lang \ - --reporting.email="$reporting_email" \ - --dir=$dir || exit 1; -fi - -if [ $stage -le 9 ]; then - # this version of the decoding treats each utterance separately - # without carrying forward speaker information. 
- for decode_set in dev test; do - num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` - decode_dir=${dir}/decode_$decode_set - steps/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" \ - --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ - $graph_dir data/${decode_set}_hires $decode_dir || exit 1; - done -fi - -wait; -exit 0; diff --git a/egs/aishell/s5/local/nnet3/run_tdnn.sh b/egs/aishell/s5/local/nnet3/run_tdnn.sh new file mode 120000 index 00000000000..34499362831 --- /dev/null +++ b/egs/aishell/s5/local/nnet3/run_tdnn.sh @@ -0,0 +1 @@ +tuning/run_tdnn_1a.sh \ No newline at end of file diff --git a/egs/aishell/s5/local/nnet3/tuning/run_tdnn_1a.sh b/egs/aishell/s5/local/nnet3/tuning/run_tdnn_1a.sh new file mode 100755 index 00000000000..3cb8cd861a3 --- /dev/null +++ b/egs/aishell/s5/local/nnet3/tuning/run_tdnn_1a.sh @@ -0,0 +1,117 @@ +#!/bin/bash + +# This script is based on swbd/s5c/local/nnet3/run_tdnn.sh + +# this is the standard "tdnn" system, built in nnet3; it's what we use to +# call multi-splice. + +# At this script level we don't support not running on GPU, as it would be painfully slow. +# If you want to run without GPU you'd have to call train_tdnn.sh with --gpu false, +# --num-threads 16 and --minibatch-size 128. +set -e + +stage=0 +train_stage=-10 +affix= +common_egs_dir= + +# training options +initial_effective_lrate=0.0015 +final_effective_lrate=0.00015 +num_epochs=4 +num_jobs_initial=2 +num_jobs_final=12 +remove_egs=true + +# feature options +use_ivectors=true + +# End configuration section. + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=43 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=850 + relu-batchnorm-layer name=tdnn2 dim=850 input=Append(-1,0,2) + relu-batchnorm-layer name=tdnn3 dim=850 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn4 dim=850 input=Append(-7,0,2) + relu-batchnorm-layer name=tdnn5 dim=850 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn6 dim=850 + output-layer name=output input=tdnn6 dim=$num_targets max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 8 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aishell-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/train_dnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --cleanup.preserve-model-interval 500 \ + --use-gpu true \ + --feat-dir=data/${train_set}_hires \ + --ali-dir $ali_dir \ + --lang data/lang \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 9 ]; then + # this version of the decoding treats each utterance separately + # without carrying forward speaker information. + for decode_set in dev test; do + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + decode_dir=${dir}/decode_$decode_set + steps/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $decode_dir || exit 1; + done +fi + +wait; +exit 0; diff --git a/egs/aishell/s5/local/nnet3/tuning/run_tdnn_2a.sh b/egs/aishell/s5/local/nnet3/tuning/run_tdnn_2a.sh new file mode 100755 index 00000000000..603149585f2 --- /dev/null +++ b/egs/aishell/s5/local/nnet3/tuning/run_tdnn_2a.sh @@ -0,0 +1,145 @@ +#!/bin/bash + +# This script is based on aishell/s5/local/nnet3/tuning/run_tdnn_1a.sh + +# In this script, the neural network in trained based on hires mfcc and online pitch. +# The online pitch setup requires a online_pitch.conf in the conf dir for both training +# and testing. + +set -e + +stage=0 +train_stage=-10 +affix= +common_egs_dir= + +# training options +initial_effective_lrate=0.0015 +final_effective_lrate=0.00015 +num_epochs=4 +num_jobs_initial=2 +num_jobs_final=12 +remove_egs=true + +# feature options +use_ivectors=true + +# End configuration section. + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=43 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=850 + relu-batchnorm-layer name=tdnn2 dim=850 input=Append(-1,0,2) + relu-batchnorm-layer name=tdnn3 dim=850 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn4 dim=850 input=Append(-7,0,2) + relu-batchnorm-layer name=tdnn5 dim=850 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn6 dim=850 + output-layer name=output input=tdnn6 dim=$num_targets max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 8 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aishell-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/train_dnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --cleanup.preserve-model-interval 500 \ + --use-gpu true \ + --feat-dir=data/${train_set}_hires_online \ + --ali-dir $ali_dir \ + --lang data/lang \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 9 ]; then + # this version of the decoding treats each utterance separately + # without carrying forward speaker information. + for decode_set in dev test; do + num_jobs=`cat data/${decode_set}_hires_online/utt2spk|cut -d' ' -f2|sort -u|wc -l` + decode_dir=${dir}/decode_$decode_set + steps/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires_online $decode_dir || exit 1; + done +fi + +if [ $stage -le 10 ]; then + steps/online/nnet3/prepare_online_decoding.sh --mfcc-config conf/mfcc_hires.conf \ + --add-pitch true \ + data/lang exp/nnet3/extractor "$dir" ${dir}_online || exit 1; +fi + +if [ $stage -le 11 ]; then + # do the actual online decoding with iVectors, carrying info forward from + # previous utterances of the same speaker. + for decode_set in dev test; do + num_jobs=`cat data/${decode_set}_hires_online/utt2spk|cut -d' ' -f2|sort -u|wc -l` + decode_dir=${dir}_online/decode_$decode_set + steps/online/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" \ + --config conf/decode.config \ + $graph_dir data/${decode_set}_hires_online $decode_dir || exit 1; + done +fi + +if [ $stage -le 12 ]; then + # this version of the decoding treats each utterance separately + # without carrying forward speaker information. 
+ for decode_set in dev test; do + num_jobs=`cat data/${decode_set}_hires_online/utt2spk|cut -d' ' -f2|sort -u|wc -l` + decode_dir=${dir}_online/decode_${decode_set}_per_utt + steps/online/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" \ + --config conf/decode.config --per-utt true \ + $graph_dir data/${decode_set}_hires_online $decode_dir || exit 1; + done +fi + +wait; +exit 0; diff --git a/egs/aishell/v1/local/aishell_data_prep.sh b/egs/aishell/v1/local/aishell_data_prep.sh index 70d6ba1f3e5..11d131dcdb1 100755 --- a/egs/aishell/v1/local/aishell_data_prep.sh +++ b/egs/aishell/v1/local/aishell_data_prep.sh @@ -40,13 +40,11 @@ n=`cat $train_dir/wav.flist $dev_dir/wav.flist $test_dir/wav.flist | wc -l` # Transcriptions preparation for dir in $train_dir $test_dir; do echo Preparing $dir transcriptions - sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{print $NF}' |\ - sort > $dir/utt.list - sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{i=NF-1;printf("%s %s\n",$NF,$i)}' |\ - sort > $dir/utt2spk_all + sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{print $NF}' > $dir/utt.list + sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{i=NF-1;printf("%s %s\n",$NF,$i)}' > $dir/utt2spk_all paste -d' ' $dir/utt.list $dir/wav.flist > $dir/wav.scp_all utils/filter_scp.pl -f 1 $dir/utt.list $aishell_text_dir/*.txt > $dir/transcripts.txt - awk '{print $1}' $dir/transcripts.txt > $dir/utt.list + awk '{print $1}' $dir/transcripts.txt | sort -u > $dir/utt.list utils/filter_scp.pl -f 1 $dir/utt.list $dir/utt2spk_all | sort -u > $dir/utt2spk utils/filter_scp.pl -f 1 $dir/utt.list $dir/wav.scp_all | sort -u > $dir/wav.scp sort -u $dir/transcripts.txt > $dir/text diff --git a/egs/aishell/v1/local/download_and_untar.sh b/egs/aishell/v1/local/download_and_untar.sh index 0189bad1d4a..3578a1c0835 100755 --- a/egs/aishell/v1/local/download_and_untar.sh +++ b/egs/aishell/v1/local/download_and_untar.sh @@ -15,7 +15,7 @@ if [ $# -ne 3 ]; then echo "Usage: $0 [--remove-archive] " echo "e.g.: $0 /export/a05/xna/data www.openslr.org/resources/33 data_aishell" echo "With --remove-archive it will remove the archive after successfully un-tarring it." - echo " can be one of: data_aishell, resource." + echo " can be one of: data_aishell, resource_aishell." fi data=$1 @@ -28,7 +28,7 @@ if [ ! -d "$data" ]; then fi part_ok=false -list="data_aishell resource" +list="data_aishell resource_aishell" for x in $list; do if [ "$part" == $x ]; then part_ok=true; fi done diff --git a/egs/aishell2/README.md b/egs/aishell2/README.md new file mode 100644 index 00000000000..f87f3819036 --- /dev/null +++ b/egs/aishell2/README.md @@ -0,0 +1,64 @@ +# AISHELL-2 + +AISHELL-2 is by far the largest free speech corpus available for Mandarin ASR research. +## 1. DATA +### Training data +* 1000 hours of speech data (around 1 million utterances) +* 1991 speakers (845 male and 1146 female) +* clean recording environment (studio or quiet living room) +* read speech +* reading prompts from various domain: entertainment, finance, technology, sports, control command, place of interest etc. +* near field recording via 3 parallel channels (iOS, Android, Microphone). +* iOS data is free for non-commercial research and education use (e.g. universities and non-commercial institutes) + +### Evaluation data: +Currently we release AISHELL2-2018A-EVAL, containing: +* dev: 2500 utterances from 5 speakers +* test: 5000 utterances from 10 speakers + +Both sets are available across the three channel conditions. 
+
+Anyone interested can download the sets from [here](http://www.aishelltech.com/aishell_eval). Note that we may update and release other evaluation sets on the website later, targeting different applications and scenarios.
+
+## 2. RECIPE
+Based on the standard Kaldi system, AISHELL-2 provides a self-contained Mandarin ASR recipe, with:
+* a word segmentation module, which is a must-have component for Chinese ASR systems
+* an open-source Mandarin lexicon (DaCiDian, available [here](https://github.com/aishell-foundation/DaCiDian))
+* a simplified GMM training & alignment generation recipe (we stopped at the speaker-independent stage)
+* an LF-MMI TDNN training and decoding recipe
+
+# REFERENCE
+We released a [paper on arXiv](https://arxiv.org/abs/1808.10583) with a more detailed description of the corpus and some preliminary results. If you use AISHELL-2 in your experiments, please cite the paper as below:
+```
+@ARTICLE{aishell2,
+  author = {{Du}, J. and {Na}, X. and {Liu}, X. and {Bu}, H.},
+  title = "{AISHELL-2: Transforming Mandarin ASR Research Into Industrial Scale}",
+  journal = {ArXiv},
+  eprint = {1808.10583},
+  primaryClass = "cs.CL",
+  year = 2018,
+  month = Aug,
+}
+```
+
+# APPLY FOR DATA/CONTACT
+The AISHELL foundation is a non-profit online organization whose members come from the speech industry and research institutes.
+
+We hope the AISHELL-2 corpus and recipe will be beneficial to the entire speech community.
+
+Depending on your location and internet speed, we distribute the corpus in one of two ways:
+* hard-disk delivery
+* cloud-disk downloading
+
+To apply for the AISHELL-2 corpus for free, you need to fill in a very simple application form, confirming that:
+* university department / educational institute information has been fully provided
+* the data will be used only for non-commercial research / education purposes
+
+The AISHELL foundation covers all data distribution fees (including the corpus, hard-disk cost, etc.).
+
+Data re-distribution inside your university department is OK for convenience. However, users are not supposed to re-distribute the data to other universities or educational institutes.
+
+To get the application form, or if you come across any problem with the recipe, contact us via:
+
+aishell.foundation@gmail.com
+
diff --git a/egs/aishell2/README.txt b/egs/aishell2/README.txt
deleted file mode 100644
index e8b4260f2bb..00000000000
--- a/egs/aishell2/README.txt
+++ /dev/null
@@ -1,50 +0,0 @@
-# AISHELL-2
-
-AISHELL-2 is by far the largest free speech corpus available for Mandarin ASR research.
-## 1. DATA
-### training data
-* 1000 hours of speech data (around 1 million utterances)
-* 1991 speakers (845 male and 1146 female)
-* clean recording environment(studio or quiet living room)
-* read speech
-* reading prompts from various domain: entertainment, finance, technology, sports, control command, place of interest etc.
-* near field recording via 3 parallel channels(iOS, Android, Microphone).
-* iOS data is free for non-commercial research and education use (e.g. universities and colleges)
-
-### evaluation data:
-Currently we release AISHELL2-2018A-EVAL, containing:
-* dev: 2500 utterances from 5 speaker
-* test: 5000 utterances from 10 speakers
-
-you can download above evaluation set from:
-http://www.aishelltech.com/aishell_eval
-
-we may update and release other evaluation sets on the website later, targeting on different applications and senarios.
-
-## 2. 
RECIPE -Based on Kaldi standard system, AISHELL-2 provides a self-contained Mandarin ASR recipe, with: -* a word segmentation module, which is a must-have component for Chinese ASR systems -* an open-sourced Mandarin lexicon(DaCiDian) -* a simplified GMM training recipe -* acoustic channel adaptation recipe(AM fine-tuning) - -# CONTACT -AISHELL foundation is a non-profit online organization, with members from speech industry and research institutes. - -We hope AISHELL-2 corpus and recipe could be beneficial to the entire speech community. - -Depends on your location and internet speed, we distribute the corpus in two ways: -* hard-disk delivery -* cloud-disk downloading - -To apply for AISHELL-2 corpus for free, you need to fill in a very simple application form, confirming that: -* university department / education institute info -* only for non-commercial research / education use - -AISHELL-foundation covers all data distribution fees (including the corpus, hard-disk cost etc) - -Data re-distribution inside your university department is OK for convenience. However, users are not supposed to re-distribute AISHELL-2 to other universities or education institutes. - -To get the application form, or you come across any problem with the recipe, contact us via: - -aishell.foundation@gmail.com diff --git a/egs/aishell2/s5/conf/online_cmvn.conf b/egs/aishell2/s5/conf/online_cmvn.conf new file mode 100644 index 00000000000..048bdfa65de --- /dev/null +++ b/egs/aishell2/s5/conf/online_cmvn.conf @@ -0,0 +1 @@ +# configuration file for apply-cmvn-online diff --git a/egs/aishell2/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/aishell2/s5/local/chain/tuning/run_tdnn_1a.sh index 459bd64eeb5..86c9becac5b 100755 --- a/egs/aishell2/s5/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/aishell2/s5/local/chain/tuning/run_tdnn_1a.sh @@ -103,7 +103,7 @@ fi if [ $stage -le 10 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) opts="l2-regularize=0.002" linear_opts="orthonormal-constraint=1.0" output_opts="l2-regularize=0.0005 bottleneck-dim=256" diff --git a/egs/aishell2/s5/local/chain/tuning/run_tdnn_1b.sh b/egs/aishell2/s5/local/chain/tuning/run_tdnn_1b.sh index 30a19293181..d8560e63909 100755 --- a/egs/aishell2/s5/local/chain/tuning/run_tdnn_1b.sh +++ b/egs/aishell2/s5/local/chain/tuning/run_tdnn_1b.sh @@ -3,18 +3,17 @@ # _1b is as _1a, but with pitch feats, i-vector and dropout schedule added, referenced from wsj # basic info: -# steps/info/chain_dir_info.pl exp/chain/tdnn_1b_all_sp/ -# exp/chain/tdnn_1b_all_sp/: num-iters=1446 nj=2..2 num-params=19.3M dim=43+100->4456 combine=-0.079->-0.075 (over 9) xent:train/valid[962,1445,final]=(-0.922,-0.795,-0.746/-0.960,-0.840,-0.785) logprob:train/valid[962,1445,final]=(-0.084,-0.072,-0.070/-0.085,-0.075,-0.071) +# steps/info/chain_dir_info.pl exp/chain/tdnn_1f_nopitch_ivec_sp/exp/chain/tdnn_1f_nopitch_ivec_sp/: num-iters=578 nj=2..8 num-params=19.3M dim=43+100->4520 combine=-0.082->-0.081 (over 6) xent:train/valid[384,577,final]=(-0.863,-0.752,-0.740/-0.901,-0.791,-0.784) logprob:train/valid[384,577,final]=(-0.083,-0.076,-0.075/-0.084,-0.077,-0.076) # results: -# local/chain/compare_wer.sh exp/chain/tdnn_1d_all_sp/ -# Model tdnn_1d_all_sp +# local/chain/compare_wer.sh exp/chain/tdnn_1f_nopitch_ivec_sp/ +# Model tdnn_1f_nopitch_ivec_sp # Num. 
of params 19.3M -# WER(%) 8.84 -# Final train prob -0.0696 -# Final valid prob -0.0714 -# Final train prob (xent) -0.7458 -# Final valid prob (xent) -0.7854 +# WER(%) 8.81 +# Final train prob -0.0749 +# Final valid prob -0.0756 +# Final train prob (xent) -0.7401 +# Final valid prob (xent) -0.7837 set -e @@ -68,9 +67,12 @@ if [ $stage -le 5 ]; then mfccdir=mfcc_hires for datadir in ${train_set} ${test_sets}; do utils/copy_data_dir.sh data/${datadir} data/${datadir}_hires - utils/data/perturb_data_dir_volume.sh data/${datadir}_hires || exit 1; - steps/make_mfcc_pitch.sh --mfcc-config conf/mfcc_hires.conf --pitch-config conf/pitch.conf \ + utils/data/perturb_data_dir_volume.sh data/${datadir}_hires || exit 1; + steps/make_mfcc_pitch.sh --mfcc-config conf/mfcc_hires.conf --pitch-config conf/pitch.conf \ --nj $nj data/${datadir}_hires exp/make_mfcc/ ${mfccdir} + steps/compute_cmvn_stats.sh data/${datadir}_hires exp/make_mfcc ${mfccdir} + utils/data/limit_feature_dim.sh 0:39 data/${datadir}_hires data/${datadir}_hires_nopitch + steps/compute_cmvn_stats.sh data/${datadir}_hires_nopitch exp/make_mfcc ${mfccdir} done fi @@ -81,15 +83,11 @@ if [ $stage -le 6 ]; then mkdir -p exp/chain/diag_ubm_${affix} temp_data_root=exp/chain/diag_ubm_${affix} - num_utts_total=$(wc -l < data/${train_set}_hires/utt2spk) + num_utts_total=$(wc -l < data/${train_set}_hires_nopitch/utt2spk) num_utts=$[$num_utts_total/4] - utils/data/subset_data_dir.sh data/${train_set}_hires \ + utils/data/subset_data_dir.sh data/${train_set}_hires_nopitch \ $num_utts ${temp_data_root}/${train_set}_subset - #echo "$0: get cmvn stats if not there for subset" - #[ -f ${temp_data_root}/${train_set}_subset/cmvn.scp ] || \ - steps/compute_cmvn_stats.sh ${temp_data_root}/${train_set}_subset || exit 1; - echo "$0: computing a PCA transform from the hires data." 
steps/online/nnet2/get_pca_transform.sh --cmd "$train_cmd" \ --splice-opts "--left-context=3 --right-context=3" \ @@ -108,13 +106,13 @@ if [ $stage -le 6 ]; then echo "$0: training the iVector extractor" steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj $nj \ - data/${train_set}_hires exp/chain/diag_ubm_${affix} \ + data/${train_set}_hires_nopitch exp/chain/diag_ubm_${affix} \ exp/chain/extractor_${affix} || exit 1; for datadir in ${train_set} ${test_sets}; do - steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/${datadir}_hires data/${datadir}_hires_max2 + steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/${datadir}_hires_nopitch data/${datadir}_hires_nopitch_max2 steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj $nj \ - data/${datadir}_hires_max2 exp/chain/extractor_${affix} exp/chain/ivectors_${datadir}_${affix} || exit 1; + data/${datadir}_hires_nopitch_max2 exp/chain/extractor_${affix} exp/chain/ivectors_${datadir}_${affix} || exit 1; done fi @@ -152,7 +150,7 @@ if [ $stage -le 10 ]; then echo "$0: creating neural net configs using the xconfig parser"; feat_dim=$(feat-to-dim scp:data/${train_set}_hires/feats.scp -) num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) opts="l2-regularize=0.002" linear_opts="orthonormal-constraint=1.0" output_opts="l2-regularize=0.0005 bottleneck-dim=256" diff --git a/egs/aishell2/s5/local/prepare_data.sh b/egs/aishell2/s5/local/prepare_data.sh index 419d8eddfd1..4be9664ac31 100755 --- a/egs/aishell2/s5/local/prepare_data.sh +++ b/egs/aishell2/s5/local/prepare_data.sh @@ -45,8 +45,9 @@ utils/filter_scp.pl -f 1 $tmp/utt.list $tmp/tmp_wav.scp | sort -k 1 | uniq > $tm python -c "import jieba" 2>/dev/null || \ (echo "jieba is not found. Use tools/extra/install_jieba.sh to install it." && exit 1;) utils/filter_scp.pl -f 1 $tmp/utt.list $corpus/trans.txt | sort -k 1 | uniq > $tmp/trans.txt -awk '{print $1}' $dict_dir/lexicon.txt | sort | uniq | awk 'BEGIN{idx=0}{print $1,idx++}'> $tmp/vocab.txt -python local/word_segmentation.py $tmp/vocab.txt $tmp/trans.txt > $tmp/text +# jieba's vocab format requires word count(frequency), set to 99 +awk '{print $1}' $dict_dir/lexicon.txt | sort | uniq | awk '{print $1,99}'> $tmp/word_seg_vocab.txt +python local/word_segmentation.py $tmp/word_seg_vocab.txt $tmp/trans.txt > $tmp/text # utt2spk & spk2utt awk -F'\t' '{print $2}' $tmp/wav.scp > $tmp/wav.list diff --git a/egs/aishell2/s5/local/prepare_dict.sh b/egs/aishell2/s5/local/prepare_dict.sh index d59585273a7..56ab885ae94 100755 --- a/egs/aishell2/s5/local/prepare_dict.sh +++ b/egs/aishell2/s5/local/prepare_dict.sh @@ -10,7 +10,7 @@ download_dir=data/local/DaCiDian dir=data/local/dict -if [ $# -ne 1 ]; then +if [ $# -ne 1 ]; then echo "Usage: $0 "; exit 1; fi @@ -18,7 +18,9 @@ fi dir=$1 # download the DaCiDian from github -git clone https://github.com/aishell-foundation/DaCiDian.git $download_dir +if [ ! 
-d $download_dir ]; then + git clone https://github.com/aishell-foundation/DaCiDian.git $download_dir +fi # here we map to the phone spn(spoken noise) mkdir -p $dir @@ -27,21 +29,9 @@ echo -e "\tspn" >> $dir/lexicon.txt # prepare silence_phones.txt, nonsilence_phones.txt, optional_silence.txt, extra_questions.txt cat $dir/lexicon.txt | awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{for (p in phones) print p;}'| \ - sort -u |\ - perl -e ' - my %ph_cl; - while () { - $phone = $_; - chomp($phone); - chomp($_); - $phone = $_; - next if ($phone eq "sil"); - if (exists $ph_cl{$phone}) { push(@{$ph_cl{$phone}}, $_) } - else { $ph_cl{$phone} = [$_]; } - } - foreach $key ( keys %ph_cl ) { - print "@{ $ph_cl{$key} }\n" - } + perl -e 'while(<>){ chomp($_); $phone = $_; next if ($phone eq "sil"); + m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$1} .= "$phone "; } + foreach $l (values %q) {print "$l\n";} ' | sort -k1 > $dir/nonsilence_phones.txt || exit 1; echo sil > $dir/silence_phones.txt @@ -49,9 +39,8 @@ echo sil > $dir/optional_silence.txt cat $dir/silence_phones.txt | awk '{printf("%s ", $1);} END{printf "\n";}' > $dir/extra_questions.txt || exit 1; cat $dir/nonsilence_phones.txt | perl -e 'while(<>){ foreach $p (split(" ", $_)) { - $p =~ m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$2} .= "$p "; } } foreach $l (values %q) {print "$l\n";}' \ + $p =~ m:^([^\d]+)(\d*)$: || die "Bad phone $_"; if($p eq "\$0"){$q{""} .= "$p ";}else{$q{$2} .= "$p ";} } } foreach $l (values %q) {print "$l\n";}' \ >> $dir/extra_questions.txt || exit 1; echo "local/prepare_dict.sh succeeded" exit 0; - diff --git a/egs/aishell2/s5/local/word_segmentation.py b/egs/aishell2/s5/local/word_segmentation.py index 1cb2c1e7350..4ce55a2003e 100644 --- a/egs/aishell2/s5/local/word_segmentation.py +++ b/egs/aishell2/s5/local/word_segmentation.py @@ -4,6 +4,7 @@ # 2018 Beijing Shell Shell Tech. Co. Ltd. (Author: Hui BU) # Apache 2.0 +from __future__ import print_function import sys import jieba reload(sys) @@ -19,6 +20,6 @@ jieba.set_dictionary(vocab_file) for line in open(trans_file): key,trans = line.strip().split('\t',1) - words = jieba.cut(trans) + words = jieba.cut(trans, HMM=False) # turn off new word discovery (HMM-based) new_line = key + '\t' + " ".join(words) print(new_line) diff --git a/egs/ami/s5/local/ami_download.sh b/egs/ami/s5/local/ami_download.sh index b14f8550c75..cba130c8467 100755 --- a/egs/ami/s5/local/ami_download.sh +++ b/egs/ami/s5/local/ami_download.sh @@ -53,12 +53,12 @@ cat local/split_train.orig local/split_eval.orig local/split_dev.orig > $wdir/am wgetfile=$wdir/wget_$mic.sh # TODO fix this with Pawel, files don't exist anymore, -manifest="wget --continue -O $adir/MANIFEST.TXT http://groups.inf.ed.ac.uk/ami/download/temp/amiBuild-04237-Sun-Jun-15-2014.manifest.txt" -license="wget --continue -O $adir/LICENCE.TXT http://groups.inf.ed.ac.uk/ami/download/temp/Creative-Commons-Attribution-NonCommercial-ShareAlike-2.5.txt" +manifest="wget --continue -O $adir/MANIFEST.TXT http://groups.inf.ed.ac.uk/ami/download/temp/amiBuild-0153-Tue-Oct-2-2018.manifest.txt" + echo "#!/bin/bash" > $wgetfile echo $manifest >> $wgetfile -echo $license >> $wgetfile + while read line; do if [ "$mic" == "ihm" ]; then extra_headset= #some meetings have 5 sepakers (headsets) @@ -100,8 +100,7 @@ else fi fi -echo "Downloads of AMI corpus completed succesfully. License can be found under $adir/LICENCE.TXT" +echo "Downloads of AMI corpus completed succesfully." 
exit 0; - diff --git a/egs/ami/s5/local/ami_ihm_scoring_data_prep.sh b/egs/ami/s5/local/ami_ihm_scoring_data_prep.sh index 3157d7ffec7..7112e0259a0 100755 --- a/egs/ami/s5/local/ami_ihm_scoring_data_prep.sh +++ b/egs/ami/s5/local/ami_ihm_scoring_data_prep.sh @@ -87,18 +87,15 @@ sort -k 2 $dir/utt2spk | utils/utt2spk_to_spk2utt.pl > $dir/spk2utt || exit 1; join $dir/utt2spk $dir/segments | \ perl -ne '{BEGIN{$pu=""; $pt=0.0;} split; if ($pu eq $_[1] && $pt > $_[3]) { - print "$_[0] $_[2] $_[3] $_[4]>$_[0] $_[2] $pt $_[4]\n" + print "s/^$_[0] $_[2] $_[3] $_[4]\$/$_[0] $_[2] $pt $_[4]/;\n" } - $pu=$_[1]; $pt=$_[4]; + $pu=$_[1]; $pt=$_[4]; }' > $dir/segments_to_fix -if [ `cat $dir/segments_to_fix | wc -l` -gt 0 ]; then + +if [ -s $dir/segments_to_fix ]; then echo "$0. Applying following fixes to segments" cat $dir/segments_to_fix - while read line; do - p1=`echo $line | awk -F'>' '{print $1}'` - p2=`echo $line | awk -F'>' '{print $2}'` - sed -ir "s!$p1!$p2!" $dir/segments - done < $dir/segments_to_fix + perl -i -pf $dir/segments_to_fix $dir/segments fi # Copy stuff into its final locations diff --git a/egs/ami/s5/local/ami_mdm_scoring_data_prep.sh b/egs/ami/s5/local/ami_mdm_scoring_data_prep.sh index 4cfa9110edf..9c4b55308f2 100755 --- a/egs/ami/s5/local/ami_mdm_scoring_data_prep.sh +++ b/egs/ami/s5/local/ami_mdm_scoring_data_prep.sh @@ -94,19 +94,15 @@ awk '{print $1}' $tmpdir/segments | \ join $tmpdir/utt2spk_stm $tmpdir/segments | \ awk '{ utt=$1; spk=$2; wav=$3; t_beg=$4; t_end=$5; if(spk_prev == spk && t_end_prev > t_beg) { - print utt, wav, t_beg, t_end">"utt, wav, t_end_prev, t_end; + print "s/^"utt, wav, t_beg, t_end"$/"utt, wav, t_end_prev, t_end"/;"; } spk_prev=spk; t_end_prev=t_end; }' > $tmpdir/segments_to_fix -if [ `cat $tmpdir/segments_to_fix | wc -l` -gt 0 ]; then +if [ -s $tmpdir/segments_to_fix ]; then echo "$0. Applying following fixes to segments" cat $tmpdir/segments_to_fix - while read line; do - p1=`echo $line | awk -F'>' '{print $1}'` - p2=`echo $line | awk -F'>' '{print $2}'` - sed -ir "s:$p1:$p2:" $tmpdir/segments - done < $tmpdir/segments_to_fix + perl -i -pf $tmpdir/segments_to_fix $tmpdir/segments fi # Copy stuff into its final locations [this has been moved from the format_data diff --git a/egs/ami/s5/local/ami_sdm_scoring_data_prep.sh b/egs/ami/s5/local/ami_sdm_scoring_data_prep.sh index 91baa37d6e1..815e1b2d270 100755 --- a/egs/ami/s5/local/ami_sdm_scoring_data_prep.sh +++ b/egs/ami/s5/local/ami_sdm_scoring_data_prep.sh @@ -101,19 +101,15 @@ awk '{print $1}' $tmpdir/segments | \ join $tmpdir/utt2spk_stm $tmpdir/segments | \ awk '{ utt=$1; spk=$2; wav=$3; t_beg=$4; t_end=$5; if(spk_prev == spk && t_end_prev > t_beg) { - print utt, wav, t_beg, t_end">"utt, wav, t_end_prev, t_end; + print "s/^"utt, wav, t_beg, t_end"$/"utt, wav, t_end_prev, t_end"/;"; } spk_prev=spk; t_end_prev=t_end; }' > $tmpdir/segments_to_fix -if [ `cat $tmpdir/segments_to_fix | wc -l` -gt 0 ]; then +if [ -s $tmpdir/segments_to_fix ]; then echo "$0. 
Applying following fixes to segments" cat $tmpdir/segments_to_fix - while read line; do - p1=`echo $line | awk -F'>' '{print $1}'` - p2=`echo $line | awk -F'>' '{print $2}'` - sed -ir "s:$p1:$p2:" $tmpdir/segments - done < $tmpdir/segments_to_fix + perl -i -pf $tmpdir/segments_to_fix $tmpdir/segments fi # Copy stuff into its final locations [this has been moved from the format_data diff --git a/egs/ami/s5/local/sort_bad_utts.py b/egs/ami/s5/local/sort_bad_utts.py index f84fcb12608..baabdc73508 100644 --- a/egs/ami/s5/local/sort_bad_utts.py +++ b/egs/ami/s5/local/sort_bad_utts.py @@ -1,5 +1,6 @@ #!/usr/bin/env python +from __future__ import print_function import sys import argparse import logging @@ -38,10 +39,10 @@ def GetSortedWers(utt_info_file): utt_wer_sorted = sorted(utt_wer, key = lambda k : k[1]) try: import numpy as np - bins = range(0,105,5) + bins = list(range(0,105,5)) bins.append(sys.float_info.max) - hist, bin_edges = np.histogram(map(lambda x: x[1], utt_wer_sorted), + hist, bin_edges = np.histogram([x[1] for x in utt_wer_sorted], bins = bins) num_utts = len(utt_wer) string = '' diff --git a/egs/ami/s5/local/tfrnnlm/run_lstm.sh b/egs/ami/s5/local/tfrnnlm/run_lstm.sh index 31ae4a8bad7..d68fadb10f3 100755 --- a/egs/ami/s5/local/tfrnnlm/run_lstm.sh +++ b/egs/ami/s5/local/tfrnnlm/run_lstm.sh @@ -27,7 +27,7 @@ mkdir -p $dir if [ $stage -le 2 ]; then # the following script uses TensorFlow. You could use tools/extras/install_tensorflow_py.sh to install it $cuda_cmd $dir/train_rnnlm.log utils/parallel/limit_num_gpus.sh \ - python steps/tfrnnlm/lstm.py --data-path=$dir --save-path=$dir/rnnlm --vocab-path=$dir/wordlist.rnn.final + python steps/tfrnnlm/lstm.py --data_path=$dir --save_path=$dir/rnnlm --vocab_path=$dir/wordlist.rnn.final fi final_lm=ami_fsh.o3g.kn @@ -39,7 +39,7 @@ if [ $stage -le 3 ]; then decode_dir=${basedir}/decode_${decode_set} # Lattice rescoring - steps/lmrescore_rnnlm_lat.sh \ + steps/tfrnnlm/lmrescore_rnnlm_lat.sh \ --cmd "$tfrnnlm_cmd --mem 16G" \ --rnnlm-ver tensorflow --weight $weight --max-ngram-order $ngram_order \ data/lang_$LM $dir \ diff --git a/egs/ami/s5/local/tfrnnlm/run_lstm_fast.sh b/egs/ami/s5/local/tfrnnlm/run_lstm_fast.sh index 8dd876c2b2c..4cc71b55b5c 100755 --- a/egs/ami/s5/local/tfrnnlm/run_lstm_fast.sh +++ b/egs/ami/s5/local/tfrnnlm/run_lstm_fast.sh @@ -27,7 +27,7 @@ mkdir -p $dir if [ $stage -le 2 ]; then # the following script uses TensorFlow. You could use tools/extras/install_tensorflow_py.sh to install it $cuda_cmd $dir/train_rnnlm.log utils/parallel/limit_num_gpus.sh \ - python steps/tfrnnlm/lstm_fast.py --data-path=$dir --save-path=$dir/rnnlm --vocab-path=$dir/wordlist.rnn.final + python steps/tfrnnlm/lstm_fast.py --data_path=$dir --save_path=$dir/rnnlm --vocab_path=$dir/wordlist.rnn.final fi final_lm=ami_fsh.o3g.kn diff --git a/egs/ami/s5/local/tfrnnlm/run_vanilla_rnnlm.sh b/egs/ami/s5/local/tfrnnlm/run_vanilla_rnnlm.sh index 7a4635f07a4..7a95f38ba1e 100755 --- a/egs/ami/s5/local/tfrnnlm/run_vanilla_rnnlm.sh +++ b/egs/ami/s5/local/tfrnnlm/run_vanilla_rnnlm.sh @@ -27,7 +27,7 @@ mkdir -p $dir if [ $stage -le 2 ]; then # the following script uses TensorFlow. 
You could use tools/extras/install_tensorflow_py.sh to install it $cuda_cmd $dir/train_rnnlm.log utils/parallel/limit_num_gpus.sh \ - python steps/tfrnnlm/vanilla_rnnlm.py --data-path=$dir --save-path=$dir/rnnlm --vocab-path=$dir/wordlist.rnn.final + python steps/tfrnnlm/vanilla_rnnlm.py --data_path=$dir --save_path=$dir/rnnlm --vocab_path=$dir/wordlist.rnn.final fi final_lm=ami_fsh.o3g.kn @@ -39,7 +39,7 @@ if [ $stage -le 3 ]; then decode_dir=${basedir}/decode_${decode_set} # Lattice rescoring - steps/lmrescore_rnnlm_lat.sh \ + steps/tfrnnlm/lmrescore_rnnlm_lat.sh \ --cmd "$tfrnnlm_cmd --mem 16G" \ --rnnlm-ver tensorflow --weight $weight --max-ngram-order $ngram_order \ data/lang_$LM $dir \ diff --git a/egs/ami/s5b/RESULTS_ihm b/egs/ami/s5b/RESULTS_ihm index 42af5763829..7eb908f685e 100644 --- a/egs/ami/s5b/RESULTS_ihm +++ b/egs/ami/s5b/RESULTS_ihm @@ -86,8 +86,7 @@ %WER 19.8 | 13098 94475 | 83.1 9.6 7.4 2.8 19.8 51.8 | -0.041 | exp/ihm/chain_cleaned/tdnn_lstm1l_sp_bi_ld5/decode_dev/ascore_10/dev_hires.ctm.filt.sys %WER 19.2 | 12643 89964 | 83.2 10.7 6.1 2.5 19.2 49.7 | 0.079 | exp/ihm/chain_cleaned/tdnn_lstm1l_sp_bi_ld5/decode_eval/ascore_10/eval_hires.ctm.filt.sys -# local/chain/multi_condition/tuning/run_tdnn_lstm_1a.sh --mic ihm +# local/chain/multi_condition/tuning/run_tdnn_lstm_1b.sh --mic ihm # cleanup + chain TDNN+LSTM model + IHM reverberated data -%WER 19.4 | 13098 94479 | 83.8 10.0 6.1 3.2 19.4 51.8 | -0.168 | exp/ihm/chain_cleaned_rvb/tdnn_lstm1i_sp_rvb_bi/decode_dev/ascore_10/dev_hires.ctm.filt.sys -%WER 19.3 | 12643 89977 | 83.3 11.0 5.7 2.6 19.3 49.6 | -0.046 | exp/ihm/chain_cleaned_rvb/tdnn_lstm1i_sp_rvb_bi/decode_eval/ascore_10/eval_hires.ctm.filt.sys - +%WER 18.9 | 13098 94488 | 84.1 9.7 6.2 3.0 18.9 51.2 | 0.012 | exp/ihm/chain_cleaned_rvb/tdnn_lstm1b_sp_rvb_bi/decode_dev/ascore_11/dev_hires.ctm.filt.sys +%WER 19.3 | 12643 89989 | 83.1 10.7 6.2 2.5 19.3 50.0 | 0.136 | exp/ihm/chain_cleaned_rvb/tdnn_lstm1b_sp_rvb_bi/decode_eval/ascore_11/eval_hires.ctm.filt.sys diff --git a/egs/ami/s5b/RESULTS_sdm b/egs/ami/s5b/RESULTS_sdm index 0993b2eb52a..584c50f298a 100644 --- a/egs/ami/s5b/RESULTS_sdm +++ b/egs/ami/s5b/RESULTS_sdm @@ -93,9 +93,13 @@ %WER 35.9 | 14900 94497 | 67.8 18.2 14.1 3.7 35.9 62.5 | 0.647 | exp/sdm1/chain_cleaned/tdnn_lstm1l_sp_bi_ihmali_ld5/decode_dev/ascore_9/dev_hires_o4.ctm.filt.sys %WER 39.4 | 13223 89946 | 64.1 19.7 16.2 3.5 39.4 67.0 | 0.611 | exp/sdm1/chain_cleaned/tdnn_lstm1l_sp_bi_ihmali_ld5/decode_eval/ascore_9/eval_hires_o4.ctm.filt.sys -# local/chain/multi_condition/tuning/run_tdnn_lstm_1a.sh --mic sdm1 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned +# local/chain/multi_condition/tuning/run_tdnn_lstm_1b.sh --mic sdm1 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned # cleanup + chain TDNN+LSTM model, SDM original + IHM reverberated data, alignments from ihm data. 
-# *** best system *** -%WER 34.0 | 14455 94497 | 69.8 17.7 12.5 3.8 34.0 63.9 | 0.675 | exp/sdm1/chain_cleaned_rvb/tdnn_lstm1i_sp_rvb_bi_ihmali/decode_dev/ascore_10/dev_hires_o4.ctm.filt.sys -%WER 37.5 | 13261 89982 | 65.9 19.3 14.7 3.5 37.5 66.2 | 0.642 | exp/sdm1/chain_cleaned_rvb/tdnn_lstm1i_sp_rvb_bi_ihmali/decode_eval/ascore_10/eval_hires_o4.ctm.filt.sys +%WER 33.9 | 14185 94492 | 70.3 18.1 11.7 4.2 33.9 66.0 | 0.605 | exp/sdm1/chain_cleaned_rvb/tdnn_lstm1b_sp_rvb_bi_ihmali/decode_dev/ascore_10/dev_hires_o4.ctm.filt.sys +%WER 37.4 | 13610 89969 | 66.3 19.9 13.7 3.7 37.4 65.5 | 0.568 | exp/sdm1/chain_cleaned_rvb/tdnn_lstm1b_sp_rvb_bi_ihmali/decode_eval/ascore_10/eval_hires_o4.ctm.filt.sys +# local/chain/multi_condition/tuning/run_tdnn_1a.sh --mic sdm1 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned +# cleanup + chain TDNN-F model, SDM original + IHM reverberated data, alignments from ihm data. +# *** best system *** +%WER 33.3 | 14696 94538 | 70.4 17.2 12.4 3.7 33.3 63.1 | 0.612 | exp/sdm1/chain_cleaned_rvb/tdnn1a_sp_rvb_bi_ihmali/decode_dev/ascore_10/dev_hires_o4.ctm.filt.sys +%WER 36.7 | 14855 89974 | 66.7 18.9 14.4 3.4 36.7 59.8 | 0.580 | exp/sdm1/chain_cleaned_rvb/tdnn1a_sp_rvb_bi_ihmali/decode_eval/ascore_10/eval_hires_o4.ctm.filt.sys diff --git a/egs/ami/s5b/conf/mfcc_hires80.conf b/egs/ami/s5b/conf/mfcc_hires80.conf new file mode 100644 index 00000000000..5fb03de59c4 --- /dev/null +++ b/egs/ami/s5b/conf/mfcc_hires80.conf @@ -0,0 +1,10 @@ +# config for high-resolution MFCC features, intended for neural network training +# Note: we keep all cepstra, so it has the same info as filterbank features, +# but MFCC is more easily compressible (because less correlated) which is why +# we prefer this method. +--use-energy=false # use average of log energy, not energy. +--num-mel-bins=80 # similar to Google's setup. +--num-ceps=80 # there is no dimensionality reduction. +--low-freq=20 # low cutoff frequency for mel bins... this is high-bandwidth data, so + # there might be some information at the low end. +--high-freq=-400 # high cutoff frequently, relative to Nyquist of 8000 (=7600) diff --git a/egs/ami/s5b/local/ami_ihm_scoring_data_prep.sh b/egs/ami/s5b/local/ami_ihm_scoring_data_prep.sh index 746c42c4c1a..c54876331f1 100755 --- a/egs/ami/s5b/local/ami_ihm_scoring_data_prep.sh +++ b/egs/ami/s5b/local/ami_ihm_scoring_data_prep.sh @@ -93,18 +93,15 @@ sort -k 2 $dir/utt2spk | utils/utt2spk_to_spk2utt.pl > $dir/spk2utt || exit 1; join $dir/utt2spk $dir/segments | \ perl -ne '{BEGIN{$pu=""; $pt=0.0;} split; if ($pu eq $_[1] && $pt > $_[3]) { - print "$_[0] $_[2] $_[3] $_[4]>$_[0] $_[2] $pt $_[4]\n" + print "s/^$_[0] $_[2] $_[3] $_[4]\$/$_[0] $_[2] $pt $_[4]/;\n" } $pu=$_[1]; $pt=$_[4]; }' > $dir/segments_to_fix -if [ `cat $dir/segments_to_fix | wc -l` -gt 0 ]; then + +if [ -s $dir/segments_to_fix ]; then echo "$0. Applying following fixes to segments" cat $dir/segments_to_fix - while read line; do - p1=`echo $line | awk -F'>' '{print $1}'` - p2=`echo $line | awk -F'>' '{print $2}'` - sed -ir "s!$p1!$p2!" 
$dir/segments - done < $dir/segments_to_fix + perl -i -pf $dir/segments_to_fix $dir/segments fi # Copy stuff into its final locations diff --git a/egs/ami/s5b/local/ami_mdm_scoring_data_prep.sh b/egs/ami/s5b/local/ami_mdm_scoring_data_prep.sh index 65f514f223c..475ef5405ba 100755 --- a/egs/ami/s5b/local/ami_mdm_scoring_data_prep.sh +++ b/egs/ami/s5b/local/ami_mdm_scoring_data_prep.sh @@ -99,19 +99,15 @@ awk '{print $1}' $tmpdir/segments | \ join $tmpdir/utt2spk_stm $tmpdir/segments | \ awk '{ utt=$1; spk=$2; wav=$3; t_beg=$4; t_end=$5; if(spk_prev == spk && t_end_prev > t_beg) { - print utt, wav, t_beg, t_end">"utt, wav, t_end_prev, t_end; + print "s/^"utt, wav, t_beg, t_end"$/"utt, wav, t_end_prev, t_end"/;"; } spk_prev=spk; t_end_prev=t_end; }' > $tmpdir/segments_to_fix -if [ `cat $tmpdir/segments_to_fix | wc -l` -gt 0 ]; then +if [ -s $tmpdir/segments_to_fix ]; then echo "$0. Applying following fixes to segments" cat $tmpdir/segments_to_fix - while read line; do - p1=`echo $line | awk -F'>' '{print $1}'` - p2=`echo $line | awk -F'>' '{print $2}'` - sed -ir "s:$p1:$p2:" $tmpdir/segments - done < $tmpdir/segments_to_fix + perl -i -pf $tmpdir/segments_to_fix $tmpdir/segments fi # Copy stuff into its final locations [this has been moved from the format_data diff --git a/egs/ami/s5b/local/ami_sdm_scoring_data_prep.sh b/egs/ami/s5b/local/ami_sdm_scoring_data_prep.sh index 1378f8b8965..580880818fc 100755 --- a/egs/ami/s5b/local/ami_sdm_scoring_data_prep.sh +++ b/egs/ami/s5b/local/ami_sdm_scoring_data_prep.sh @@ -107,29 +107,25 @@ awk '{print $1}' $tmpdir/segments | \ #check and correct the case when segment timings for given speaker overlap themself #(important for simulatenous asclite scoring to proceed). -#There is actually only one such case for devset and automatic segmentetions +#There is actually only one such case for devset and automatic segmentations join $tmpdir/utt2spk_stm $tmpdir/segments | \ awk '{ utt=$1; spk=$2; wav=$3; t_beg=$4; t_end=$5; if(spk_prev == spk && t_end_prev > t_beg) { - print utt, wav, t_beg, t_end">"utt, wav, t_end_prev, t_end; + print "s:[^\\S\\n]+$::;s:^"utt, wav, t_beg, t_end"$:"utt, wav, t_end_prev, t_end":;"; } spk_prev=spk; t_end_prev=t_end; }' > $tmpdir/segments_to_fix -if [ `cat $tmpdir/segments_to_fix | wc -l` -gt 0 ]; then +if [ -s $tmpdir/segments_to_fix ]; then echo "$0. Applying following fixes to segments" cat $tmpdir/segments_to_fix - while read line; do - p1=`echo $line | awk -F'>' '{print $1}'` - p2=`echo $line | awk -F'>' '{print $2}'` - sed -ir "s:$p1:$p2:" $tmpdir/segments - done < $tmpdir/segments_to_fix + perl -i -pf $tmpdir/segments_to_fix $tmpdir/segments fi # Copy stuff into its final locations [this has been moved from the format_data # script] mkdir -p $dir -for f in spk2utt utt2spk utt2spk_stm wav.scp text segments reco2file_and_channel; do +for f in segments_to_fix spk2utt utt2spk utt2spk_stm wav.scp text segments reco2file_and_channel; do cp $tmpdir/$f $dir/$f || exit 1; done diff --git a/egs/ami/s5b/local/chain/multi_condition/run_tdnn.sh b/egs/ami/s5b/local/chain/multi_condition/run_tdnn.sh deleted file mode 100755 index 754a9508e66..00000000000 --- a/egs/ami/s5b/local/chain/multi_condition/run_tdnn.sh +++ /dev/null @@ -1,283 +0,0 @@ -#!/bin/bash - -# This is a chain-training script with TDNN neural networks. -# This script is based on local/chain/tuning/run_tdnn_1a.sh, but adding -# the reverberated IHM data into the train set. -# This script obtains better results on IHM, SDM and MDM tasks. 
- -# Please see RESULTS_* for examples of command lines invoking this script. - -# local/chain/multi_condition/run_tdnn.sh --mic ihm --train-set train_cleaned --gmm tri3_cleaned & -# local/chain/multi_condition/run_tdnn.sh --mic sdm1 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned & -# local/chain/multi_condition/run_tdnn.sh --mic mdm8 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned & - - -set -e -o pipefail - -# First the options that are passed through to run_ivector_common.sh -# (some of which are also used in this script directly). -stage=1 -mic=ihm -nj=30 -min_seg_len=1.55 -use_ihm_ali=false -train_set=train_cleaned -gmm=tri3_cleaned # the gmm for the target data -ihm_gmm=tri3_cleaned # the gmm for the IHM system (if --use-ihm-ali true). -num_threads_ubm=32 -num_data_reps=1 - -# The rest are configs specific to this script. Most of the parameters -# are just hardcoded at this level, in the commands below. -train_stage=-10 -tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. -tdnn_affix= #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. -common_egs_dir= # you can set this to use previously dumped egs. - -# End configuration section. -echo "$0 $@" # Print the command line for logging - -. ./cmd.sh -. ./path.sh -. ./utils/parse_options.sh - -if ! $use_ihm_ali; then - [ "$mic" != "ihm" ] && \ - echo "$0: you cannot specify --use-ihm-ali false if the microphone is not ihm." && \ - exit 1; -else - [ "$mic" == "ihm" ] && \ - echo "$0: you must specify --use-ihm-ali false if the microphone is ihm." && \ - exit 1; -fi - -if ! cuda-compiled; then - cat <data/lang_chain/topo - fi -fi - -if [ $stage -le 13 ]; then - # Get the alignments as lattices (gives the chain training more freedom). - # use the same num-jobs as the alignments - steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ - data/lang $gmm_dir $original_lat_dir - rm $original_lat_dir/fsts.*.gz # save space - - lat_dir_ihmdata=exp/ihm/chain${nnet3_affix}/${gmm}_${train_set}_sp_comb_lats - - mkdir -p $lat_dir/temp/ - mkdir -p $lat_dir/temp2/ - lattice-copy "ark:gunzip -c $original_lat_dir/lat.*.gz |" ark,scp:$lat_dir/temp/lats.ark,$lat_dir/temp/lats.scp - lattice-copy "ark:gunzip -c $lat_dir_ihmdata/lat.*.gz |" ark,scp:$lat_dir/temp2/lats.ark,$lat_dir/temp2/lats.scp - - # copy the lattices for the reverberated data - rm -f $lat_dir/temp/combined_lats.scp - touch $lat_dir/temp/combined_lats.scp - cat $lat_dir/temp/lats.scp >> $lat_dir/temp/combined_lats.scp - for i in `seq 1 $num_data_reps`; do - cat $lat_dir/temp2/lats.scp | sed -e "s/^/rev${i}_/" >> $lat_dir/temp/combined_lats.scp - done - sort -u $lat_dir/temp/combined_lats.scp > $lat_dir/temp/combined_lats_sorted.scp - - lattice-copy scp:$lat_dir/temp/combined_lats_sorted.scp "ark:|gzip -c >$lat_dir/lat.1.gz" || exit 1; - echo "1" > $lat_dir/num_jobs - - # copy other files from original lattice dir - for f in cmvn_opts final.mdl splice_opts tree; do - cp $original_lat_dir/$f $lat_dir/$f - done -fi - - -if [ $stage -le 14 ]; then - # Build a tree using our new topology. We know we have alignments for the - # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use - # those. - if [ -f $tree_dir/final.mdl ]; then - echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
- exit 1; - fi - steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ - --context-opts "--context-width=2 --central-position=1" \ - --leftmost-questions-truncate -1 \ - --cmd "$train_cmd" 4200 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir -fi - -if [ $stage -le 15 ]; then - mkdir -p $dir - - echo "$0: creating neural net configs"; - - steps/nnet3/tdnn/make_configs.py \ - --self-repair-scale-nonlinearity 0.00001 \ - --feat-dir data/$mic/${train_set}_sp_hires_comb \ - --ivector-dir $train_ivector_dir \ - --tree-dir $tree_dir \ - --relu-dim 450 \ - --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ - --use-presoftmax-prior-scale false \ - --xent-regularize 0.1 \ - --xent-separate-forward-affine true \ - --include-log-softmax false \ - --final-layer-normalize-target 1.0 \ - $dir/configs || exit 1; -fi - -if [ $stage -le 16 ]; then - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then - utils/create_split_dir.pl \ - /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-rvb$(date +'%m_%d_%H_%M')/s5b/$dir/egs/storage $dir/egs/storage - fi - - touch $dir/egs/.nodelete # keep egs around when that run dies. - - steps/nnet3/chain/train.py --stage $train_stage \ - --cmd "$decode_cmd" \ - --feat.online-ivector-dir $train_ivector_dir \ - --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ - --chain.xent-regularize 0.1 \ - --chain.leaky-hmm-coefficient 0.1 \ - --chain.l2-regularize 0.00005 \ - --chain.apply-deriv-weights false \ - --chain.lm-opts="--num-extra-lm-states=2000" \ - --egs.dir "$common_egs_dir" \ - --egs.opts "--frames-overlap-per-eg 0" \ - --egs.chunk-width 150 \ - --trainer.num-chunk-per-minibatch 128 \ - --trainer.frames-per-iter 1500000 \ - --trainer.num-epochs 4 \ - --trainer.optimization.num-jobs-initial 2 \ - --trainer.optimization.num-jobs-final 12 \ - --trainer.optimization.initial-effective-lrate 0.001 \ - --trainer.optimization.final-effective-lrate 0.0001 \ - --trainer.max-param-change 2.0 \ - --cleanup.remove-egs true \ - --feat-dir $train_data_dir \ - --tree-dir $tree_dir \ - --lat-dir $lat_dir \ - --dir $dir -fi - - -graph_dir=$dir/graph_${LM} -if [ $stage -le 17 ]; then - # Note: it might appear that this data/lang_chain directory is mismatched, and it is as - # far as the 'topo' is concerned, but this script doesn't read the 'topo' from - # the lang directory. 
- utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir -fi - -if [ $stage -le 18 ]; then - rm $dir/.error 2>/dev/null || true - for decode_set in dev eval; do - ( - steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --nj $nj --cmd "$decode_cmd" \ - --online-ivector-dir exp/$mic/nnet3${nnet3_affix}${rvb_affix}/ivectors_${decode_set}_hires \ - --scoring-opts "--min-lmwt 5 " \ - $graph_dir data/$mic/${decode_set}_hires $dir/decode_${decode_set} || exit 1; - ) || touch $dir/.error & - done - wait - if [ -f $dir/.error ]; then - echo "$0: something went wrong in decoding" - exit 1 - fi -fi -exit 0 diff --git a/egs/ami/s5b/local/chain/multi_condition/run_tdnn.sh b/egs/ami/s5b/local/chain/multi_condition/run_tdnn.sh new file mode 120000 index 00000000000..34499362831 --- /dev/null +++ b/egs/ami/s5b/local/chain/multi_condition/run_tdnn.sh @@ -0,0 +1 @@ +tuning/run_tdnn_1a.sh \ No newline at end of file diff --git a/egs/ami/s5b/local/chain/multi_condition/run_tdnn_lstm.sh b/egs/ami/s5b/local/chain/multi_condition/run_tdnn_lstm.sh index 8e647598556..a4fa11e0908 120000 --- a/egs/ami/s5b/local/chain/multi_condition/run_tdnn_lstm.sh +++ b/egs/ami/s5b/local/chain/multi_condition/run_tdnn_lstm.sh @@ -1 +1 @@ -tuning/run_tdnn_lstm_1a.sh \ No newline at end of file +tuning/run_tdnn_lstm_1b.sh \ No newline at end of file diff --git a/egs/ami/s5b/local/chain/multi_condition/tuning/run_tdnn_1a.sh b/egs/ami/s5b/local/chain/multi_condition/tuning/run_tdnn_1a.sh new file mode 100755 index 00000000000..4d260e3c517 --- /dev/null +++ b/egs/ami/s5b/local/chain/multi_condition/tuning/run_tdnn_1a.sh @@ -0,0 +1,334 @@ +#!/bin/bash + +# This script is based on swbd 7q TDNN-F recipe +# with resnet-style skip connections, more layers, +# skinnier bottlenecks, removing the 3-way splicing and skip-layer splicing, +# and re-tuning the learning rate and l2 regularize. The configs are +# standardized and substantially simplified. +# The advantage of this style of config is that it also works +# well on smaller datasets, and we adopt this style here also for consistency. +# This gives better results than TDNN+LSTM on AMI SDM. + +# local/chain/multi_condition/tuning/run_tdnn_1a.sh --mic ihm --train-set train_cleaned --gmm tri3_cleaned & +# local/chain/multi_condition/tuning/run_tdnn_1a.sh --mic sdm1 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned & +# local/chain/multi_condition/tuning/run_tdnn_1a.sh --mic mdm8 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned & + +# steps/info/chain_dir_info.pl exp/sdm1/chain_cleaned_rvb/tdnn1a_sp_rvb_bi_ihmali +# exp/sdm1/chain_cleaned_rvb/tdnn1a_sp_rvb_bi_ihmali: num-iters=193 nj=3..16 num-params=17.5M dim=40+100->3728 combine=-0.122->-0.121 (over 2) xent:train/valid[127,192,final]=(-2.03,-1.57,-1.58/-2.12,-1.71,-1.71) logprob:train/valid[127,192,final]=(-0.179,-0.121,-0.122/-0.198,-0.158,-0.157) + +# local/chain/compare_wer_general.sh sdm1 chain_cleaned_rvb tdnn_lstm1b_sp_rvb_bi_ihmali tdnn1a_sp_rvb_bi_ihmali +# System tdnn_lstm1b_sp_rvb_bi_ihmali tdnn1a_sp_rvb_bi_ihmali +# WER on dev 33.9 33.3 +# WER on eval 37.4 36.7 +# Final train prob -0.133611 -0.122155 +# Final valid prob -0.161014 -0.156612 +# Final train prob (xent) -1.9774 -1.57504 +# Final valid prob (xent) -2.09991 -1.705 + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). 
+stage=0 +mic=ihm +nj=30 +use_ihm_ali=false +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +ihm_gmm=tri3_cleaned # the gmm for the IHM system (if --use-ihm-ali true). +num_threads_ubm=32 +num_data_reps=1 +num_epochs=6 +get_egs_stage=-5 +remove_egs=false + +chunk_width=160,140,110,80 +dropout_schedule='0,0@0.20,0.5@0.50,0' # dropout schedule controls the dropout + # proportion for each training iteration. +xent_regularize=0.1 + +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_affix=1a #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + +# decode options +frames_per_chunk=160 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! $use_ihm_ali; then + [ "$mic" != "ihm" ] && \ + echo "$0: you cannot specify --use-ihm-ali false if the microphone is not ihm." && \ + exit 1; +else + [ "$mic" == "ihm" ] && \ + echo "$0: you must specify --use-ihm-ali false if the microphone is ihm." && \ + exit 1; +fi + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 13 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" \ + --generate-ali-from-lats true ${lores_train_data_dir} \ + data/lang $gmm_dir $original_lat_dir + rm $original_lat_dir/fsts.*.gz # save space + + lat_dir_ihmdata=exp/ihm/chain${nnet3_affix}/${gmm}_${train_set}_sp_lats + + original_lat_nj=$(cat $original_lat_dir/num_jobs) + ihm_lat_nj=$(cat $lat_dir_ihmdata/num_jobs) + + $train_cmd --max-jobs-run 10 JOB=1:$original_lat_nj $lat_dir/temp/log/copy_original_lats.JOB.log \ + lattice-copy "ark:gunzip -c $original_lat_dir/lat.JOB.gz |" ark,scp:$lat_dir/temp/lats.JOB.ark,$lat_dir/temp/lats.JOB.scp + + $train_cmd --max-jobs-run 10 JOB=1:$ihm_lat_nj $lat_dir/temp2/log/copy_ihm_lats.JOB.log \ + lattice-copy "ark:gunzip -c $lat_dir_ihmdata/lat.JOB.gz |" ark,scp:$lat_dir/temp2/lats.JOB.ark,$lat_dir/temp2/lats.JOB.scp + + for n in $(seq $original_lat_nj); do + cat $lat_dir/temp/lats.$n.scp + done > $lat_dir/temp/combined_lats.scp + + for i in `seq 1 $num_data_reps`; do + for n in $(seq $ihm_lat_nj); do + cat $lat_dir/temp2/lats.$n.scp + done | sed -e "s/^/rev${i}_/" + done >> $lat_dir/temp/combined_lats.scp + + sort -u $lat_dir/temp/combined_lats.scp > $lat_dir/temp/combined_lats_sorted.scp + + utils/split_data.sh $train_data_dir $nj + + $train_cmd --max-jobs-run 10 JOB=1:$nj $lat_dir/copy_combined_lats.JOB.log \ + lattice-copy --include=$train_data_dir/split$nj/JOB/utt2spk \ + scp:$lat_dir/temp/combined_lats_sorted.scp \ + "ark:|gzip -c >$lat_dir/lat.JOB.gz" || exit 1; + + echo $nj > $lat_dir/num_jobs + + # copy other files from original lattice dir + for f in cmvn_opts final.mdl splice_opts tree; do + cp $original_lat_dir/$f $lat_dir/$f + done +fi + + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 4200 ${lores_train_data_dir} data/lang_chain $original_lat_dir $tree_dir +fi + +if [ $stage -le 15 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + affine_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true" + tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.01" + output_opts="l2-regularize=0.002" + + mkdir -p $dir/configs + + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $affine_opts dim=1536 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf14 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf15 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + linear-component name=prefinal-l dim=256 $linear_opts + + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.00025 \ + --trainer.optimization.final-effective-lrate 0.000025 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + +graph_dir=$dir/graph_${LM} +if [ $stage -le 17 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + rm $dir/.error 2>/dev/null || true + + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + + for decode_set in dev eval; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$decode_cmd" \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/$mic/nnet3${nnet3_affix}${rvb_affix}/ivectors_${decode_set}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $graph_dir data/$mic/${decode_set}_hires $dir/decode_${decode_set} || exit 1; + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/ami/s5b/local/chain/multi_condition/tuning/run_tdnn_lstm_1a.sh b/egs/ami/s5b/local/chain/multi_condition/tuning/run_tdnn_lstm_1a.sh index 2869049843f..3546b6a7ced 100755 --- a/egs/ami/s5b/local/chain/multi_condition/tuning/run_tdnn_lstm_1a.sh +++ b/egs/ami/s5b/local/chain/multi_condition/tuning/run_tdnn_lstm_1a.sh @@ -19,7 +19,6 @@ set -e -o pipefail stage=0 mic=ihm nj=30 -min_seg_len=1.55 use_ihm_ali=false train_set=train_cleaned gmm=tri3_cleaned # the gmm for the target data @@ -27,7 +26,7 @@ ihm_gmm=tri3_cleaned # the gmm for the IHM system (if --use-ihm-ali true). num_threads_ubm=32 num_data_reps=1 -chunk_width=150 +chunk_width=160,140,110,80 chunk_left_context=40 chunk_right_context=0 label_delay=5 @@ -35,13 +34,13 @@ label_delay=5 # are just hardcoded at this level, in the commands below. train_stage=-10 tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. -tlstm_affix=1i #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +tlstm_affix=1a #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. common_egs_dir= # you can set this to use previously dumped egs. 
# decode options extra_left_context=50 -frames_per_chunk= +frames_per_chunk=160 # End configuration section. @@ -75,21 +74,19 @@ rvb_affix=_rvb if $use_ihm_ali; then gmm_dir=exp/ihm/${ihm_gmm} - ali_dir=exp/${mic}/${ihm_gmm}_ali_${train_set}_sp_comb_ihmdata - lores_train_data_dir=data/$mic/${train_set}_ihmdata_sp_comb + lores_train_data_dir=data/$mic/${train_set}_ihmdata_sp tree_dir=exp/$mic/chain${nnet3_affix}/tree_bi${tree_affix}_ihmdata - original_lat_dir=exp/$mic/chain${nnet3_affix}/${ihm_gmm}_${train_set}_sp_comb_lats_ihmdata - lat_dir=exp/$mic/chain${nnet3_affix}${rvb_affix}/${ihm_gmm}_${train_set}_sp${rvb_affix}_comb_lats_ihmdata + original_lat_dir=exp/$mic/chain${nnet3_affix}/${ihm_gmm}_${train_set}_sp_lats_ihmdata + lat_dir=exp/$mic/chain${nnet3_affix}${rvb_affix}/${ihm_gmm}_${train_set}_sp${rvb_affix}_lats_ihmdata dir=exp/$mic/chain${nnet3_affix}${rvb_affix}/tdnn_lstm${tlstm_affix}_sp${rvb_affix}_bi_ihmali # note: the distinction between when we use the 'ihmdata' suffix versus # 'ihmali' is pretty arbitrary. else gmm_dir=exp/${mic}/$gmm - ali_dir=exp/${mic}/${gmm}_ali_${train_set}_sp_comb - lores_train_data_dir=data/$mic/${train_set}_sp_comb + lores_train_data_dir=data/$mic/${train_set}_sp tree_dir=exp/$mic/chain${nnet3_affix}/tree_bi${tree_affix} - original_lat_dir=exp/$mic/chain${nnet3_affix}/${gmm}_${train_set}_sp_comb_lats - lat_dir=exp/$mic/chain${nnet3_affix}${rvb_affix}/${gmm}_${train_set}_sp${rvb_affix}_comb_lats + original_lat_dir=exp/$mic/chain${nnet3_affix}/${gmm}_${train_set}_sp_lats + lat_dir=exp/$mic/chain${nnet3_affix}${rvb_affix}/${gmm}_${train_set}_sp${rvb_affix}_lats dir=exp/$mic/chain${nnet3_affix}${rvb_affix}/tdnn_lstm${tlstm_affix}_sp${rvb_affix}_bi fi @@ -97,9 +94,7 @@ fi local/nnet3/multi_condition/run_ivector_common.sh --stage $stage \ --mic $mic \ --nj $nj \ - --min-seg-len $min_seg_len \ --train-set $train_set \ - --gmm $gmm \ --num-threads-ubm $num_threads_ubm \ --num-data-reps $num_data_reps \ --nnet3-affix "$nnet3_affix" @@ -109,13 +104,13 @@ local/nnet3/multi_condition/run_ivector_common.sh --stage $stage \ local/nnet3/prepare_lores_feats.sh --stage $stage \ --mic $mic \ --nj $nj \ - --min-seg-len $min_seg_len \ + --min-seg-len "" \ --use-ihm-ali $use_ihm_ali \ --train-set $train_set -train_data_dir=data/$mic/${train_set}_sp${rvb_affix}_hires_comb -train_ivector_dir=exp/$mic/nnet3${nnet3_affix}${rvb_affix}/ivectors_${train_set}_sp${rvb_affix}_hires_comb +train_data_dir=data/$mic/${train_set}_sp${rvb_affix}_hires +train_ivector_dir=exp/$mic/nnet3${nnet3_affix}${rvb_affix}/ivectors_${train_set}_sp${rvb_affix}_hires final_lm=`cat data/local/lm/final_lm` LM=$final_lm.pr1-7 @@ -126,19 +121,6 @@ for f in $gmm_dir/final.mdl $lores_train_data_dir/feats.scp \ done -if [ $stage -le 11 ]; then - if [ -f $ali_dir/ali.1.gz ]; then - echo "$0: alignments in $ali_dir appear to already exist. Please either remove them " - echo " ... or use a later --stage option." - exit 1 - fi - echo "$0: aligning perturbed, short-segment-combined ${maybe_ihm}data" - steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ - ${lores_train_data_dir} data/lang $gmm_dir $ali_dir -fi - -[ ! -f $ali_dir/ali.1.gz ] && echo "$0: expected $ali_dir/ali.1.gz to exist" && exit 1 - if [ $stage -le 12 ]; then echo "$0: creating lang directory with one state per phone." # Create a version of the lang/ directory that has one state per phone in the @@ -165,28 +147,42 @@ fi if [ $stage -le 13 ]; then # Get the alignments as lattices (gives the chain training more freedom). 
# use the same num-jobs as the alignments - steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" \ + --generate-ali-from-lats true ${lores_train_data_dir} \ data/lang $gmm_dir $original_lat_dir rm $original_lat_dir/fsts.*.gz # save space - lat_dir_ihmdata=exp/ihm/chain${nnet3_affix}/${gmm}_${train_set}_sp_comb_lats + lat_dir_ihmdata=exp/ihm/chain${nnet3_affix}/${gmm}_${train_set}_sp_lats + + original_lat_nj=$(cat $original_lat_dir/num_jobs) + ihm_lat_nj=$(cat $lat_dir_ihmdata/num_jobs) - mkdir -p $lat_dir/temp/ - mkdir -p $lat_dir/temp2/ - lattice-copy "ark:gunzip -c $original_lat_dir/lat.*.gz |" ark,scp:$lat_dir/temp/lats.ark,$lat_dir/temp/lats.scp - lattice-copy "ark:gunzip -c $lat_dir_ihmdata/lat.*.gz |" ark,scp:$lat_dir/temp2/lats.ark,$lat_dir/temp2/lats.scp + $train_cmd --max-jobs-run 10 JOB=1:$original_lat_nj $lat_dir/temp/log/copy_original_lats.JOB.log \ + lattice-copy "ark:gunzip -c $original_lat_dir/lat.JOB.gz |" ark,scp:$lat_dir/temp/lats.JOB.ark,$lat_dir/temp/lats.JOB.scp + + $train_cmd --max-jobs-run 10 JOB=1:$ihm_lat_nj $lat_dir/temp2/log/copy_ihm_lats.JOB.log \ + lattice-copy "ark:gunzip -c $lat_dir_ihmdata/lat.JOB.gz |" ark,scp:$lat_dir/temp2/lats.JOB.ark,$lat_dir/temp2/lats.JOB.scp + + for n in $(seq $original_lat_nj); do + cat $lat_dir/temp/lats.$n.scp + done > $lat_dir/temp/combined_lats.scp - # copy the lattices for the reverberated data - rm -f $lat_dir/temp/combined_lats.scp - touch $lat_dir/temp/combined_lats.scp - cat $lat_dir/temp/lats.scp >> $lat_dir/temp/combined_lats.scp for i in `seq 1 $num_data_reps`; do - cat $lat_dir/temp2/lats.scp | sed -e "s/^/rev${i}_/" >> $lat_dir/temp/combined_lats.scp - done + for n in $(seq $ihm_lat_nj); do + cat $lat_dir/temp2/lats.$n.scp + done | sed -e "s/^/rev${i}_/" + done >> $lat_dir/temp/combined_lats.scp + sort -u $lat_dir/temp/combined_lats.scp > $lat_dir/temp/combined_lats_sorted.scp - lattice-copy scp:$lat_dir/temp/combined_lats_sorted.scp "ark:|gzip -c >$lat_dir/lat.1.gz" || exit 1; - echo "1" > $lat_dir/num_jobs + utils/split_data.sh $train_data_dir $nj + + $train_cmd --max-jobs-run 10 JOB=1:$nj $lat_dir/copy_combined_lats.JOB.log \ + lattice-copy --include=$train_data_dir/split$nj/JOB/utt2spk \ + scp:$lat_dir/temp/combined_lats_sorted.scp \ + "ark:|gzip -c >$lat_dir/lat.JOB.gz" || exit 1; + + echo $nj > $lat_dir/num_jobs # copy other files from original lattice dir for f in cmvn_opts final.mdl splice_opts tree; do @@ -206,7 +202,7 @@ if [ $stage -le 14 ]; then steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ --context-opts "--context-width=2 --central-position=1" \ --leftmost-questions-truncate -1 \ - --cmd "$train_cmd" 4200 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir + --cmd "$train_cmd" 4200 ${lores_train_data_dir} data/lang_chain $original_lat_dir $tree_dir fi xent_regularize=0.1 @@ -215,7 +211,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig @@ -312,7 +308,6 @@ if [ $stage -le 18 ]; then rm $dir/.error 2>/dev/null || true [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; - [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; for decode_set in dev eval; do ( diff --git 
a/egs/ami/s5b/local/chain/multi_condition/tuning/run_tdnn_lstm_1b.sh b/egs/ami/s5b/local/chain/multi_condition/tuning/run_tdnn_lstm_1b.sh new file mode 100755 index 00000000000..1a839b045bd --- /dev/null +++ b/egs/ami/s5b/local/chain/multi_condition/tuning/run_tdnn_lstm_1b.sh @@ -0,0 +1,360 @@ +#!/bin/bash + +# This is a chain-training script with TDNN+LSTM neural networks. +# This script is similar to local/chain/multi_condition/tuning/run_tdnn_lstm_1a.sh, +# but updated to use new l2-regularize options and fast-lstmp with decay-time. +# It uses the reverberated IHM data in the train set. +# This script obtains better results on IHM, SDM and MDM tasks. + +# Please see RESULTS_* for examples of command lines invoking this script. + +# local/chain/multi_condition/tuning/run_tdnn_lstm_1b.sh --mic ihm --train-set train_cleaned --gmm tri3_cleaned & +# local/chain/multi_condition/tuning/run_tdnn_lstm_1b.sh --mic sdm1 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned & +# local/chain/multi_condition/tuning/run_tdnn_lstm_1b.sh --mic mdm8 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned & + +# steps/info/chain_dir_info.pl exp/ihm/chain_cleaned_rvb/tdnn_lstm1b_sp_rvb_bi +# exp/ihm/chain_cleaned_rvb/tdnn_lstm1b_sp_rvb_bi: num-iters=176 nj=2..12 num-params=43.4M dim=40+100->3736 combine=-0.101->-0.100 (over 2) xent:train/valid[116,175,final]=(-2.47,-1.60,-1.55/-2.58,-1.73,-1.69) logprob:train/valid[116,175,final]=(-0.144,-0.101,-0.099/-0.163,-0.138,-0.136) +# steps/info/chain_dir_info.pl exp/sdm1/chain_cleaned_rvb/tdnn_lstm1b_sp_rvb_bi_ihmali +# exp/sdm1/chain_cleaned_rvb/tdnn_lstm1b_sp_rvb_bi_ihmali: num-iters=174 nj=2..12 num-params=43.4M dim=40+100->3728 combine=-0.129->-0.126 (over 4) xent:train/valid[115,173,final]=(-2.86,-1.97,-1.98/-2.96,-2.10,-2.10) logprob:train/valid[115,173,final]=(-0.184,-0.134,-0.134/-0.200,-0.164,-0.161) + +# local/chain/compare_wer_general.sh ihm chain_cleaned_rvb tdnn_lstm1{a,b}_sp_rvb_bi +# System tdnn_lstm1a_sp_rvb_bi tdnn_lstm1b_sp_rvb_bi +# WER on dev 19.4 18.9 +# WER on eval 19.4 19.3 +# Final train prob -0.0627414-0.0985175 +# Final valid prob -0.141082 -0.136302 +# Final train prob (xent) -0.847054 -1.55263 +# Final valid prob (xent) -1.25849 -1.69064 + +# local/chain/compare_wer_general.sh sdm1 chain_cleaned_rvb tdnn_lstm1{a,b}_sp_rvb_bi_ihmali +# System tdnn_lstm1a_sp_rvb_bi_ihmali tdnn_lstm1b_sp_rvb_bi_ihmali +# WER on dev 34.6 33.9 +# WER on eval 37.6 37.4 +# Final train prob -0.0861836 -0.133611 +# Final valid prob -0.149669 -0.161014 +# Final train prob (xent) -1.21927 -1.9774 +# Final valid prob (xent) -1.53542 -2.09991 + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +mic=ihm +nj=30 +use_ihm_ali=false +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +ihm_gmm=tri3_cleaned # the gmm for the IHM system (if --use-ihm-ali true). +num_threads_ubm=32 +num_data_reps=1 +num_epochs=4 + +chunk_width=160,140,110,80 +chunk_left_context=40 +chunk_right_context=0 +label_delay=5 +dropout_schedule='0,0@0.20,0.3@0.50,0' # dropout schedule controls the dropout + # proportion for each training iteration. +xent_regularize=0.025 + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. 
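+# The dropout_schedule above is, as far as I understand the format, a list of
+# dropout proportions, each optionally tagged with the fraction of training at
+# which it applies ("proportion@fraction"), with linear interpolation in
+# between. So '0,0@0.20,0.3@0.50,0' keeps dropout at 0 for the first 20% of
+# training, ramps it to 0.3 at the halfway point, and anneals it back to 0 by
+# the end; e.g. at 35% of training the proportion would be about 0.15.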
+tlstm_affix=1b #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + +# decode options +extra_left_context=50 +frames_per_chunk=160 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! $use_ihm_ali; then + [ "$mic" != "ihm" ] && \ + echo "$0: you cannot specify --use-ihm-ali false if the microphone is not ihm." && \ + exit 1; +else + [ "$mic" == "ihm" ] && \ + echo "$0: you must specify --use-ihm-ali false if the microphone is ihm." && \ + exit 1; +fi + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 13 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" \ + --generate-ali-from-lats true ${lores_train_data_dir} \ + data/lang $gmm_dir $original_lat_dir + rm $original_lat_dir/fsts.*.gz # save space + + lat_dir_ihmdata=exp/ihm/chain${nnet3_affix}/${gmm}_${train_set}_sp_lats + + original_lat_nj=$(cat $original_lat_dir/num_jobs) + ihm_lat_nj=$(cat $lat_dir_ihmdata/num_jobs) + + $train_cmd --max-jobs-run 10 JOB=1:$original_lat_nj $lat_dir/temp/log/copy_original_lats.JOB.log \ + lattice-copy "ark:gunzip -c $original_lat_dir/lat.JOB.gz |" ark,scp:$lat_dir/temp/lats.JOB.ark,$lat_dir/temp/lats.JOB.scp + + $train_cmd --max-jobs-run 10 JOB=1:$ihm_lat_nj $lat_dir/temp2/log/copy_ihm_lats.JOB.log \ + lattice-copy "ark:gunzip -c $lat_dir_ihmdata/lat.JOB.gz |" ark,scp:$lat_dir/temp2/lats.JOB.ark,$lat_dir/temp2/lats.JOB.scp + + for n in $(seq $original_lat_nj); do + cat $lat_dir/temp/lats.$n.scp + done > $lat_dir/temp/combined_lats.scp + + for i in `seq 1 $num_data_reps`; do + for n in $(seq $ihm_lat_nj); do + cat $lat_dir/temp2/lats.$n.scp + done | sed -e "s/^/rev${i}_/" + done >> $lat_dir/temp/combined_lats.scp + + sort -u $lat_dir/temp/combined_lats.scp > $lat_dir/temp/combined_lats_sorted.scp + + utils/split_data.sh $train_data_dir $nj + + $train_cmd --max-jobs-run 10 JOB=1:$nj $lat_dir/copy_combined_lats.JOB.log \ + lattice-copy --include=$train_data_dir/split$nj/JOB/utt2spk \ + scp:$lat_dir/temp/combined_lats_sorted.scp \ + "ark:|gzip -c >$lat_dir/lat.JOB.gz" || exit 1; + + echo $nj > $lat_dir/num_jobs + + # copy other files from original lattice dir + for f in cmvn_opts final.mdl splice_opts tree; do + cp $original_lat_dir/$f $lat_dir/$f + done +fi + + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4200 ${lores_train_data_dir} data/lang_chain $original_lat_dir $tree_dir +fi + +if [ $stage -le 15 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + tdnn_opts="l2-regularize=0.006" + lstm_opts="l2-regularize=0.0025 decay-time=20 dropout-proportion=0.0" + output_opts="l2-regularize=0.001" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=1024 $tdnn_opts + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 $tdnn_opts + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 $tdnn_opts + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + fast-lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 $tdnn_opts + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 $tdnn_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 $tdnn_opts + fast-lstmp-layer name=lstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 $tdnn_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=1024 $tdnn_opts + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=1024 $tdnn_opts + fast-lstmp-layer name=lstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts + +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5b/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs false \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + +graph_dir=$dir/graph_${LM} +if [ $stage -le 17 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + rm $dir/.error 2>/dev/null || true + + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + + for decode_set in dev eval; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$decode_cmd" \ + --extra-left-context $extra_left_context \ + --frames-per-chunk "$frames_per_chunk" \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --online-ivector-dir exp/$mic/nnet3${nnet3_affix}${rvb_affix}/ivectors_${decode_set}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $graph_dir data/$mic/${decode_set}_hires $dir/decode_${decode_set} || exit 1; + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/ami/s5b/local/chain/run_tdnn.sh b/egs/ami/s5b/local/chain/run_tdnn.sh index deb68d515d2..05a7c2d345b 120000 --- a/egs/ami/s5b/local/chain/run_tdnn.sh +++ b/egs/ami/s5b/local/chain/run_tdnn.sh @@ -1 +1 @@ -tuning/run_tdnn_1i.sh \ No newline at end of file +tuning/run_tdnn_1j.sh \ No newline at end of file diff --git a/egs/ami/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1a.sh b/egs/ami/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1a.sh index 16d1f4044f5..d926c1dc6d7 100644 --- a/egs/ami/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1a.sh +++ b/egs/ami/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1a.sh @@ -184,7 +184,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=20" diff --git 
a/egs/ami/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1b.sh b/egs/ami/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1b.sh index 83e6a95582f..d9cd1c356e8 100644 --- a/egs/ami/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1b.sh +++ b/egs/ami/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1b.sh @@ -176,7 +176,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=20 dropout-proportion=0" diff --git a/egs/ami/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1c.sh b/egs/ami/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1c.sh index 387b4bfcc88..a0805b4f9f1 100755 --- a/egs/ami/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1c.sh +++ b/egs/ami/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1c.sh @@ -185,7 +185,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=40" diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_1b.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_1b.sh index 57108dbddae..997357b80a9 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_1b.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_1b.sh @@ -164,7 +164,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_1c.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_1c.sh index f87e1a12d36..4d062e65429 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_1c.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_1c.sh @@ -151,7 +151,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_1d.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_1d.sh index eb84a1cd876..387570388d0 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_1d.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_1d.sh @@ -163,7 +163,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_1e.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_1e.sh index e6592b667dc..0436b08cdc0 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_1e.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_1e.sh @@ -161,7 +161,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info 
$tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_1f.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_1f.sh index 8bf2b73dada..4ca526d63b8 100644 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_1f.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_1f.sh @@ -165,7 +165,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_1g.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_1g.sh index dfb6dfedee7..baed760bb68 100644 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_1g.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_1g.sh @@ -166,7 +166,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_1h.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_1h.sh index 3e26a8b38bd..e721a858c0a 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_1h.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_1h.sh @@ -167,7 +167,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_1i.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_1i.sh index 1931127c86d..de40cb2d1a4 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_1i.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_1i.sh @@ -168,7 +168,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) opts="l2-regularize=0.02" output_opts="l2-regularize=0.004" diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_1j.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_1j.sh new file mode 100755 index 00000000000..80b2aee60e9 --- /dev/null +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_1j.sh @@ -0,0 +1,281 @@ +#!/bin/bash + +# 1j is same as swbd 7q. It uses modified topology with resnet-style skip connections, more layers, +# skinnier bottlenecks. 
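+# Back-of-the-envelope intuition for the "skinnier bottlenecks" mentioned
+# above (a rough sketch only; the actual tdnnf-layer splicing and orthonormal
+# constraints differ in detail): a full-rank 2136->2136 layer over a 3-frame
+# context would need about 2136*3*2136 ~= 13.7M weights, whereas factorizing
+# it through the 210-dim bottleneck used in the xconfig further down needs
+# roughly 2136*3*210 + 210*2136 ~= 1.8M, e.g.
+#   python -c "print(2136*3*2136, 2136*3*210 + 210*2136)"   # 13687488 1794240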
+ +# local/chain/tuning/run_tdnn_1j.sh --mic sdm1 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned + +# local/chain/compare_wer_general.sh sdm1 tdnn1i_sp_bi_ihmali tdnn1j_sp_bi_ihmali +# System tdnn1i_sp_bi_ihmali tdnn1i_sp_bi_ihmali +# WER on dev 36.6 31.7 +# WER on eval 40.6 35.1 +# Final train prob -0.196231 -0.114088 +# Final valid prob -0.265572 -0.214282 +# Final train prob (xent) -2.48061 -1.37987 +# Final valid prob (xent) -2.71794 -1.8639 + +# steps/info/chain_dir_info.pl exp/sdm1/chain_cleaned/tdnn1j_sp_bi_ihmali +# exp/sdm1/chain_cleaned/tdnn1j_sp_bi_ihmali: num-iters=327 nj=2..12 num-params=34.3M dim=80+100->3728 combine=-0.126->-0.124 (over 4) xent:train/valid[217,326,final]=(-1.69,-1.43,-1.38/-2.06,-1.93,-1.86) logprob:train/valid[217,326,final]=(-0.143,-0.120,-0.114/-0.226,-0.218,-0.214) + +set -e -o pipefail +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +mic=ihm +nj=30 +min_seg_len=1.55 +use_ihm_ali=false +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +ihm_gmm=tri3 # the gmm for the IHM system (if --use-ihm-ali true). +num_threads_ubm=32 +ivector_transform_type=pca +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +num_epochs=15 +remove_egs=true + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_affix=1j #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. +dropout_schedule='0,0@0.20,0.5@0.50,0' + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 13 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4200 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + +xent_regularize=0.1 + +if [ $stage -le 15 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + affine_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true" + tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.01" + output_opts="l2-regularize=0.002" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=80 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $affine_opts dim=2136 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3 + tdnnf-layer name=tdnnf14 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3 + tdnnf-layer name=tdnnf15 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3 + linear-component name=prefinal-l dim=512 $linear_opts + + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=2136 small-dim=512 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=2136 small-dim=512 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts + +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5b/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch 32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --cleanup.preserve-model-interval 50 \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + +graph_dir=$dir/graph_${LM} +if [ $stage -le 17 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + rm $dir/.error 2>/dev/null || true + for decode_set in dev eval; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$decode_cmd" \ + --online-ivector-dir exp/$mic/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $graph_dir data/$mic/${decode_set}_hires $dir/decode_${decode_set} || exit 1; + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh index d63712f1f0f..4f580b88f6b 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh @@ -171,7 +171,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1b.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1b.sh index a53785f45c2..904a079d7de 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1b.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1b.sh @@ -173,7 +173,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1c.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1c.sh index 
76a9f735c5f..511e520465a 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1c.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1c.sh @@ -172,7 +172,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1d.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1d.sh index 8cc1a4e15fa..bd81b7df4eb 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1d.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1d.sh @@ -172,7 +172,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1e.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1e.sh index accfd158a9d..50903e78b6d 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1e.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1e.sh @@ -174,7 +174,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1f.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1f.sh index 2b275e4e27d..f6c53001498 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1f.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1f.sh @@ -173,7 +173,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1g.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1g.sh index 1c90af38c4c..79fd9ef3fb5 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1g.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1g.sh @@ -174,7 +174,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1h.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1h.sh index fb4b6a475e2..e58a7f89e03 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1h.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1h.sh @@ -171,7 +171,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep 
num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh index 92636b4c17e..13f894f5a48 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh @@ -174,7 +174,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1j.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1j.sh index 89fd8ce2915..48b31832e8c 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1j.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1j.sh @@ -181,7 +181,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=20" diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1k.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1k.sh index b8d947d8e92..e675bc494bb 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1k.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1k.sh @@ -177,7 +177,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=20" diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1l.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1l.sh index 74c0f5a6ead..2d019398274 100644 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1l.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1l.sh @@ -224,7 +224,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1m.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1m.sh index b0e7af0618d..9e5b971bbe2 100644 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1m.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1m.sh @@ -226,7 +226,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=20 dropout-proportion=0.0" diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1n.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1n.sh index bee4d997b01..9575c3cf686 
100644 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1n.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1n.sh @@ -178,7 +178,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1o.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1o.sh index 1e4111adc6a..a7f2625c181 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1o.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1o.sh @@ -182,7 +182,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) tdnn_opts="l2-regularize=0.025" lstm_opts="l2-regularize=0.01" output_opts="l2-regularize=0.004" diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_bs_1a.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_bs_1a.sh index b672a44e572..ca920869b30 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_bs_1a.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_bs_1a.sh @@ -180,7 +180,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) tdnn_opts="l2-regularize=0.003" lstm_opts="l2-regularize=0.005" output_opts="l2-regularize=0.001" diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_opgru_1a.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_opgru_1a.sh index f68c4203767..53dbd5238db 100644 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_opgru_1a.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_opgru_1a.sh @@ -178,7 +178,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) gru_opts="dropout-per-frame=true dropout-proportion=0.0" mkdir -p $dir/configs diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_opgru_1b.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_opgru_1b.sh index ac4266ca162..dafef668e60 100644 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_opgru_1b.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_opgru_1b.sh @@ -177,7 +177,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) gru_opts="dropout-per-frame=true dropout-proportion=0.0" mkdir -p $dir/configs diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_opgru_1c.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_opgru_1c.sh index 74b21f10c33..677946d0b9a 100644 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_opgru_1c.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_opgru_1c.sh @@ -176,7 +176,7 @@ if [ $stage 
-le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) gru_opts="dropout-per-frame=true dropout-proportion=0.0" mkdir -p $dir/configs diff --git a/egs/ami/s5b/local/nnet3/multi_condition/run_ivector_common.sh b/egs/ami/s5b/local/nnet3/multi_condition/run_ivector_common.sh index eb20415e515..5ba35fa421c 100755 --- a/egs/ami/s5b/local/nnet3/multi_condition/run_ivector_common.sh +++ b/egs/ami/s5b/local/nnet3/multi_condition/run_ivector_common.sh @@ -10,19 +10,17 @@ set -e -o pipefail stage=1 mic=ihm nj=30 -min_seg_len=1.55 # min length in seconds... we do this because chain training - # will discard segments shorter than 1.5 seconds. Must remain in sync with - # the same option given to prepare_lores_feats.sh. train_set=train_cleaned # you might set this to e.g. train_cleaned. -gmm=tri3_cleaned # This specifies a GMM-dir from the features of the type you're training the system on; - # it should contain alignments for 'train_set'. - +norvb_datadir=data/ihm/train_cleaned_sp num_threads_ubm=32 rvb_affix=_rvb nnet3_affix=_cleaned # affix for exp/$mic/nnet3 directory to put iVector stuff in, so it # becomes exp/$mic/nnet3_cleaned or whatever. num_data_reps=1 +sample_rate=16000 + +max_jobs_run=10 . ./cmd.sh . ./path.sh @@ -30,10 +28,7 @@ num_data_reps=1 nnet3_affix=${nnet3_affix}$rvb_affix -gmmdir=exp/${mic}/${gmm} - - -for f in data/${mic}/${train_set}/feats.scp ${gmmdir}/final.mdl; do +for f in data/${mic}/${train_set}/feats.scp; do if [ ! -f $f ]; then echo "$0: expected file $f to exist" exit 1 @@ -73,36 +68,23 @@ if [ $stage -le 1 ]; then for datadir in ${train_set}_sp dev eval; do steps/make_mfcc.sh --nj $nj --mfcc-config conf/mfcc_hires.conf \ - --cmd "$train_cmd" data/$mic/${datadir}_hires + --cmd "$train_cmd --max-jobs-run $max_jobs_run" data/$mic/${datadir}_hires steps/compute_cmvn_stats.sh data/$mic/${datadir}_hires utils/fix_data_dir.sh data/$mic/${datadir}_hires done fi -if [ $stage -le 2 ]; then - echo "$0: combining short segments of speed-perturbed high-resolution MFCC training data" - # we have to combine short segments or we won't be able to train chain models - # on those segments. - utils/data/combine_short_segments.sh \ - data/${mic}/${train_set}_sp_hires $min_seg_len data/${mic}/${train_set}_sp_hires_comb - - # just copy over the CMVN to avoid having to recompute it. - cp data/${mic}/${train_set}_sp_hires/cmvn.scp data/${mic}/${train_set}_sp_hires_comb/ - utils/fix_data_dir.sh data/${mic}/${train_set}_sp_hires_comb/ -fi if [ $stage -le 3 ]; then echo "$0: creating reverberated MFCC features" - datadir=data/ihm/train_cleaned_sp - - mfccdir=${datadir}_rvb${num_data_reps}_hires/data + mfccdir=${norvb_datadir}${rvb_affix}${num_data_reps}_hires/data if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then utils/create_split_dir.pl /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$mic-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage fi - if [ ! -f ${datadir}_rvb${num_data_reps}_hires/feats.scp ]; then - if [ ! -d "RIRS_NOISES" ]; then + if [ ! -f ${norvb_datadir}${rvb_affix}${num_data_reps}_hires/feats.scp ]; then + if [ ! 
-d "RIRS_NOISES/" ]; then # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip unzip rirs_noises.zip @@ -123,60 +105,29 @@ if [ $stage -le 3 ]; then --isotropic-noise-addition-probability 1 \ --num-replications ${num_data_reps} \ --max-noises-per-minute 1 \ - --source-sampling-rate 16000 \ - ${datadir} ${datadir}_rvb${num_data_reps} + --source-sampling-rate $sample_rate \ + ${norvb_datadir} ${norvb_datadir}${rvb_affix}${num_data_reps} - utils/copy_data_dir.sh ${datadir}_rvb${num_data_reps} ${datadir}_rvb${num_data_reps}_hires - utils/data/perturb_data_dir_volume.sh ${datadir}_rvb${num_data_reps}_hires + utils/copy_data_dir.sh ${norvb_datadir}${rvb_affix}${num_data_reps} ${norvb_datadir}${rvb_affix}${num_data_reps}_hires + utils/data/perturb_data_dir_volume.sh ${norvb_datadir}${rvb_affix}${num_data_reps}_hires steps/make_mfcc.sh --nj $nj --mfcc-config conf/mfcc_hires.conf \ - --cmd "$train_cmd" ${datadir}_rvb${num_data_reps}_hires - steps/compute_cmvn_stats.sh ${datadir}_rvb${num_data_reps}_hires - utils/fix_data_dir.sh ${datadir}_rvb${num_data_reps}_hires - - utils/data/combine_short_segments.sh \ - ${datadir}_rvb${num_data_reps}_hires $min_seg_len ${datadir}_rvb${num_data_reps}_hires_comb - - # just copy over the CMVN to avoid having to recompute it. - cp ${datadir}_rvb${num_data_reps}_hires/cmvn.scp ${datadir}_rvb${num_data_reps}_hires_comb/ - utils/fix_data_dir.sh ${datadir}_rvb${num_data_reps}_hires_comb/ + --cmd "$train_cmd --max-jobs-run $max_jobs_run" ${norvb_datadir}${rvb_affix}${num_data_reps}_hires + steps/compute_cmvn_stats.sh ${norvb_datadir}${rvb_affix}${num_data_reps}_hires + utils/fix_data_dir.sh ${norvb_datadir}${rvb_affix}${num_data_reps}_hires fi - utils/combine_data.sh data/${mic}/${train_set}_sp_rvb_hires data/${mic}/${train_set}_sp_hires ${datadir}_rvb${num_data_reps}_hires - utils/combine_data.sh data/${mic}/${train_set}_sp_rvb_hires_comb data/${mic}/${train_set}_sp_hires_comb ${datadir}_rvb${num_data_reps}_hires_comb + utils/combine_data.sh data/${mic}/${train_set}_sp${rvb_affix}_hires data/${mic}/${train_set}_sp_hires ${norvb_datadir}${rvb_affix}${num_data_reps}_hires fi - if [ $stage -le 4 ]; then - echo "$0: selecting segments of hires training data that were also present in the" - echo " ... original training data." - - # note, these data-dirs are temporary; we put them in a sub-directory - # of the place where we'll make the alignments. - temp_data_root=exp/$mic/nnet3${nnet3_affix}/tri5 - mkdir -p $temp_data_root - - utils/data/subset_data_dir.sh --utt-list data/${mic}/${train_set}/feats.scp \ - data/${mic}/${train_set}_sp_hires $temp_data_root/${train_set}_hires - - # note: essentially all the original segments should be in the hires data. - n1=$(wc -l $tmpdir/ihmutt2utt # Map the 1st field of the segments file from the ihm data (the 1st field being # the utterance-id) to the corresponding SDM or MDM utterance-id. The other # fields remain the same (e.g. we want the recording-ids from the IHM data). 
-utils/apply_map.pl -f 1 $tmpdir/ihmutt2utt data/$mic/train_ihmdata/segments +utils/apply_map.pl -f 1 $tmpdir/ihmutt2utt data/$mic/${train_set}_ihmdata/segments -utils/fix_data_dir.sh data/$mic/train_ihmdata +utils/fix_data_dir.sh data/$mic/${train_set}_ihmdata rm $tmpdir/ihmutt2utt diff --git a/egs/an4/s5/local/data_prep.py b/egs/an4/s5/local/data_prep.py index 24cb9bffb07..9d8083f3b60 100644 --- a/egs/an4/s5/local/data_prep.py +++ b/egs/an4/s5/local/data_prep.py @@ -15,6 +15,7 @@ # See the Apache 2 License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function import os import re import sys diff --git a/egs/an4/s5/local/lexicon_prep.py b/egs/an4/s5/local/lexicon_prep.py index 8d451daf869..3584fa86dfb 100644 --- a/egs/an4/s5/local/lexicon_prep.py +++ b/egs/an4/s5/local/lexicon_prep.py @@ -15,6 +15,7 @@ # See the Apache 2 License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function import os import re import sys diff --git a/egs/aspire/s5/local/chain/tuning/run_blstm_7b.sh b/egs/aspire/s5/local/chain/tuning/run_blstm_7b.sh index 8ff59d83ed0..bd13010c791 100755 --- a/egs/aspire/s5/local/chain/tuning/run_blstm_7b.sh +++ b/egs/aspire/s5/local/chain/tuning/run_blstm_7b.sh @@ -138,7 +138,7 @@ if [ $stage -le 11 ]; then num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=20" diff --git a/egs/aspire/s5/local/chain/tuning/run_blstm_asp_1.sh b/egs/aspire/s5/local/chain/tuning/run_blstm_asp_1.sh index 0ca6062e9c8..b5979a3ce6b 100755 --- a/egs/aspire/s5/local/chain/tuning/run_blstm_asp_1.sh +++ b/egs/aspire/s5/local/chain/tuning/run_blstm_asp_1.sh @@ -208,7 +208,7 @@ if [ $stage -le 14 ]; then extra_right_context=$[$chunk_right_context+10] # %WER 26.8 | 2120 27220 | 80.2 11.7 8.1 7.0 26.8 76.5 | -0.804 | exp/chain/blstm_asp_1/decode_dev_aspire_whole_uniformsegmented_win10_over5_v7_iterfinal_pp_fg/score_9/penalty_0.0/ - local/nnet3/prep_test_aspire.sh --stage 4 --decode-num-jobs 30 --affix "v7" \ + local/multi_condition/prep_test_aspire.sh --stage 4 --decode-num-jobs 30 --affix "v7" \ --extra-left-context $extra_left_context \ --extra-right-context $extra_right_context \ --frames-per-chunk $chunk_width \ diff --git a/egs/aspire/s5/local/chain/tuning/run_tdnn_7b.sh b/egs/aspire/s5/local/chain/tuning/run_tdnn_7b.sh index 201f61dc64b..af12e323e76 100755 --- a/egs/aspire/s5/local/chain/tuning/run_tdnn_7b.sh +++ b/egs/aspire/s5/local/chain/tuning/run_tdnn_7b.sh @@ -44,7 +44,7 @@ lang=data/lang_chain # The iVector-extraction and feature-dumping parts are the same as the standard # nnet3 setup, and you can skip them by setting "--stage 8" if you have already # run those things. 
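# For instance (an illustrative invocation; this assumes --stage and
# --num-data-reps are exposed via utils/parse_options.sh as in other recipes),
# a rerun that reuses previously dumped features and iVectors could be:
#   local/chain/tuning/run_tdnn_7b.sh --stage 8 --num-data-reps 3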
-local/nnet3/run_ivector_common.sh --stage $stage --num-data-reps 3|| exit 1; +local/nnet3/run_ivector_common.sh --stage $stage --num-data-reps ${num_data_reps} || exit 1; if [ $stage -le 7 ]; then # Create a version of the lang/ directory that has one state per phone in the @@ -92,8 +92,8 @@ if [ $stage -le 9 ]; then # combine the non-hires features for alignments/lattices rm -rf data/${latgen_train_set}_min${min_seg_len} - utt_prefix="THISISUNIQUESTRING_" - spk_prefix="THISISUNIQUESTRING_" + utt_prefix="THISISUNIQUESTRING-" + spk_prefix="THISISUNIQUESTRING-" utils/copy_data_dir.sh --spk-prefix "$spk_prefix" --utt-prefix "$utt_prefix" \ data/train data/train_temp_for_lats utils/data/combine_short_segments.sh \ @@ -136,7 +136,7 @@ if [ $stage -le 11 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig @@ -182,6 +182,7 @@ if [ $stage -le 12 ]; then /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage fi + mkdir -p $dir/egs touch $dir/egs/.nodelete # keep egs around when that run dies. steps/nnet3/chain/train.py --stage $train_stage \ diff --git a/egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_1a.sh index 63d3a7ca988..f98dff5e6fa 100755 --- a/egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_1a.sh +++ b/egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_1a.sh @@ -26,7 +26,6 @@ cell_dim=1024 projection_dim=256 # training options -num_epochs=2 minibatch_size=64,32 chunk_left_context=40 chunk_right_context=0 @@ -95,7 +94,7 @@ if [ $stage -le 8 ]; then for n in `seq $nj`; do awk '{print $1}' data/${train_set}/split$nj/$n/utt2spk | \ - perl -ane 's/rev[1-3]_//g' > $lat_dir/uttlist.$n.$nj + perl -ane 's/rev[1-3]-//g' > $lat_dir/uttlist.$n.$nj done rm -f $lat_dir/lat_tmp.*.{ark,scp} 2>/dev/null @@ -106,7 +105,7 @@ if [ $stage -le 8 ]; then ark,scp:$lat_dir/lat_tmp.JOB.ark,$lat_dir/lat_tmp.JOB.scp || exit 1 for n in `seq 3`; do - cat $lat_dir/lat_tmp.*.scp | awk -v n=$n '{print "rev"n"_"$1" "$2}' + cat $lat_dir/lat_tmp.*.scp | awk -v n=$n '{print "rev"n"-"$1" "$2}' done > $lat_dir/lat_rvb.scp $train_cmd JOB=1:$nj $lat_dir/log/copy_rvb_lattices.JOB.log \ @@ -151,7 +150,7 @@ if [ $stage -le 12 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=40" @@ -309,4 +308,3 @@ if [ $stage -le 17 ]; then fi exit 0; - diff --git a/egs/aspire/s5/local/generate_uniformly_segmented_data_dir.sh b/egs/aspire/s5/local/generate_uniformly_segmented_data_dir.sh index f4366fef679..2ceb4a4cf05 100755 --- a/egs/aspire/s5/local/generate_uniformly_segmented_data_dir.sh +++ b/egs/aspire/s5/local/generate_uniformly_segmented_data_dir.sh @@ -63,7 +63,8 @@ if [ $stage -le 3 ]; then [ -d data/${segmented_data_set}_hires ] && rm -r data/${segmented_data_set}_hires if [ ! 
-f data/${data_set}_hires/segments ]; then utils/data/get_segments_for_data.sh data/${data_set}_hires > \ - data/${data_set}_hires/segments + data/${data_set}_hires/segments.tmp + mv data/${data_set}_hires/segments.tmp data/${data_set}_hires/segments fi mkdir -p data/${segmented_data_set}_hires diff --git a/egs/aspire/s5/local/multi_condition/create_uniform_segments.py b/egs/aspire/s5/local/multi_condition/create_uniform_segments.py index e7baafc028c..010811490ef 100755 --- a/egs/aspire/s5/local/multi_condition/create_uniform_segments.py +++ b/egs/aspire/s5/local/multi_condition/create_uniform_segments.py @@ -4,13 +4,14 @@ # creates a segments file in the provided data directory # into uniform segments with specified window and overlap +from __future__ import division import imp, sys, argparse, os, math, subprocess min_segment_length = 10 # in seconds def segment(total_length, window_length, overlap = 0): increment = window_length - overlap num_windows = int(math.ceil(float(total_length)/increment)) - segments = map(lambda x: (x * increment, min( total_length, (x * increment) + window_length)), range(0, num_windows)) + segments = [(x * increment, min( total_length, (x * increment) + window_length)) for x in range(0, num_windows)] if segments[-1][1] - segments[-1][0] < min_segment_length: segments[-2] = (segments[-2][0], segments[-1][1]) segments.pop() @@ -53,7 +54,7 @@ def prepare_segments_file(kaldi_data_dir, window_length, overlap): parser = argparse.ArgumentParser() parser.add_argument('--window-length', type = float, default = 30.0, help = 'length of the window used to cut the segment') parser.add_argument('--overlap', type = float, default = 5.0, help = 'overlap of neighboring windows') - parser.add_argument('data_dir', type=str, help='directory such as data/train') + parser.add_argument('data_dir', help='directory such as data/train') params = parser.parse_args() diff --git a/egs/aspire/s5/local/multi_condition/fill_missing_recordings.py b/egs/aspire/s5/local/multi_condition/fill_missing_recordings.py index e249e54e5f6..2b4bcddda69 100755 --- a/egs/aspire/s5/local/multi_condition/fill_missing_recordings.py +++ b/egs/aspire/s5/local/multi_condition/fill_missing_recordings.py @@ -38,14 +38,14 @@ def fill_ctm(input_ctm_file, output_ctm_file, recording_names): sys.stderr.write(str(" ".join(sys.argv))) parser = argparse.ArgumentParser(usage) - parser.add_argument('input_ctm_file', type=str, help='ctm file for the recordings') - parser.add_argument('output_ctm_file', type=str, help='ctm file for the recordings') - parser.add_argument('recording_name_file', type=str, help='file with names of the recordings') + parser.add_argument('input_ctm_file', help='ctm file for the recordings') + parser.add_argument('output_ctm_file', help='ctm file for the recordings') + parser.add_argument('recording_name_file', help='file with names of the recordings') params = parser.parse_args() try: - file_names = map(lambda x: x.strip(), open("{0}".format(params.recording_name_file)).readlines()) + file_names = [x.strip() for x in open("{0}".format(params.recording_name_file)).readlines()] except IOError: raise Exception("Expected to find {0}".format(params.recording_name_file)) diff --git a/egs/aspire/s5/local/multi_condition/get_air_file_patterns.py b/egs/aspire/s5/local/multi_condition/get_air_file_patterns.py index cc06f58616a..1f06d3e7c3b 100755 --- a/egs/aspire/s5/local/multi_condition/get_air_file_patterns.py +++ b/egs/aspire/s5/local/multi_condition/get_air_file_patterns.py @@ -3,6 +3,7 @@ # script to 
generate the file_patterns of the AIR database # see load_air.m file in AIR db to understand the naming convention +from __future__ import print_function import sys, glob, re, os.path air_dir = sys.argv[1] @@ -45,4 +46,4 @@ file_patterns.append(file_pattern+" "+output_file_name) file_patterns = list(set(file_patterns)) file_patterns.sort() -print "\n".join(file_patterns) +print("\n".join(file_patterns)) diff --git a/egs/aspire/s5/local/multi_condition/get_ctm_conf.sh b/egs/aspire/s5/local/multi_condition/get_ctm_conf.sh deleted file mode 100755 index 23f3bcb8378..00000000000 --- a/egs/aspire/s5/local/multi_condition/get_ctm_conf.sh +++ /dev/null @@ -1,97 +0,0 @@ -#!/bin/bash -# Copyright Johns Hopkins University (Author: Daniel Povey) 2012. Apache 2.0. - -# This script produces CTM files from a decoding directory that has lattices -# present. This version gives you confidence scores. - - -# begin configuration section. -cmd=run.pl -stage=0 -min_lmwt=5 -max_lmwt=20 -use_segments=true # if we have a segments file, use it to convert - # the segments to be relative to the original files. -iter=final -#end configuration section. - -echo "$0 $@" # Print the command line for logging - -[ -f ./path.sh ] && . ./path.sh -. parse_options.sh || exit 1; - -if [ $# -ne 3 ]; then - echo "Usage: $0 [options] " - echo " Options:" - echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." - echo " --stage (0|1|2) # start scoring script from part-way through." - echo " --use-segments (true|false) # use segments and reco2file_and_channel files " - echo " # to produce a ctm relative to the original audio" - echo " # files, with channel information (typically needed" - echo " # for NIST scoring)." - echo "e.g.:" - echo "$0 data/train data/lang exp/tri4a/decode/" - echo "See also: steps/get_train_ctm.sh" - exit 1; -fi - -data=$1 -lang=$2 # Note: may be graph directory not lang directory, but has the necessary stuff copied. -dir=$3 - -model=$dir/../$iter.mdl # assume model one level up from decoding dir. - - -for f in $lang/words.txt $model $dir/lat.1.gz; do - [ ! -f $f ] && echo "$0: expecting file $f to exist" && exit 1; -done - -name=`basename $data`; # e.g. eval2000 - -mkdir -p $dir/scoring/log - -if [ -f $dir/../frame_shift ]; then - frame_shift_opt="--frame-shift=$(cat $dir/../frame_shift)" - echo "$0: $dir/../frame_shift exists, using $frame_shift_opt" -elif [ -f $dir/../frame_subsampling_factor ]; then - factor=$(cat $dir/../frame_subsampling_factor) || exit 1 - frame_shift_opt="--frame-shift=0.0$factor" - echo "$0: $dir/../frame_subsampling_factor exists, using $frame_shift_opt" -fi - - - -if [ $stage -le 0 ]; then - if [ -f $data/segments ] && $use_segments; then - f=$data/reco2file_and_channel - [ ! -f $f ] && echo "$0: expecting file $f to exist" && exit 1; - filter_cmd="utils/convert_ctm.pl $data/segments $data/reco2file_and_channel" - else - filter_cmd=cat - fi - - if [ -f $lang/phones/word_boundary.int ]; then - $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/get_ctm.LMWT.log \ - mkdir -p $dir/score_LMWT/ '&&' \ - lattice-prune --inv-acoustic-scale=LMWT --beam=5 "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ - lattice-align-words $lang/phones/word_boundary.int $model ark:- ark:- \| \ - lattice-to-ctm-conf $frame_shift_opt --decode-mbr=true --inv-acoustic-scale=LMWT ark:- - \| \ - utils/int2sym.pl -f 5 $lang/words.txt \| \ - $filter_cmd '>' $dir/score_LMWT/$name.ctm || exit 1; - else - if [ ! 
-f $lang/phones/align_lexicon.int ]; then - echo "$0: neither $lang/phones/word_boundary.int nor $lang/phones/align_lexicon.int exists: cannot align." - exit 1; - fi - - $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/get_ctm.LMWT.log \ - mkdir -p $dir/score_LMWT/ '&&' \ - lattice-prune --inv-acoustic-scale=LMWT --beam=5 "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ - lattice-align-words-lexicon $lang/phones/align_lexicon.int $model ark:- ark:- \| \ - lattice-to-ctm-conf $frame_shift_opt --decode-mbr=true --inv-acoustic-scale=LMWT ark:- - \| \ - utils/int2sym.pl -f 5 $lang/words.txt \| \ - $filter_cmd '>' $dir/score_LMWT/$name.ctm || exit 1; - fi -fi - - diff --git a/egs/aspire/s5/local/multi_condition/get_ctm_conf.sh b/egs/aspire/s5/local/multi_condition/get_ctm_conf.sh new file mode 120000 index 00000000000..4c0ff429c31 --- /dev/null +++ b/egs/aspire/s5/local/multi_condition/get_ctm_conf.sh @@ -0,0 +1 @@ +../../../../wsj/s5/steps/conf/get_ctm_conf.sh \ No newline at end of file diff --git a/egs/aspire/s5/local/multi_condition/normalize_wavs.py b/egs/aspire/s5/local/multi_condition/normalize_wavs.py index dabf420d9f8..6e67d2113c1 100755 --- a/egs/aspire/s5/local/multi_condition/normalize_wavs.py +++ b/egs/aspire/s5/local/multi_condition/normalize_wavs.py @@ -3,6 +3,8 @@ # normalizes the wave files provided in input file list with a common scaling factor # the common scaling factor is computed to 1/\sqrt(1/(total_samples) * \sum_i{\sum_j x_i(j)^2}) where total_samples is sum of all samples of all wavefiles. If the data is multi-channel then each channel is treated as a seperate wave files +from __future__ import division +from __future__ import print_function import argparse, scipy.io.wavfile, warnings, numpy as np, math def get_normalization_coefficient(file_list, is_rir, additional_scaling): @@ -29,7 +31,7 @@ def get_normalization_coefficient(file_list, is_rir, additional_scaling): assert(rate == sampling_rate) else: sampling_rate = rate - data = data / dtype_max_value + data = data/dtype_max_value if is_rir: # just count the energy of the direct impulse response # this is treated as energy of signal from 0.001 seconds before impulse @@ -55,8 +57,8 @@ def get_normalization_coefficient(file_list, is_rir, additional_scaling): except IOError: warnings.warn("Did not find the file {0}.".format(file)) assert(total_samples > 0) - scaling_coefficient = np.sqrt(total_samples / total_energy) - print "Scaling coefficient is {0}.".format(scaling_coefficient) + scaling_coefficient = np.sqrt(total_samples/total_energy) + print("Scaling coefficient is {0}.".format(scaling_coefficient)) if math.isnan(scaling_coefficient): raise Exception(" Nan encountered while computing scaling coefficient. 
This is mostly due to numerical overflow") return scaling_coefficient diff --git a/egs/aspire/s5/local/multi_condition/prepare_impulses_noises.sh b/egs/aspire/s5/local/multi_condition/prepare_impulses_noises.sh index 804de611cae..8297cdee9ca 100755 --- a/egs/aspire/s5/local/multi_condition/prepare_impulses_noises.sh +++ b/egs/aspire/s5/local/multi_condition/prepare_impulses_noises.sh @@ -114,7 +114,7 @@ cp ${output_dir}_non_normalized/info/* $output_dir/info # rename file location in the noise-rir pairing files for file in `ls $output_dir/info/noise_impulse*`; do - sed -i "s/_non_normalized//g" $file + perl -i -pe "s/_non_normalized//g" $file done # generating the rir-list with probabilities alloted for each rir diff --git a/egs/aspire/s5/local/multi_condition/read_rir.py b/egs/aspire/s5/local/multi_condition/read_rir.py index a2e1c2052e2..04898bda760 100755 --- a/egs/aspire/s5/local/multi_condition/read_rir.py +++ b/egs/aspire/s5/local/multi_condition/read_rir.py @@ -29,9 +29,9 @@ def usage(): #sys.stderr.write(" ".join(sys.argv)+"\n") parser = argparse.ArgumentParser(usage()) parser.add_argument('--output-sampling-rate', type = int, default = 8000, help = 'sampling rate of the output') - parser.add_argument('type', type = str, default = None, help = 'database type', choices = ['air']) - parser.add_argument('input', type = str, default = None, help = 'directory containing the multi-channel data for a particular recording, or file name or file-regex-pattern') - parser.add_argument('output_filename', type = str, default = None, help = 'output filename (if "-" then output is written to output pipe)') + parser.add_argument('type', default = None, help = 'database type', choices = ['air']) + parser.add_argument('input', default = None, help = 'directory containing the multi-channel data for a particular recording, or file name or file-regex-pattern') + parser.add_argument('output_filename', default = None, help = 'output filename (if "-" then output is written to output pipe)') params = parser.parse_args() if params.output_filename == "-": diff --git a/egs/aspire/s5/local/multi_condition/reverberate_wavs.py b/egs/aspire/s5/local/multi_condition/reverberate_wavs.py index 998a3ed5e74..f43e4a2f894 100755 --- a/egs/aspire/s5/local/multi_condition/reverberate_wavs.py +++ b/egs/aspire/s5/local/multi_condition/reverberate_wavs.py @@ -4,18 +4,20 @@ # script to generate multicondition training data / dev data / test data import argparse, glob, math, os, random, scipy.io.wavfile, sys -class list_cyclic_iterator: +class list_cyclic_iterator(object): def __init__(self, list, random_seed = 0): self.list_index = 0 self.list = list random.seed(random_seed) random.shuffle(self.list) - def next(self): + def __next__(self): item = self.list[self.list_index] self.list_index = (self.list_index + 1) % len(self.list) return item + next = __next__ # for Python 2 + def return_nonempty_lines(lines): new_lines = [] for line in lines: @@ -71,15 +73,15 @@ def return_nonempty_lines(lines): for i in range(len(wav_files)): wav_file = " ".join(wav_files[i].split()[1:]) output_wav_file = wav_out_files[i] - impulse_file = impulses.next() + impulse_file = next(impulses) noise_file = '' snr = '' found_impulse = False if add_noise: - for i in xrange(len(impulse_noise_index)): + for i in range(len(impulse_noise_index)): if impulse_file in impulse_noise_index[i][0]: - noise_file = impulse_noise_index[i][1].next() - snr = snrs.next() + noise_file = next(impulse_noise_index[i][1]) + snr = next(snrs) assert(len(wav_file.strip()) > 0) 
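# --- Editorial note (illustration, not part of the patch above) -------------
# The list_cyclic_iterator hunk replaces the Python-2-only .next() method with
# __next__() plus a "next = __next__" alias, and the call sites switch to the
# builtin next(...), which works under both Python 2 and 3. A minimal,
# self-contained sketch of the same pattern (hypothetical names, runnable on
# either interpreter):
class cyclic(object):
    def __init__(self, items):
        self.items = list(items)      # items to cycle through forever
        self.index = 0
    def __next__(self):               # Python 3 iteration protocol
        item = self.items[self.index]
        self.index = (self.index + 1) % len(self.items)
        return item
    next = __next__                   # keeps obj.next() working under Python 2

rirs = cyclic(["rir1.wav", "rir2.wav"])
assert [next(rirs) for _ in range(3)] == ["rir1.wav", "rir2.wav", "rir1.wav"]
# ---------------------------------------------------------------------------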
assert(len(impulse_file.strip()) > 0) assert(len(noise_file.strip()) > 0) diff --git a/egs/aspire/s5/local/nnet3/segment_and_decode.sh b/egs/aspire/s5/local/nnet3/segment_and_decode.sh index d66b72200c1..e8917d091e2 100755 --- a/egs/aspire/s5/local/nnet3/segment_and_decode.sh +++ b/egs/aspire/s5/local/nnet3/segment_and_decode.sh @@ -109,9 +109,9 @@ fi if [ $stage -le 4 ]; then utils/copy_data_dir.sh $sad_work_dir/${segmented_data_set}_seg \ - data/${segmented_data_set}_hires - steps/compute_cmvn_stats.sh data/${segmented_data_set}_hires - utils/fix_data_dir.sh data/${segmented_data_set}_hires + data/${segmented_data_set}_seg_hires + steps/compute_cmvn_stats.sh data/${segmented_data_set}_seg_hires + utils/fix_data_dir.sh data/${segmented_data_set}_seg_hires fi if [ $stage -le 5 ]; then @@ -122,11 +122,11 @@ if [ $stage -le 5 ]; then # acoustic conditions drift over time within the speaker's data. steps/online/nnet2/extract_ivectors.sh --cmd "$train_cmd" --nj $decode_num_jobs \ --sub-speaker-frames $sub_speaker_frames --max-count $max_count \ - data/${segmented_data_set}_hires $lang $ivector_root_dir/extractor \ - $ivector_root_dir/ivectors_${segmented_data_set} + data/${segmented_data_set}_seg_hires $lang $ivector_root_dir/extractor \ + $ivector_root_dir/ivectors_${segmented_data_set}_seg fi -decode_dir=$dir/decode_${segmented_data_set}${affix}_pp +decode_dir=$dir/decode_${segmented_data_set}_seg${affix}_pp if [ $stage -le 6 ]; then echo "Generating lattices" rm -f ${decode_dir}_tg/.error @@ -138,8 +138,8 @@ if [ $stage -le 6 ]; then --extra-right-context-final $extra_right_context_final \ --frames-per-chunk "$frames_per_chunk" \ --skip-scoring true ${iter:+--iter $iter} --lattice-beam $lattice_beam \ - --online-ivector-dir $ivector_root_dir/ivectors_${segmented_data_set} \ - $graph data/${segmented_data_set}_hires ${decode_dir}_tg || \ + --online-ivector-dir $ivector_root_dir/ivectors_${segmented_data_set}_seg \ + $graph data/${segmented_data_set}_seg_hires ${decode_dir}_tg || \ { echo "$0: Error decoding" && exit 1; } fi @@ -147,7 +147,7 @@ if [ $stage -le 7 ]; then echo "Rescoring lattices" steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ --skip-scoring true \ - ${lang}_pp_test{,_fg} data/${segmented_data_set}_hires \ + ${lang}_pp_test{,_fg} data/${segmented_data_set}_seg_hires \ ${decode_dir}_{tg,fg}; fi @@ -161,5 +161,5 @@ if [ $stage -le 8 ]; then ${iter:+--iter $iter} \ --decode-mbr true \ --tune-hyper true \ - $lang $decode_dir $act_data_set $segmented_data_set $out_file + $lang $decode_dir $act_data_set ${segmented_data_set}_seg $out_file fi diff --git a/egs/aspire/s5/local/run_asr_segmentation.sh b/egs/aspire/s5/local/run_asr_segmentation.sh index de0a925a242..095e47e99de 100755 --- a/egs/aspire/s5/local/run_asr_segmentation.sh +++ b/egs/aspire/s5/local/run_asr_segmentation.sh @@ -213,7 +213,7 @@ if [ $stage -le 9 ]; then # Use left and right context options that were used when training # the chain nnet # Increase sil-scale to predict more silence - local/nnet3/prep_test_aspire_segmentation.sh --stage $test_stage \ + local/nnet3/segment_and_decode.sh --stage $test_stage \ --decode-num-jobs $test_nj --affix "${test_affix}" \ --sad-opts "$sad_opts" \ --sad-graph-opts "--min-silence-duration=0.03 --min-speech-duration=0.3 --max-speech-duration=10.0" --sad-priors-opts "--sil-scale=0.1" \ diff --git a/egs/aurora4/s5/RESULTS b/egs/aurora4/s5/RESULTS index a7d4a444a02..dc9af7171f7 100644 --- a/egs/aurora4/s5/RESULTS +++ b/egs/aurora4/s5/RESULTS @@ -1,8 +1,19 @@ -for x in 
exp/{mono,tri,sgmm,nnet,dnn}*/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done +for x in exp/{mono,tri,sgmm,nnet,dnn,chain/tdnn*}*/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done -%WER 19.61 [ 14698 / 74942, 1233 ins, 3759 del, 9706 sub ] exp/tri2b_multi/decode_tgpr_5k_eval92/wer_13 -%WER 13.93 [ 10437 / 74942, 732 ins, 2695 del, 7010 sub ] exp/tri3a_dnn/decode_tgpr_5k_eval92/wer_10 -%WER 13.61 [ 10202 / 74942, 660 ins, 2987 del, 6555 sub ] exp/tri4a_dnn/decode_tgpr_5k_eval92/wer_11 +# mono +%WER 37.42 [ 14223 / 38010, 1030 ins, 2613 del, 10580 sub ] exp/mono0a_multi/decode_tgpr_0166/wer_10 +%WER 38.18 [ 28612 / 74942, 1919 ins, 5319 del, 21374 sub ] exp/mono0a_multi/decode_tgpr_eval92/wer_10 +# tri2b +%WER 20.42 [ 7763 / 38010, 827 ins, 1905 del, 5031 sub ] exp/tri2b_multi/decode_tgpr_5k_0166/wer_12 +%WER 19.61 [ 14728 / 74942, 1411 ins, 3548 del, 9769 sub ] exp/tri2b_multi/decode_tgpr_5k_eval92/wer_12 + +# tri3b +%WER 15.71 [ 5970 / 38010, 641 ins, 1403 del, 3926 sub ] exp/tri3b_multi/decode_tgpr_0166/wer_13 +%WER 15.28 [ 11454 / 74942, 1082 ins, 2633 del, 7739 sub ] exp/tri3b_multi/decode_tgpr_eval92/wer_13 + +# chain +%WER 7.88 [ 2994 / 38010, 216 ins, 1045 del, 1733 sub ] exp/chain/tdnn1a_sp/decode_tgpr_5k_0166/wer_15 +%WER 7.67 [ 5745 / 74942, 392 ins, 1758 del, 3595 sub ] exp/chain/tdnn1a_sp/decode_tgpr_5k_eval92/wer_13 for x in /mnt/matylda3/qmallidi/Karels_New-Parametric-ReLU/kaldi/egs/aurora4/s5_PReLU/exp/{mono,tri,sgmm,nnet,dnn}*/decode*; do [ -d $x ] && grep WER $x/wer_* | /mnt/matylda5/iveselyk/DEVEL/kaldi-official/egs/aurora4/s5/utils/best_wer.sh; done diff --git a/egs/aurora4/s5/conf/mfcc_hires.conf b/egs/aurora4/s5/conf/mfcc_hires.conf new file mode 100644 index 00000000000..434834a6725 --- /dev/null +++ b/egs/aurora4/s5/conf/mfcc_hires.conf @@ -0,0 +1,10 @@ +# config for high-resolution MFCC features, intended for neural network training +# Note: we keep all cepstra, so it has the same info as filterbank features, +# but MFCC is more easily compressible (because less correlated) which is why +# we prefer this method. +--use-energy=false # use average of log energy, not energy. +--num-mel-bins=40 # similar to Google's setup. +--num-ceps=40 # there is no dimensionality reduction. +--low-freq=20 # low cutoff frequency for mel bins... this is high-bandwidth data, so + # there might be some information at the low end. +--high-freq=-400 # high cutoff frequently, relative to Nyquist of 8000 (=7600) diff --git a/egs/aurora4/s5/conf/online_cmvn.conf b/egs/aurora4/s5/conf/online_cmvn.conf new file mode 100644 index 00000000000..cbdaf5f281c --- /dev/null +++ b/egs/aurora4/s5/conf/online_cmvn.conf @@ -0,0 +1 @@ +# configuration file for apply-cmvn-online, used in the script ../local/online/run_online_decoding_nnet2.sh diff --git a/egs/aurora4/s5/local/chain/compare_wer.sh b/egs/aurora4/s5/local/chain/compare_wer.sh new file mode 100755 index 00000000000..91701cad9e9 --- /dev/null +++ b/egs/aurora4/s5/local/chain/compare_wer.sh @@ -0,0 +1,137 @@ +#!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. local/chain/compare_wer.sh exp/chain/tdnn_{c,d}_sp +# For use with discriminatively trained systems you specify the epochs after a colon: +# for instance, +# local/chain/compare_wer.sh exp/chain/tdnn_c_sp exp/chain/tdnn_c_sp_smbr:{1,2,3} + + +if [ $# == 0 ]; then + echo "Usage: $0: [--looped] [--online] [ ... 
]" + echo "e.g.: $0 exp/chain/tdnn_{b,c}_sp" + echo "or (with epoch numbers for discriminative training):" + echo "$0 exp/chain/tdnn_b_sp_disc:{1,2,3}" + exit 1 +fi + +echo "# $0 $*" + +include_looped=false +if [ "$1" == "--looped" ]; then + include_looped=true + shift +fi +include_online=false +if [ "$1" == "--online" ]; then + include_online=true + shift +fi + + +used_epochs=false + +# this function set_names is used to separate the epoch-related parts of the name +# [for discriminative training] and the regular parts of the name. +# If called with a colon-free directory name, like: +# set_names exp/chain/tdnn_lstm1e_sp_bi_smbr +# it will set dir=exp/chain/tdnn_lstm1e_sp_bi_smbr and epoch_infix="" +# If called with something like: +# set_names exp/chain/tdnn_d_sp_smbr:3 +# it will set dir=exp/chain/tdnn_d_sp_smbr and epoch_infix="_epoch3" + + +set_names() { + if [ $# != 1 ]; then + echo "compare_wer_general.sh: internal error" + exit 1 # exit the program + fi + dirname=$(echo $1 | cut -d: -f1) + epoch=$(echo $1 | cut -s -d: -f2) + if [ -z $epoch ]; then + epoch_infix="" + else + used_epochs=true + epoch_infix=_epoch${epoch} + fi +} + + + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +strings=( +"# WER eval92 (tgpr_5k) " +"# WER 0166 (tgpr_5k) ") + +for n in 0 1; do + echo -n "${strings[$n]}" + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + decode_names=(tgpr_5k_eval92 tgpr_5k_0166) + + wer=$(cat $dirname/decode_${decode_names[$n]}/wer* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + if $include_looped; then + echo -n "# [looped:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + wer=$(cat $dirname/decode_looped_${decode_names[$n]}/wer* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi + if $include_online; then + echo -n "# [online:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + wer=$(cat ${dirname}_online/decode_${decode_names[$n]}/wer* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi +done + + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. 
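# --- Editorial note (illustration, not part of the script above) ------------
# set_names splits a "dir:epoch" argument on the first colon; a guarded,
# never-executed example (directory names are made up):
if false; then
  set_names exp/chain/tdnn_d_sp_smbr:3
  echo "$dirname $epoch_infix"   # -> exp/chain/tdnn_d_sp_smbr _epoch3
  set_names exp/chain/tdnn1a_sp
  echo "$dirname $epoch_infix"   # -> exp/chain/tdnn1a_sp (empty epoch_infix)
fi
# ---------------------------------------------------------------------------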
+fi + + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train prob (xent)" +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob (xent)" +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Num-params " +for x in $*; do + printf "% 10s" $(grep num-parameters $x/log/progress.1.log | awk '{print $2}') +done +echo diff --git a/egs/aurora4/s5/local/chain/run_tdnn.sh b/egs/aurora4/s5/local/chain/run_tdnn.sh new file mode 120000 index 00000000000..34499362831 --- /dev/null +++ b/egs/aurora4/s5/local/chain/run_tdnn.sh @@ -0,0 +1 @@ +tuning/run_tdnn_1a.sh \ No newline at end of file diff --git a/egs/aurora4/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/aurora4/s5/local/chain/tuning/run_tdnn_1a.sh new file mode 100755 index 00000000000..8bc69f9c8cf --- /dev/null +++ b/egs/aurora4/s5/local/chain/tuning/run_tdnn_1a.sh @@ -0,0 +1,286 @@ +#!/bin/bash + +# 1a is same as 1h setup in WSJ + +# local/chain/compare_wer.sh exp/chain/tdnn1a_sp +# System tdnn1a_sp +# WER eval92 (tgpr_5k) 7.67 +# WER 0166 (tgpr_5k) 7.88 +# Final train prob -0.0338 +# Final valid prob -0.0602 +# Final train prob (xent) -0.7632 +# Final valid prob (xent) -0.9377 +# Num-params 8315264 + +# steps/info/chain_dir_info.pl exp/chain/tdnn1a_sp +# exp/chain/tdnn1a_sp: num-iters=24 nj=2..8 num-params=8.3M dim=40+100->2752 combine=-0.034->-0.034 (over 1) xent:train/valid[15,23,final]=(-1.13,-0.809,-0.763/-1.16,-0.961,-0.938) logprob:train/valid[15,23,final]=(-0.063,-0.038,-0.034/-0.068,-0.062,-0.060) + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +train_set=train_si84_multi +test_sets="eval92 0166" +gmm=tri3b_multi # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. + +num_threads_ubm=8 + +nj_extractor=10 +# It runs a JOB with '-pe smp N', where N=$[threads*processes] +num_threads_extractor=4 +num_processes_extractor=2 + +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. + +# Options which are not passed through to run_ivector_common.sh +affix=1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +common_egs_dir= +reporting_email= + +# LSTM/chain options +train_stage=-10 +xent_regularize=0.1 +dropout_schedule='0,0@0.20,0.5@0.50,0' + +# training chunk-options +chunk_width=140,100,160 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 + +# training options +srand=0 +remove_egs=false + +#decode options +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! 
cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 10 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + +if [ $stage -le 12 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print(0.5/$xent_regularize)" | python) + tdnn_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim-continuous=true" + tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.01" + output_opts="l2-regularize=0.005" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + idct-layer name=idct input=input dim=40 cepstral-lifter=22 affine-transform-file=$dir/configs/idct.mat + delta-layer name=delta input=idct + no-op-component name=input2 input=Append(delta, Scale(1.0, ReplaceIndex(ivector, t, 0))) + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 $tdnn_opts dim=1024 input=input2 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + linear-component name=prefinal-l dim=192 $linear_opts + + + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1024 small-dim=192 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1024 small-dim=192 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if 
[ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/wsj-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$train_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=10 \ + --trainer.frames-per-iter=5000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=8 \ + --trainer.optimization.initial-effective-lrate=0.0005 \ + --trainer.optimization.final-effective-lrate=0.00005 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=0 \ + --egs.chunk-right-context=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=wait \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/lang/check_phones_compatible.sh \ + data/lang_test_tgpr_5k/phones.txt $lang/phones.txt + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgpr_5k \ + $tree_dir $tree_dir/graph_tgpr_5k || exit 1; + +fi + +if [ $stage -le 15 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in ${test_sets}; do + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l 5." + exit 1 +fi + +if [ $stage -le 4 ]; then + echo "$0: preparing directory for speed-perturbed data" + utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp +fi + +if [ $stage -le 5 ]; then + echo "$0: creating high-resolution MFCC features" + + # this shows how you can split across multiple file-systems. we'll split the + # MFCC dir across multiple locations. You might want to be careful here, if you + # have multiple copies of Kaldi checked out and run the same recipe, not to let + # them overwrite each other. + mfccdir=data/${train_set}_sp_hires/data + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + utils/create_split_dir.pl /export/b0{5,6,7,8}/$USER/kaldi-data/mfcc/wsj-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage + fi + + for datadir in ${train_set}_sp ${test_sets}; do + utils/copy_data_dir.sh data/$datadir data/${datadir}_hires + done + + # do volume-perturbation on the training data prior to extracting hires + # features; this helps make trained nnets more invariant to test data volume. 
+ utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires + + for datadir in ${train_set}_sp ${test_sets}; do + steps/make_mfcc.sh --nj $nj --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${datadir}_hires + steps/compute_cmvn_stats.sh data/${datadir}_hires + utils/fix_data_dir.sh data/${datadir}_hires + done +fi + +if [ $stage -le 6 ]; then + echo "$0: computing a subset of data to train the diagonal UBM." + + mkdir -p exp/nnet3${nnet3_affix}/diag_ubm + temp_data_root=exp/nnet3${nnet3_affix}/diag_ubm + + # train a diagonal UBM using a subset of about a quarter of the data + num_utts_total=$(wc -l " data/local/lang_tmp data/lang || exit 1; +if [ $stage -le 1 ]; then + local/wsj_prepare_dict.sh + utils/prepare_lang.sh data/local/dict "" data/local/lang_tmp data/lang +fi -local/aurora4_format_data.sh || exit 1; +if [ $stage -le 2 ]; then + local/aurora4_format_data.sh +fi -# Now make MFCC features. -# mfccdir should be some place with a largish disk where you -# want to store MFCC features. mfccdir=mfcc -for x in train_si84_clean train_si84_multi test_eval92 test_0166 dev_0330 dev_1206; do - steps/make_mfcc.sh --nj 10 \ - data/$x exp/make_mfcc/$x $mfccdir || exit 1; - steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir || exit 1; -done - -# make fbank features -fbankdir=fbank -mkdir -p data-fbank -for x in train_si84_clean train_si84_multi dev_0330 dev_1206 test_eval92 test_0166; do - cp -r data/$x data-fbank/$x - steps/make_fbank.sh --nj 10 \ - data-fbank/$x exp/make_fbank/$x $fbankdir || exit 1; -done - -# Note: the --boost-silence option should probably be omitted by default -# for normal setups. It doesn't always help. [it's to discourage non-silence -# models from modeling silence.] -#steps/train_mono.sh --boost-silence 1.25 --nj 10 \ -# data/train_si84_clean data/lang exp/mono0a || exit 1; - -steps/train_mono.sh --boost-silence 1.25 --nj 10 \ - data/train_si84_multi data/lang exp/mono0a_multi || exit 1; -#( -# utils/mkgraph.sh data/lang_test_tgpr exp/mono0a exp/mono0a/graph_tgpr && \ -# steps/decode.sh --nj 8 \ -# exp/mono0a/graph_tgpr data/test_eval92 exp/mono0a/decode_tgpr_eval92 -#) & - -#steps/align_si.sh --boost-silence 1.25 --nj 10 \ -# data/train_si84_clean data/lang exp/mono0a exp/mono0a_ali || exit 1; -steps/align_si.sh --boost-silence 1.25 --nj 10 \ - data/train_si84_multi data/lang exp/mono0a_multi exp/mono0a_multi_ali || exit 1; - -#steps/train_deltas.sh --boost-silence 1.25 \ -# 2000 10000 data/train_si84_clean data/lang exp/mono0a_ali exp/tri1 || exit 1; - -steps/train_deltas.sh --boost-silence 1.25 \ - 2000 10000 data/train_si84_multi data/lang exp/mono0a_multi_ali exp/tri1_multi || exit 1; - +if [ $stage -le 3 ]; then + # Now make MFCC features. + # mfccdir should be some place with a largish disk where you + # want to store MFCC features. + for x in train_si84_clean train_si84_multi test_eval92 test_0166 dev_0330 dev_1206; do + steps/make_mfcc.sh --nj 10 \ + data/$x exp/make_mfcc/$x $mfccdir || exit 1; + steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir || exit 1; + done +fi + +model_affix= +if [ $train_set == 'multi' ]; then + model_affix=_multi +fi + +if [ $stage -le 4 ]; then + # Note: the --boost-silence option should probably be omitted by default + # for normal setups. It doesn't always help. [it's to discourage non-silence + # models from modeling silence.] 
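# --- Editorial note (illustration, not part of the patch above) -------------
# Each block below is guarded by "[ $stage -le N ]" plus the $train / $decode
# booleans, and ${model_affix} picks the clean vs. multi-condition model
# directories. Assuming run.sh parses these variables as command-line flags
# via utils/parse_options.sh (the usual Kaldi convention; the option-parsing
# preamble is not visible in this hunk), a partial rerun would look roughly
# like:
#   ./run.sh --stage 4 --train false --decode true   # redo only the decodes
#   ./run.sh --stage 6 --train true --decode false   # retrain from tri2 onward
# ---------------------------------------------------------------------------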
+ if $train; then + steps/train_mono.sh --boost-silence 1.25 --nj 10 --cmd "$train_cmd" \ + data/train_si84_${train_set} data/lang exp/mono0a${model_affix} || exit 1; + fi + + if $decode; then + for testdir in $test_sets; do + utils/mkgraph.sh data/lang_test_tgpr exp/mono0a${model_affix} exp/mono0a${model_affix}/graph_tgpr && \ + steps/decode.sh --nj 8 --cmd "$decode_cmd" \ + exp/mono0a${model_affix}/graph_tgpr data/test_${testdir} exp/mono0a${model_affix}/decode_tgpr_${testdir} + done + fi +fi + +if [ $stage -le 5 ]; then + # tri1 + if $train; then + steps/align_si.sh --boost-silence 1.25 --nj 10 --cmd "$train_cmd" \ + data/train_si84_${train_set} data/lang exp/mono0a${model_affix} exp/mono0a${model_affix}_ali || exit 1; + + steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" \ + 2000 10000 data/train_si84_${train_set} data/lang exp/mono0a${model_affix}_ali exp/tri1${model_affix} || exit 1; + fi +fi + +if [ $stage -le 6 ]; then + # tri2 + if $train; then + steps/align_si.sh --nj 10 --cmd "$train_cmd" \ + data/train_si84_${train_set} data/lang exp/tri1${model_affix} exp/tri1${model_affix}_ali_si84 || exit 1; + + steps/train_deltas.sh --cmd "$train_cmd" 2500 15000 \ + data/train_si84_${train_set} data/lang exp/tri1${model_affix}_ali_si84 exp/tri2a${model_affix} || exit 1; + + steps/align_si.sh --nj 10 --cmd "$train_cmd" \ + data/train_si84_${train_set} data/lang exp/tri2a${model_affix} exp/tri2a${model_affix}_ali_si84 || exit 1; + + steps/train_lda_mllt.sh --cmd "$train_cmd" \ + --splice-opts "--left-context=3 --right-context=3" \ + 2500 15000 data/train_si84_${train_set} data/lang exp/tri2a${model_affix}_ali_si84 exp/tri2b${model_affix} || exit 1; + fi + + if $decode; then + for testdir in $test_sets; do + utils/mkgraph.sh data/lang_test_tgpr_5k exp/tri2b${model_affix} exp/tri2b${model_affix}/graph_tgpr_5k || exit 1; + steps/decode.sh --nj 8 --cmd "$decode_cmd" \ + exp/tri2b${model_affix}/graph_tgpr_5k data/test_${testdir} exp/tri2b${model_affix}/decode_tgpr_5k_${testdir} || exit 1; + done + fi +fi + +if [ $stage -le 7 ]; then + # From 2b system, train 3b which is LDA + MLLT + SAT. + + # Align tri2b system with all the si84 data. 
+ if $train; then + steps/align_si.sh --nj 10 --cmd "$train_cmd" --use-graphs true \ + data/train_si84_${train_set} data/lang exp/tri2b${model_affix} exp/tri2b${model_affix}_ali_si84 || exit 1; + + steps/train_sat.sh --cmd "$train_cmd" 4200 40000 \ + data/train_si84_${train_set} data/lang exp/tri2b${model_affix}_ali_si84 exp/tri3b${model_affix} || exit 1; + fi + + if $decode; then + for testdir in $test_sets; do + nspk=$(wc -l > sys.stderr, "warning: suspicious JOB argument " + argv[0]; + print("warning: suspicious JOB argument " + argv[0], file=sys.stderr); if jobstart > jobend: sys.stderr.write("lonestar.py: JOBSTART("+ str(jobstart) + ") must be lower than JOBEND(" + str(jobend) + ")\n") @@ -238,8 +239,8 @@ def setup_paths_and_vars(opts): cwd = os.getcwd() if opts.varname and (opts.varname not in opts.logfile ) and (opts.jobstart != opts.jobend): - print >>sys.stderr, "lonestar.py: you are trying to run a parallel job" \ - "but you are putting the output into just one log file (" + opts.logfile + ")"; + print("lonestar.py: you are trying to run a parallel job" \ + "but you are putting the output into just one log file (" + opts.logfile + ")", file=sys.stderr); sys.exit(1) if not os.path.isabs(opts.logfile): @@ -261,8 +262,8 @@ def setup_paths_and_vars(opts): taskname=os.path.basename(queue_logfile) taskname = taskname.replace(".log", ""); if taskname == "": - print >> sys.stderr, "lonestar.py: you specified the log file name in such form " \ - "that leads to an empty task name ("+logfile + ")"; + print("lonestar.py: you specified the log file name in such form " \ + "that leads to an empty task name ("+logfile + ")", file=sys.stderr); sys.exit(1) if not os.path.isabs(queue_logfile): diff --git a/egs/babel/s5b/local/resegment/segmentation.py b/egs/babel/s5b/local/resegment/segmentation.py index 7c5c8665a16..aed65a4ca14 100755 --- a/egs/babel/s5b/local/resegment/segmentation.py +++ b/egs/babel/s5b/local/resegment/segmentation.py @@ -3,6 +3,7 @@ # Copyright 2014 Vimal Manohar # Apache 2.0 +from __future__ import division import os, glob, argparse, sys, re, time from argparse import ArgumentParser @@ -19,12 +20,12 @@ def mean(l): if len(l) > 0: - return float(sum(l)) / len(l) + return (float(sum(l))/len(l)) return 0 # Analysis class # Stores statistics like the confusion matrix, length of the segments etc. -class Analysis: +class Analysis(object): def __init__(self, file_id, frame_shift, prefix): self.confusion_matrix = [0] * 9 self.type_counts = [ [[] for j in range(0,9)] for i in range(0,3) ] @@ -274,8 +275,8 @@ def read_rttm_file(rttm_file, temp_dir, frame_shift): i = len(this_file) category = splits[6] word = splits[5] - start_time = int(float(splits[3])/frame_shift + 0.5) - duration = int(float(splits[4])/frame_shift + 0.5) + start_time = int((float(splits[3])/frame_shift) + 0.5) + duration = int((float(splits[4])/frame_shift) + 0.5) if i < start_time: this_file.extend(["0"]*(start_time - i)) if type1 == "NON-LEX": @@ -295,7 +296,7 @@ def read_rttm_file(rttm_file, temp_dir, frame_shift): # Stats class to store some basic stats about the number of # times the post-processor goes through particular loops or blocks # of code in the algorithm. This is just for debugging. 
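# --- Editorial note (illustration, not part of the patch above) -------------
# "from __future__ import division" (added at the top of this file) makes "/"
# true division under Python 2 as well, so the seconds-to-frames conversions
# below give the same result under either interpreter, and integer operands
# no longer truncate silently. A small sketch with made-up values (assumes
# Python 3 or the __future__ import, as in this file):
frame_shift = 0.01                              # 10 ms frames
max_frames = int(30.0 / frame_shift)            # 30 s -> 3000 frames
start_frame = int(12.34 / frame_shift + 0.5)    # 12.34 s -> nearest frame
assert max_frames == 3000 and start_frame == 1234
assert sum([1, 2, 2]) / len([1, 2, 2]) == 5.0 / 3.0   # true division, not 1
# ---------------------------------------------------------------------------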
-class Stats: +class Stats(object): def __init__(self): self.inter_utt_nonspeech = 0 self.merge_nonspeech_segment = 0 @@ -321,7 +322,7 @@ def reset(self): self.noise_only = 0 # Timer class to time functions -class Timer: +class Timer(object): def __enter__(self): self.start = time.clock() return self @@ -332,7 +333,7 @@ def __exit__(self, *args): # The main class for post-processing a file. # This does the segmentation either looking at the file isolated # or by looking at both classes simultaneously -class JointResegmenter: +class JointResegmenter(object): def __init__(self, P, A, f, options, phone_map, stats = None, reference = None): # Pointers to prediction arrays and Initialization @@ -1290,22 +1291,22 @@ def main(): dest='hard_max_segment_length', default=15.0, \ help="Hard maximum on the segment length above which the segment " \ + "will be broken even if in the middle of speech (default: %(default)s)") - parser.add_argument('--first-separator', type=str, \ + parser.add_argument('--first-separator', \ dest='first_separator', default="-", \ help="Separator between recording-id and start-time (default: %(default)s)") - parser.add_argument('--second-separator', type=str, \ + parser.add_argument('--second-separator', \ dest='second_separator', default="-", \ help="Separator between start-time and end-time (default: %(default)s)") - parser.add_argument('--remove-noise-only-segments', type=str, \ + parser.add_argument('--remove-noise-only-segments', \ dest='remove_noise_only_segments', default="true", choices=("true", "false"), \ help="Remove segments that have only noise. (default: %(default)s)") parser.add_argument('--min-inter-utt-silence-length', type=float, \ dest='min_inter_utt_silence_length', default=1.0, \ help="Minimum silence that must exist between two separate utterances (default: %(default)s)"); - parser.add_argument('--channel1-file', type=str, \ + parser.add_argument('--channel1-file', \ dest='channel1_file', default="inLine", \ help="String that matches with the channel 1 file (default: %(default)s)") - parser.add_argument('--channel2-file', type=str, \ + parser.add_argument('--channel2-file', \ dest='channel2_file', default="outLine", \ help="String that matches with the channel 2 file (default: %(default)s)") parser.add_argument('--isolated-resegmentation', \ @@ -1388,7 +1389,7 @@ def main(): speech_cap = None if options.speech_cap_length != None: - speech_cap = int( options.speech_cap_length / options.frame_shift ) + speech_cap = int(options.speech_cap_length/options.frame_shift) # End if for f in pred_files: @@ -1454,7 +1455,7 @@ def main(): f2 = f3 # End if - if (len(A1) - len(A2)) > options.max_length_diff / options.frame_shift: + if (len(A1) - len(A2)) > int(options.max_length_diff/options.frame_shift): sys.stderr.write( \ "%s: Warning: Lengths of %s and %s differ by more than %f. 
" \ % (sys.argv[0], f1,f2, options.max_length_diff) \ diff --git a/egs/babel/s5c/local/lonestar.py b/egs/babel/s5c/local/lonestar.py index e1594e55ada..809f99b22cf 100755 --- a/egs/babel/s5c/local/lonestar.py +++ b/egs/babel/s5c/local/lonestar.py @@ -1,4 +1,5 @@ #!/usr/bin/env python +from __future__ import print_function from pylauncher import * import pylauncher import sys @@ -39,7 +40,7 @@ def KaldiLauncher(lo, **kwargs): logfiles = list() commands = list() - for q in xrange(lo.jobstart, lo.jobend+1): + for q in range(lo.jobstart, lo.jobend+1): s = "bash " + lo.queue_scriptfile + " " + str(q) commands.append(s) @@ -74,7 +75,7 @@ def KaldiLauncher(lo, **kwargs): time.sleep(delay); lines=tail(10, logfile) - with_status=filter(lambda x:re.search(r'with status (\d+)', x), lines) + with_status=[x for x in lines if re.search(r'with status (\d+)', x)] if len(with_status) == 0: sys.stderr.write("The last line(s) of the log-file " + logfile + " does not seem" @@ -98,7 +99,7 @@ def KaldiLauncher(lo, **kwargs): sys.exit(-1); #Remove service files. Be careful not to remove something that might be needed in problem diagnostics - for i in xrange(len(commands)): + for i in range(len(commands)): out_file=os.path.join(qdir, ce.outstring+str(i)) #First, let's wait on files missing (it might be that those are missing @@ -149,7 +150,7 @@ def KaldiLauncher(lo, **kwargs): #print job.final_report() -class LauncherOpts: +class LauncherOpts(object): def __init__(self): self.sync=0 self.nof_threads = 1 @@ -199,7 +200,7 @@ def CmdLineParser(argv): jobend=int(m.group(2)) argv.pop(0) elif re.match("^.+=.*:.*$", argv[0]): - print >> sys.stderr, "warning: suspicious JOB argument " + argv[0]; + print("warning: suspicious JOB argument " + argv[0], file=sys.stderr); if jobstart > jobend: sys.stderr.write("lonestar.py: JOBSTART("+ str(jobstart) + ") must be lower than JOBEND(" + str(jobend) + ")\n") @@ -238,8 +239,8 @@ def setup_paths_and_vars(opts): cwd = os.getcwd() if opts.varname and (opts.varname not in opts.logfile ) and (opts.jobstart != opts.jobend): - print >>sys.stderr, "lonestar.py: you are trying to run a parallel job" \ - "but you are putting the output into just one log file (" + opts.logfile + ")"; + print("lonestar.py: you are trying to run a parallel job" \ + "but you are putting the output into just one log file (" + opts.logfile + ")", file=sys.stderr); sys.exit(1) if not os.path.isabs(opts.logfile): @@ -261,8 +262,8 @@ def setup_paths_and_vars(opts): taskname=os.path.basename(queue_logfile) taskname = taskname.replace(".log", ""); if taskname == "": - print >> sys.stderr, "lonestar.py: you specified the log file name in such form " \ - "that leads to an empty task name ("+logfile + ")"; + print("lonestar.py: you specified the log file name in such form " \ + "that leads to an empty task name ("+logfile + ")", file=sys.stderr); sys.exit(1) if not os.path.isabs(queue_logfile): diff --git a/egs/babel/s5c/local/resegment/segmentation.py b/egs/babel/s5c/local/resegment/segmentation.py index 7c5c8665a16..4bdb0fea75c 100755 --- a/egs/babel/s5c/local/resegment/segmentation.py +++ b/egs/babel/s5c/local/resegment/segmentation.py @@ -3,6 +3,7 @@ # Copyright 2014 Vimal Manohar # Apache 2.0 +from __future__ import division import os, glob, argparse, sys, re, time from argparse import ArgumentParser @@ -19,12 +20,12 @@ def mean(l): if len(l) > 0: - return float(sum(l)) / len(l) + return (float(sum(l))/len(l)) return 0 # Analysis class # Stores statistics like the confusion matrix, length of the segments etc. 
-class Analysis: +class Analysis(object): def __init__(self, file_id, frame_shift, prefix): self.confusion_matrix = [0] * 9 self.type_counts = [ [[] for j in range(0,9)] for i in range(0,3) ] @@ -274,7 +275,7 @@ def read_rttm_file(rttm_file, temp_dir, frame_shift): i = len(this_file) category = splits[6] word = splits[5] - start_time = int(float(splits[3])/frame_shift + 0.5) + start_time = int((float(splits[3])/frame_shift) + 0.5) duration = int(float(splits[4])/frame_shift + 0.5) if i < start_time: this_file.extend(["0"]*(start_time - i)) @@ -295,7 +296,7 @@ def read_rttm_file(rttm_file, temp_dir, frame_shift): # Stats class to store some basic stats about the number of # times the post-processor goes through particular loops or blocks # of code in the algorithm. This is just for debugging. -class Stats: +class Stats(object): def __init__(self): self.inter_utt_nonspeech = 0 self.merge_nonspeech_segment = 0 @@ -321,7 +322,7 @@ def reset(self): self.noise_only = 0 # Timer class to time functions -class Timer: +class Timer(object): def __enter__(self): self.start = time.clock() return self @@ -332,7 +333,7 @@ def __exit__(self, *args): # The main class for post-processing a file. # This does the segmentation either looking at the file isolated # or by looking at both classes simultaneously -class JointResegmenter: +class JointResegmenter(object): def __init__(self, P, A, f, options, phone_map, stats = None, reference = None): # Pointers to prediction arrays and Initialization @@ -351,9 +352,9 @@ def __init__(self, P, A, f, options, phone_map, stats = None, reference = None): self.frame_shift = options.frame_shift # Convert length in seconds to frames - self.max_frames = int(options.max_segment_length / options.frame_shift) - self.hard_max_frames = int(options.hard_max_segment_length / options.frame_shift) - self.min_inter_utt_nonspeech_length = int(options.min_inter_utt_silence_length / options.frame_shift) + self.max_frames = int(options.max_segment_length/options.frame_shift) + self.hard_max_frames = int(options.hard_max_segment_length/options.frame_shift) + self.min_inter_utt_nonspeech_length = int(options.min_inter_utt_silence_length/options.frame_shift) if ( options.remove_noise_only_segments == "false" ): self.remove_noise_segments = False elif ( options.remove_noise_only_segments == "true" ): @@ -540,7 +541,7 @@ def set_nonspeech_proportion(self): # Set the number of non-speech frames to be added depending on the # silence proportion. The target number of frames in the segments # is computed as below: - target_segment_frames = int(num_speech_frames / (1.0 - self.options.silence_proportion)) + target_segment_frames = int(num_speech_frames/(1.0 - self.options.silence_proportion)) # The number of frames currently in the segments num_segment_frames = num_speech_frames @@ -599,7 +600,7 @@ def set_nonspeech_proportion(self): if not changed: # avoid an infinite loop. if no changes, then break.
break if num_segment_frames < target_segment_frames: - proportion = float(num_segment_frames - num_speech_frames) / num_segment_frames + proportion = float(num_segment_frames - num_speech_frames)/num_segment_frames sys.stderr.write("%s: Warning: for recording %s, only got a proportion %f of non-speech frames, versus target %f\n" % (sys.argv[0], self.file_id, proportion, self.options.silence_proportion)) ########################################################################### @@ -863,14 +864,14 @@ def split_long_segments(self): # Count the number of times long segments are split self.stats.split_segments += 1 - num_pieces = int((float(segment_length) / self.hard_max_frames) + 0.99999) + num_pieces = int((float(segment_length)/self.hard_max_frames) + 0.99999) sys.stderr.write("%s: Warning: for recording %s, " \ % (sys.argv[0], self.file_id) \ + "splitting segment of length %f seconds into %d pieces " \ % (segment_length * self.frame_shift, num_pieces) \ + "(--hard-max-segment-length %f)\n" \ % self.options.hard_max_segment_length) - frames_per_piece = int(segment_length / num_pieces) + frames_per_piece = int(segment_length/num_pieces) for i in range(1,num_pieces): q = n + i * frames_per_piece self.S[q] = True @@ -1290,22 +1291,22 @@ def main(): dest='hard_max_segment_length', default=15.0, \ help="Hard maximum on the segment length above which the segment " \ + "will be broken even if in the middle of speech (default: %(default)s)") - parser.add_argument('--first-separator', type=str, \ + parser.add_argument('--first-separator', \ dest='first_separator', default="-", \ help="Separator between recording-id and start-time (default: %(default)s)") - parser.add_argument('--second-separator', type=str, \ + parser.add_argument('--second-separator', \ dest='second_separator', default="-", \ help="Separator between start-time and end-time (default: %(default)s)") - parser.add_argument('--remove-noise-only-segments', type=str, \ + parser.add_argument('--remove-noise-only-segments', \ dest='remove_noise_only_segments', default="true", choices=("true", "false"), \ help="Remove segments that have only noise. (default: %(default)s)") parser.add_argument('--min-inter-utt-silence-length', type=float, \ dest='min_inter_utt_silence_length', default=1.0, \ help="Minimum silence that must exist between two separate utterances (default: %(default)s)"); - parser.add_argument('--channel1-file', type=str, \ + parser.add_argument('--channel1-file', \ dest='channel1_file', default="inLine", \ help="String that matches with the channel 1 file (default: %(default)s)") - parser.add_argument('--channel2-file', type=str, \ + parser.add_argument('--channel2-file', \ dest='channel2_file', default="outLine", \ help="String that matches with the channel 2 file (default: %(default)s)") parser.add_argument('--isolated-resegmentation', \ @@ -1388,7 +1389,7 @@ def main(): speech_cap = None if options.speech_cap_length != None: - speech_cap = int( options.speech_cap_length / options.frame_shift ) + speech_cap = int(options.speech_cap_length/options.frame_shift) # End if for f in pred_files: @@ -1454,7 +1455,7 @@ def main(): f2 = f3 # End if - if (len(A1) - len(A2)) > options.max_length_diff / options.frame_shift: + if (len(A1) - len(A2)) > int(options.max_length_diff/options.frame_shift): sys.stderr.write( \ "%s: Warning: Lengths of %s and %s differ by more than %f. 
" \ % (sys.argv[0], f1,f2, options.max_length_diff) \ diff --git a/egs/babel/s5c/local/syllab/generate_syllable_lang.sh b/egs/babel/s5c/local/syllab/generate_syllable_lang.sh index 2d1fcb2259e..4a0810b9415 100755 --- a/egs/babel/s5c/local/syllab/generate_syllable_lang.sh +++ b/egs/babel/s5c/local/syllab/generate_syllable_lang.sh @@ -118,8 +118,7 @@ ln -s lex.syllabs2phones.disambig.fst $out/L_disambig.fst echo "Validating the output lang dir" utils/validate_lang.pl $out || exit 1 -sed -i'' 's/#1$//g' $lout/lexicon.txt -sed -i'' 's/#1$//g' $lout/lexiconp.txt +perl -i -pe 's/#1$//g' $lout/lexicon.txt $lout/lexiconp.txt echo "Done OK." exit 0 diff --git a/egs/babel/s5d/conf/lang/404-georgian.FLP.official.conf b/egs/babel/s5d/conf/lang/404-georgian.FLP.official.conf index a6b22de419f..9cd043716ce 100644 --- a/egs/babel/s5d/conf/lang/404-georgian.FLP.official.conf +++ b/egs/babel/s5d/conf/lang/404-georgian.FLP.official.conf @@ -75,8 +75,8 @@ unsup_data_list=./conf/lists/404-georgian/untranscribed-training.list unsup_nj=32 -lexicon_file= -lexiconFlags="--romanized --oov " +lexicon_file=/export/corpora/LDC/LDC2016S12/IARPA_BABEL_OP3_404/conversational/reference_materials/lexicon.txt +lexiconFlags=" --romanized --oov " diff --git a/egs/babel/s5d/local/chain/tuning/run_tdnn.sh b/egs/babel/s5d/local/chain/tuning/run_tdnn.sh index 4f485edf7da..7b4535f8c5e 100755 --- a/egs/babel/s5d/local/chain/tuning/run_tdnn.sh +++ b/egs/babel/s5d/local/chain/tuning/run_tdnn.sh @@ -128,7 +128,7 @@ if [ $stage -le 17 ]; then num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm.sh b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm.sh index 72f7a3c32dd..5fc14dda826 100755 --- a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm.sh +++ b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm.sh @@ -129,7 +129,7 @@ if [ $stage -le 17 ]; then num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=20" label_delay=5 diff --git a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab1.sh b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab1.sh index be0c2cc4b9b..8c7de5d18d4 100755 --- a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab1.sh +++ b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab1.sh @@ -127,7 +127,7 @@ if [ $stage -le 17 ]; then num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=20" label_delay=5 diff --git a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab2.sh b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab2.sh index 8f21a239794..0b3e70b5a04 100755 --- a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab2.sh +++ b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab2.sh @@ -127,7 +127,7 @@ if [ $stage -le 17 ]; then num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk 
'{print $2}') [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=20" label_delay=5 diff --git a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab3.sh b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab3.sh index 7898d172242..45f2907645e 100755 --- a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab3.sh +++ b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab3.sh @@ -128,7 +128,7 @@ if [ $stage -le 17 ]; then num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=20" label_delay=5 diff --git a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab4.sh b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab4.sh index 49462573245..0d92aff5c28 100755 --- a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab4.sh +++ b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab4.sh @@ -128,7 +128,7 @@ if [ $stage -le 17 ]; then num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=20" label_delay=5 diff --git a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab5.sh b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab5.sh index c888d985f5e..4129c00dcb4 100755 --- a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab5.sh +++ b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab5.sh @@ -128,7 +128,7 @@ if [ $stage -le 17 ]; then num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=20" label_delay=5 diff --git a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab6.sh b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab6.sh index e9a045e113a..1cfa50c1aa1 100755 --- a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab6.sh +++ b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab6.sh @@ -128,7 +128,7 @@ if [ $stage -le 17 ]; then num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=20" label_delay=5 diff --git a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab7.sh b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab7.sh index ce192a91665..ba8ac1e0373 100755 --- a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab7.sh +++ b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab7.sh @@ -129,7 +129,7 @@ if [ $stage -le 17 ]; then num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=20 
dropout-proportion=0.0" label_delay=5 diff --git a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab8.sh b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab8.sh index 3fc0ef2206c..5de285e080e 100755 --- a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab8.sh +++ b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab8.sh @@ -129,7 +129,7 @@ if [ $stage -le 17 ]; then num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=20 dropout-proportion=0.0 " label_delay=5 diff --git a/egs/babel/s5d/local/lexicon/make_unicode_lexicon.py b/egs/babel/s5d/local/lexicon/make_unicode_lexicon.py index 68280762597..91419f6e920 100755 --- a/egs/babel/s5d/local/lexicon/make_unicode_lexicon.py +++ b/egs/babel/s5d/local/lexicon/make_unicode_lexicon.py @@ -106,6 +106,7 @@ # Import Statements from __future__ import print_function +from __future__ import division import codecs import argparse import unicodedata @@ -340,7 +341,7 @@ def encode(unicode_transcription, tag_percentage, log=False): int2graph = {v: k for k, v in graph2int.items()} graph_list_int = [graph2int[g] for g in graph_list] bin_edges = range(0, len(int2graph.keys()) + 1) - graph_counts = np.histogram(graph_list_int, bins=bin_edges)[0] / float(len(graph_list_int)) + graph_counts = np.histogram(graph_list_int, bins=bin_edges)[0]/float(len(graph_list_int)) # Set count threshold to frequency that tags the bottom 10% of graphemes bottom_idx = int(np.floor(tag_percentage * len(graph_counts))) count_thresh = sorted(graph_counts)[bottom_idx] @@ -465,7 +466,7 @@ def encode(unicode_transcription, tag_percentage, log=False): for g_dict in table: g_map = "" map_number = 0 - for g_field, g_val in sorted(g_dict.iteritems()): + for g_field, g_val in sorted(g_dict.items()): if(g_field == ("MAP" + str(map_number))): g_map = g_map + g_val + " " map_number = map_number + 1 @@ -561,7 +562,7 @@ def write_table(table, outfile): # Start writing to output with codecs.open(outfile, "w", "utf-8") as fo: # Get header names - header_names = sorted(set().union(*[d.keys() for d in table])) + header_names = sorted(set().union(*[list(d.keys()) for d in table])) # Write headers for h in header_names[:-1]: fo.write("%s\t" % h) @@ -595,7 +596,7 @@ def write_map(grapheme_map, mapfile): ''' with codecs.open(mapfile, 'w', encoding='utf-8') as f: - for g, g_map in grapheme_map.iteritems(): + for g, g_map in grapheme_map.items(): print(g, g_map, file=f) @@ -613,14 +614,14 @@ def write_lexicon(baseforms, encoded_transcription, outfile, sil_lex=None, with codecs.open(outfile, "w", "utf-8") as f: # First write the non-speech words try: - for w in sil_lex.iterkeys(): + for w in sil_lex.keys(): f.write("%s\t%s\n" % (w, sil_lex[w])) except AttributeError: pass # Then write extra-speech words try: - for w in extra_lex.iterkeys(): + for w in extra_lex.keys(): f.write("%s\t%s\n" % (w, extra_lex[w])) except AttributeError: pass @@ -629,9 +630,9 @@ def write_lexicon(baseforms, encoded_transcription, outfile, sil_lex=None, for idx, w in enumerate(baseforms): # This is really just for BABEL in case is written as a word if(w[0].lower() == ""): - f.write("%s\t\n" % (unicode(w[0]))) + f.write("%s\t\n" % (w[0])) else: - f.write("%s\t%s\n" % (unicode(w[0]), + f.write("%s\t%s\n" % (w[0], encoded_transcription[idx])) if __name__ == "__main__": diff --git 
a/egs/babel/s5d/local/lexicon/make_word_list.py b/egs/babel/s5d/local/lexicon/make_word_list.py index 9a9e17f6c60..c1473b8ced8 100755 --- a/egs/babel/s5d/local/lexicon/make_word_list.py +++ b/egs/babel/s5d/local/lexicon/make_word_list.py @@ -85,7 +85,7 @@ def main(): # Print the word list with codecs.open(args.word_list, "w", encoding="utf-8") as f: for word, count in words: - f.write("%d %s\n" % (count, unicode(word))) + f.write("%d %s\n" % (count, word)) if args.misprons is not None: with codecs.open(args.misprons, "w", encoding="utf-8") as f: diff --git a/egs/babel/s5d/local/make_L_align.sh b/egs/babel/s5d/local/make_L_align.sh index 50e46a00493..41e9ff32958 100755 --- a/egs/babel/s5d/local/make_L_align.sh +++ b/egs/babel/s5d/local/make_L_align.sh @@ -34,18 +34,24 @@ tmpdir=$1 dir=$2 outdir=$3 +for f in $dir/phones/optional_silence.txt $dir/phones.txt $dir/words.txt ; do + [ ! -f $f ] && echo "$0: The file $f must exist!" exit 1 +fi + silphone=`cat $dir/phones/optional_silence.txt` || exit 1; +if [ ! -f $tmpdir/lexicon.txt ] && [ ! -f $tmpdir/lexiconp.txt ] ; then + echo "$0: At least one of the files $tmpdir/lexicon.txt or $tmpdir/lexiconp.txt must exist" >&2 + exit 1 +fi + # Create lexicon with alignment info if [ -f $tmpdir/lexicon.txt ] ; then cat $tmpdir/lexicon.txt | \ awk '{printf("%s #1 ", $1); for (n=2; n <= NF; n++) { printf("%s ", $n); } print "#2"; }' -elif [ -f $tmpdir/lexiconp.txt ] ; then +else cat $tmpdir/lexiconp.txt | \ awk '{printf("%s #1 ", $1); for (n=3; n <= NF; n++) { printf("%s ", $n); } print "#2"; }' -else - echo "Neither $tmpdir/lexicon.txt nor $tmpdir/lexiconp.txt does not exist" - exit 1 fi | utils/make_lexicon_fst.pl - 0.5 $silphone | \ fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \ --keep_isymbols=false --keep_osymbols=false | \ diff --git a/egs/babel/s5d/local/prepare_unicode_lexicon.py b/egs/babel/s5d/local/prepare_unicode_lexicon.py index 86fa4d60ba1..3b9dc1abd86 100755 --- a/egs/babel/s5d/local/prepare_unicode_lexicon.py +++ b/egs/babel/s5d/local/prepare_unicode_lexicon.py @@ -89,7 +89,7 @@ def extract_phonemes(lexicon): # Read all baseform units into dictionary with {a: [a, a_1, a_2], # b: [b_1, b_3], ...} phonemes_dict = {} - for word, pron in lexicon.iteritems(): + for word, pron in lexicon.items(): for p in pron.split(): try: base = p.split("_",1)[0] @@ -98,11 +98,11 @@ def extract_phonemes(lexicon): phonemes_dict[base] = [p] # Makes sure there are no repeats in the list - phonemes_dict = {k: set(v) for k, v in phonemes_dict.iteritems()} + phonemes_dict = {k: set(v) for k, v in phonemes_dict.items()} # Get all unique phonemes phonemes = [] - for v in phonemes_dict.itervalues(): + for v in phonemes_dict.values(): for p in v: phonemes.append(p) @@ -137,11 +137,11 @@ def write_extra_questions(nonsil_phonemes, nonsil_phonemes_dict, # Write all possible phone_tag combinations that occur in the lexicon for tag in tags: - for p in nonsil_phonemes_dict.iterkeys(): + for p in nonsil_phonemes_dict.keys(): tagged_phoneme = "_".join([p, tag]) if(tagged_phoneme in nonsil_phonemes_dict[p]): fp.write("%s " % tagged_phoneme) - for p in sil_phonemes_dict.iterkeys(): + for p in sil_phonemes_dict.keys(): tagged_phoneme = "_".join([p, tag]) if(tagged_phoneme in sil_phonemes_dict[p]): fp.write("%s " % tagged_phoneme) diff --git a/egs/babel/s5d/local/resegment/segmentation.py b/egs/babel/s5d/local/resegment/segmentation.py index 7c5c8665a16..02fd7646b96 100755 --- a/egs/babel/s5d/local/resegment/segmentation.py +++ 
b/egs/babel/s5d/local/resegment/segmentation.py @@ -3,6 +3,7 @@ # Copyright 2014 Vimal Manohar # Apache 2.0 +from __future__ import division import os, glob, argparse, sys, re, time from argparse import ArgumentParser @@ -19,12 +20,12 @@ def mean(l): if len(l) > 0: - return float(sum(l)) / len(l) + return float(sum(l))/len(l) return 0 # Analysis class # Stores statistics like the confusion matrix, length of the segments etc. -class Analysis: +class Analysis(object): def __init__(self, file_id, frame_shift, prefix): self.confusion_matrix = [0] * 9 self.type_counts = [ [[] for j in range(0,9)] for i in range(0,3) ] @@ -274,8 +275,8 @@ def read_rttm_file(rttm_file, temp_dir, frame_shift): i = len(this_file) category = splits[6] word = splits[5] - start_time = int(float(splits[3])/frame_shift + 0.5) - duration = int(float(splits[4])/frame_shift + 0.5) + start_time = int((float(splits[3])/frame_shift) + 0.5) + duration = int((float(splits[4])/frame_shift) + 0.5) if i < start_time: this_file.extend(["0"]*(start_time - i)) if type1 == "NON-LEX": @@ -295,7 +296,7 @@ def read_rttm_file(rttm_file, temp_dir, frame_shift): # Stats class to store some basic stats about the number of # times the post-processor goes through particular loops or blocks # of code in the algorithm. This is just for debugging. -class Stats: +class Stats(object): def __init__(self): self.inter_utt_nonspeech = 0 self.merge_nonspeech_segment = 0 @@ -321,7 +322,7 @@ def reset(self): self.noise_only = 0 # Timer class to time functions -class Timer: +class Timer(object): def __enter__(self): self.start = time.clock() return self @@ -332,7 +333,7 @@ def __exit__(self, *args): # The main class for post-processing a file. # This does the segmentation either looking at the file isolated # or by looking at both classes simultaneously -class JointResegmenter: +class JointResegmenter(object): def __init__(self, P, A, f, options, phone_map, stats = None, reference = None): # Pointers to prediction arrays and Initialization @@ -351,8 +352,8 @@ def __init__(self, P, A, f, options, phone_map, stats = None, reference = None): self.frame_shift = options.frame_shift # Convert length in seconds to frames - self.max_frames = int(options.max_segment_length / options.frame_shift) - self.hard_max_frames = int(options.hard_max_segment_length / options.frame_shift) + self.max_frames = int(options.max_segment_length/options.frame_shift) + self.hard_max_frames = int(options.hard_max_segment_length/options.frame_shift) self.min_inter_utt_nonspeech_length = int(options.min_inter_utt_silence_length / options.frame_shift) if ( options.remove_noise_only_segments == "false" ): self.remove_noise_segments = False @@ -540,7 +541,7 @@ def set_nonspeech_proportion(self): # Set the number of non-speech frames to be added depending on the # silence proportion. The target number of frames in the segments # is computed as below: - target_segment_frames = int(num_speech_frames / (1.0 - self.options.silence_proportion)) + target_segment_frames = int(num_speech_frames/(1.0 - self.options.silence_proportion)) # The number of frames currently in the segments num_segment_frames = num_speech_frames @@ -599,7 +600,7 @@ def set_nonspeech_proportion(self): if not changed: # avoid an infinite loop. if no changes, then break. 
break if num_segment_frames < target_segment_frames: - proportion = float(num_segment_frames - num_speech_frames) / num_segment_frames + proportion = float(num_segment_frames - num_speech_frames)/ num_segment_frames sys.stderr.write("%s: Warning: for recording %s, only got a proportion %f of non-speech frames, versus target %f\n" % (sys.argv[0], self.file_id, proportion, self.options.silence_proportion)) ########################################################################### @@ -863,14 +864,14 @@ def split_long_segments(self): # Count the number of times long segments are split self.stats.split_segments += 1 - num_pieces = int((float(segment_length) / self.hard_max_frames) + 0.99999) + num_pieces = int((float(segment_length)/self.hard_max_frames) + 0.99999) sys.stderr.write("%s: Warning: for recording %s, " \ % (sys.argv[0], self.file_id) \ + "splitting segment of length %f seconds into %d pieces " \ % (segment_length * self.frame_shift, num_pieces) \ + "(--hard-max-segment-length %f)\n" \ % self.options.hard_max_segment_length) - frames_per_piece = int(segment_length / num_pieces) + frames_per_piece = int(segment_length/num_pieces) for i in range(1,num_pieces): q = n + i * frames_per_piece self.S[q] = True @@ -1388,7 +1389,7 @@ def main(): speech_cap = None if options.speech_cap_length != None: - speech_cap = int( options.speech_cap_length / options.frame_shift ) + speech_cap = int(options.speech_cap_length/options.frame_shift) # End if for f in pred_files: @@ -1454,7 +1455,7 @@ def main(): f2 = f3 # End if - if (len(A1) - len(A2)) > options.max_length_diff / options.frame_shift: + if (len(A1) - len(A2)) > options.max_length_diff/options.frame_shift: sys.stderr.write( \ "%s: Warning: Lengths of %s and %s differ by more than %f. " \ % (sys.argv[0], f1,f2, options.max_length_diff) \ diff --git a/egs/babel/s5d/local/syllab/generate_phone_lang.sh b/egs/babel/s5d/local/syllab/generate_phone_lang.sh index fc21a23231b..81d8a0acdc7 100755 --- a/egs/babel/s5d/local/syllab/generate_phone_lang.sh +++ b/egs/babel/s5d/local/syllab/generate_phone_lang.sh @@ -122,8 +122,7 @@ ln -s lex.syllabs2phones.disambig.fst $out/L_disambig.fst echo "Validating the output lang dir" utils/validate_lang.pl $out || exit 1 -sed -i'' 's/#1$//g' $lout/lexicon.txt -sed -i'' 's/#1$//g' $lout/lexiconp.txt +perl -i -pe 's/#1$//g' $lout/lexicon.txt $lout/lexiconp.txt echo "Done OK." exit 0 diff --git a/egs/babel/s5d/local/syllab/generate_syllable_lang.sh b/egs/babel/s5d/local/syllab/generate_syllable_lang.sh index db7b0902425..a7bd667027c 100755 --- a/egs/babel/s5d/local/syllab/generate_syllable_lang.sh +++ b/egs/babel/s5d/local/syllab/generate_syllable_lang.sh @@ -122,8 +122,7 @@ ln -s lex.syllabs2phones.disambig.fst $out/L_disambig.fst echo "Validating the output lang dir" utils/validate_lang.pl $out || exit 1 -sed -i'' 's/#1$//g' $lout/lexicon.txt -sed -i'' 's/#1$//g' $lout/lexiconp.txt +perl -i -pe 's/#1$//g' $lout/lexicon.txt $lout/lexiconp.txt echo "Done OK." exit 0 diff --git a/egs/bentham/README.txt b/egs/bentham/README.txt new file mode 100644 index 00000000000..02870c265f6 --- /dev/null +++ b/egs/bentham/README.txt @@ -0,0 +1,5 @@ +This directory contains example scripts for handwriting recognition on +the Bentham dataset: +http://www.transcriptorium.eu/~htrcontest/contestICFHR2014/public_html/ +In the ICFHR 2014 contest, the best performing system in the unrestricted +track obtained a WER of 8.6%. 
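The babel tuning-script and segmentation.py hunks above are all part of the same Python 2-to-3 cleanup: print becomes a function, / becomes true division, and the iteritems/iterkeys/itervalues dict methods disappear. A minimal, hedged sketch of those patterns, runnable under either interpreter (the numbers are illustrative, echoing the xent_regularize=0.1 and frame arithmetic seen in the hunks):

from __future__ import print_function, division   # both imports are no-ops on Python 3

# The recipes compute the xent learning-rate factor as 0.5 / xent_regularize.
# Wrapping the expression in parentheses makes it a function call on Python 3
# and harmless grouping on Python 2, so the same one-liner works under both.
xent_regularize = 0.1                  # illustrative value matching the scripts above
print(0.5 / xent_regularize)           # -> 5.0 under Python 2 and Python 3

# Dict iteration: iteritems()/iterkeys()/itervalues() exist only on Python 2;
# items()/keys()/values() exist on both, hence the lexicon-script changes.
grapheme_map = {"a": "a a_1", "b": "b_1"}
for g, g_map in sorted(grapheme_map.items()):
    print(g, g_map)

# Frame arithmetic in the style of segmentation.py: the explicit float() keeps
# the round-up behaviour identical whether or not division would truncate.
segment_length, hard_max_frames = 3005, 1000       # illustrative frame counts
num_pieces = int((float(segment_length) / hard_max_frames) + 0.99999)   # -> 4
frames_per_piece = int(segment_length / num_pieces)                     # -> 751
print(num_pieces, frames_per_piece)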
diff --git a/egs/bentham/v1/cmd.sh b/egs/bentham/v1/cmd.sh new file mode 100755 index 00000000000..3c8eb9f93a5 --- /dev/null +++ b/egs/bentham/v1/cmd.sh @@ -0,0 +1,13 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export cmd="queue.pl" diff --git a/egs/bentham/v1/image b/egs/bentham/v1/image new file mode 120000 index 00000000000..6a4b3afeb09 --- /dev/null +++ b/egs/bentham/v1/image @@ -0,0 +1 @@ +../../cifar/v1/image \ No newline at end of file diff --git a/egs/bentham/v1/local/chain/compare_wer.sh b/egs/bentham/v1/local/chain/compare_wer.sh new file mode 100755 index 00000000000..2ce14e13694 --- /dev/null +++ b/egs/bentham/v1/local/chain/compare_wer.sh @@ -0,0 +1,120 @@ +#!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. local/chain/compare_wer.sh exp/chain/cnn{1a,1b} + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora + +if [ $# == 0 ]; then + echo "Usage: $0: [ ... ]" + echo "e.g.: $0 exp/chain/cnn{1a,1b}" + exit 1 +fi +. ./path.sh + +echo "# $0 $*" +used_epochs=false + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +echo -n "# WER " +for x in $*; do + wer=$(cat $x/decode_test/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "# WER (rescored) " +for x in $*; do + wer="--" + [ -d $x/decode_test_rescored ] && wer=$(cat $x/decode_test_rescored/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "# CER " +for x in $*; do + cer=$(cat $x/decode_test/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + +echo -n "# CER (rescored) " +for x in $*; do + cer="--" + [ -d $x/decode_test_rescored ] && cer=$(cat $x/decode_test_rescored/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + +echo -n "# WER val " +for x in $*; do + wer=$(cat $x/decode_val/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "# WER (rescored) val " +for x in $*; do + wer="--" + [ -d $x/decode_val_rescored ] && wer=$(cat $x/decode_val_rescored/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "# CER val " +for x in $*; do + cer=$(cat $x/decode_val/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + +echo -n "# CER (rescored) val " +for x in $*; do + cer="--" + [ -d $x/decode_val_rescored ] && cer=$(cat $x/decode_val_rescored/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. 
+fi + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train prob (xent) " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob (xent) " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Parameters " +for x in $*; do + params=$(nnet3-info $x/final.mdl 2>/dev/null | grep num-parameters | cut -d' ' -f2 | awk '{printf "%0.2fM\n",$1/1000000}') + printf "% 10s" $params +done +echo diff --git a/egs/bentham/v1/local/chain/run_cnn_e2eali.sh b/egs/bentham/v1/local/chain/run_cnn_e2eali.sh new file mode 120000 index 00000000000..e2545b0186e --- /dev/null +++ b/egs/bentham/v1/local/chain/run_cnn_e2eali.sh @@ -0,0 +1 @@ +tuning/run_cnn_e2eali_1a.sh \ No newline at end of file diff --git a/egs/bentham/v1/local/chain/run_e2e_cnn.sh b/egs/bentham/v1/local/chain/run_e2e_cnn.sh new file mode 120000 index 00000000000..d26ba0182ce --- /dev/null +++ b/egs/bentham/v1/local/chain/run_e2e_cnn.sh @@ -0,0 +1 @@ +tuning/run_e2e_cnn_1a.sh \ No newline at end of file diff --git a/egs/bentham/v1/local/chain/tuning/run_cnn_e2eali_1a.sh b/egs/bentham/v1/local/chain/tuning/run_cnn_e2eali_1a.sh new file mode 100755 index 00000000000..ec530ef1ce4 --- /dev/null +++ b/egs/bentham/v1/local/chain/tuning/run_cnn_e2eali_1a.sh @@ -0,0 +1,261 @@ +#!/bin/bash + +# local/chain/compare_wer.sh exp/chain/e2e_cnn_1a/ exp/chain/cnn_e2eali_1a +# System e2e_cnn_1a cnn_e2eali_1a +# WER 13.72 8.14 +# WER (rescored) 13.40 8.00 +# CER 6.56 2.82 +# CER (rescored) 6.33 2.73 +# WER val 13.51 8.19 +# WER (rescored) val 13.38 7.97 +# CER val 6.40 2.93 +# CER (rescored) val 6.29 2.90 +# Final train prob 0.1037 -0.0613 +# Final valid prob 0.0720 -0.0988 +# Final train prob (xent) -0.3706 +# Final valid prob (xent) -0.4669 +# Parameters 11.54M 4.29M + +# steps/info/chain_dir_info.pl exp/chain/cnn_e2eali_1a +# exp/chain/cnn_e2eali_1a: num-iters=20 nj=3..5 num-params=4.3M dim=40->336 combine=-0.066->-0.066 (over 1) xent:train/valid[12,19,final]=(-0.822,-0.437,-0.371/-0.859,-0.514,-0.467) logprob:train/valid[12,19,final]=(-0.188,-0.078,-0.061/-0.204,-0.114,-0.099) + +set -e -o pipefail + +stage=0 + +nj=30 +train_set=train +decode_val=true +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +e2echain_model_dir=exp/chain/e2e_cnn_1a +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +frame_subsampling_factor=4 +# training chunk-options +chunk_width=340,300,200,100 +num_leaves=500 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +tdnn_dim=550 +# training options +srand=0 +remove_egs=true +lang_decode=data/lang +lang_rescore=data/lang_rescore_6g +if $decode_val; then maybe_val=val; else maybe_val= ; fi +dropout_schedule='0,0@0.20,0.2@0.50,0' +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. 
./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --acoustic-scale 1.0 \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + ${train_data_dir} data/lang $e2echain_model_dir $lat_dir + echo "" >$lat_dir/splice_opts +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves ${train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + cnn_opts="l2-regularize=0.03 dropout-proportion=0.0" + tdnn_opts="l2-regularize=0.03" + output_opts="l2-regularize=0.04" + common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + + conv-relu-batchnorm-dropout-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-dropout-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-dropout-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-dropout-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-dropout-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common3 height-subsample-out=2 + conv-relu-batchnorm-dropout-layer name=cnn6 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + relu-batchnorm-dropout-layer name=tdnn1 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + relu-batchnorm-dropout-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + relu-batchnorm-dropout-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=true \ + --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=1000" \ + --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.alignment-subsampling-factor=1 \ + --chain.left-tolerance 3 \ + --chain.right-tolerance 3 \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=5 \ + --trainer.frames-per-iter=1500000 \ + --trainer.optimization.num-jobs-initial=3 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=32,16 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0 --constrained false" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. 
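A hedged sketch of the compatibility the comment above alludes to: before handing a different lang directory to mkgraph.sh, one could compare its phone symbol table against the one the model was trained with. The alternative directory path below is hypothetical and the recipe itself never runs this check.

#!/usr/bin/env python3
# Sketch only: "data/lang_other" is a hypothetical swap-in lang directory.
import sys

def read_symbol_table(path):
    # Kaldi symbol tables (phones.txt, words.txt) are "symbol integer-id" per line.
    table = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            sym, idx = line.split()
            table[sym] = int(idx)
    return table

ref = read_symbol_table("data/lang/phones.txt")
alt = read_symbol_table("data/lang_other/phones.txt")

if ref != alt:
    sys.exit("phones.txt mismatch: this lang dir cannot be given to mkgraph.sh here")
print("phone symbol tables match (%d symbols)" % len(ref))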
+ + utils/mkgraph.sh \ + --self-loop-scale 1.0 $lang_decode \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + for decode_set in test $maybe_val; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/$decode_set $dir/decode_$decode_set || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/$decode_set $dir/decode_${decode_set}{,_rescored} || exit 1 + done +fi + + +echo "Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/bentham/v1/local/chain/tuning/run_e2e_cnn_1a.sh b/egs/bentham/v1/local/chain/tuning/run_e2e_cnn_1a.sh new file mode 100755 index 00000000000..716bdce3729 --- /dev/null +++ b/egs/bentham/v1/local/chain/tuning/run_e2e_cnn_1a.sh @@ -0,0 +1,166 @@ +#!/bin/bash +# Copyright 2017 Hossein Hadian + +# This script does end2end chain training (i.e. from scratch) +# ./local/chain/compare_wer.sh exp/chain/e2e_cnn_1a/ +# System e2e_cnn_1b +# WER 13.72 +# WER (rescored) 13.40 +# CER 6.56 +# CER (rescored) 6.33 +# WER val 13.51 +# WER (rescored) val 13.38 +# CER val 6.40 +# CER (rescored) val 6.29 +# Final train prob 0.1037 +# Final valid prob 0.0720 +# Final train prob (xent) +# Final valid prob (xent) +# Parameters 11.54M +# steps/info/chain_dir_info.pl exp/chain/e2e_cnn_1a +# exp/chain/e2e_cnn_1a: num-iters=26 nj=2..4 num-params=11.5M dim=40->17112 combine=0.054->0.054 (over 1) logprob:train/valid[16,25,final]=(0.078,0.102,0.104/0.051,0.069,0.072) +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +affix=1a +nj=30 + +# training options +tdnn_dim=450 +minibatch_size=150=100,64/300=50,32/600=25,16/1200=16,8 +common_egs_dir= +train_set=train +decode_val=true +lang_decode=data/lang +lang_rescore=data/lang_rescore_6g +if $decode_val; then maybe_val=val; else maybe_val= ; fi +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 1 ]; then + steps/nnet3/chain/e2e/prepare_e2e.sh --nj 30 --cmd "$cmd" \ + --shared-phones true \ + --type biphone \ + data/$train_set $lang $treedir + $cmd $treedir/log/make_phone_lm.log \ + cat data/$train_set/text \| \ + steps/nnet3/chain/e2e/text_to_phones.py data/lang \| \ + utils/sym2int.pl -f 2- data/lang/phones.txt \| \ + chain-est-phone-lm --num-extra-lm-states=500 \ + ark:- $treedir/phone_lm.fst +fi + +if [ $stage -le 2 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') + common1="height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="height-offsets=-2,-1,0,1,2 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn4 input=Append(-4,0,4) dim=$tdnn_dim + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $output_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs +fi + +if [ $stage -le 3 ]; then + # no need to store the egs in a shared storage because we always + # remove them. Anyway, it takes only 5 minutes to generate them. + + steps/nnet3/chain/e2e/train_e2e.py --stage $train_stage \ + --cmd "$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--num_egs_diagnostic 100 --num_utts_subset 400" \ + --chain.frame-subsampling-factor 4 \ + --chain.alignment-subsampling-factor 4 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1000000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.momentum 0 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 4 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.shrink-value 1.0 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir data/${train_set} \ + --tree-dir $treedir \ + --dir $dir || exit 1; +fi + +if [ $stage -le 4 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. 
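Back-of-the-envelope bookkeeping for the e2e xconfig above, as a sketch only, assuming the usual nnet3 convention that a convolutional layer's output dimension is height-out times num-filters-out:

layers = [
    # (name, height_in, height_out, num_filters_out), taken from the xconfig above
    ("cnn1", 40, 40, 36),
    ("cnn2", 40, 20, 36),   # height-subsample-out=2 halves the 40 input rows
    ("cnn3", 20, 20, 70),
    ("cnn4", 20, 10, 70),   # second height subsampling: 20 -> 10 rows
]
for name, h_in, h_out, nfilt in layers:
    print("%s: height %d -> %d, %d filters, output dim %d"
          % (name, h_in, h_out, nfilt, h_out * nfilt))
# cnn4 hands 10 * 70 = 700 dimensions to tdnn1, whose Append(-4,-2,0,2,4)
# splicing sees 5 * 700 = 3500 inputs before the 450-dim TDNN layer.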
+ + utils/mkgraph.sh \ + --self-loop-scale 1.0 $lang_decode \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 5 ]; then + for decode_set in test $maybe_val; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/$decode_set $dir/decode_$decode_set || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/$decode_set $dir/decode_${decode_set}{,_rescored} || exit 1 + done +fi + +echo "Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/bentham/v1/local/check_tools.sh b/egs/bentham/v1/local/check_tools.sh new file mode 100755 index 00000000000..5b4d3107d3b --- /dev/null +++ b/egs/bentham/v1/local/check_tools.sh @@ -0,0 +1,43 @@ +#!/bin/bash -u + +# Copyright 2015 (c) Johns Hopkins University (Jan Trmal ) + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + +[ -f ./path.sh ] && . ./path.sh +set +e + +command -v python3 >&/dev/null \ + || { echo >&2 "python3 not found on PATH. You will have to install Python3, preferably >= 3.6"; exit 1; } + +python3 -c "import numpy" +if [ $? -ne 0 ] ; then + echo >&2 "This recipe needs numpy installed." + exit 1 +fi + +python3 -c "import scipy" +if [ $? -ne 0 ] ; then + echo >&2 "This recipe needs scipy installed." + exit 1 +fi + +python3 -c "import scipy.misc; scipy.misc.__dict__['imread']" +if [ $? -ne 0 ] ; then + echo >&2 "This recipe needs scipy-image and Pillow installed." + exit 1 +fi + + +exit 0 diff --git a/egs/bentham/v1/local/create_splits.sh b/egs/bentham/v1/local/create_splits.sh new file mode 100755 index 00000000000..e8ea2279a49 --- /dev/null +++ b/egs/bentham/v1/local/create_splits.sh @@ -0,0 +1,37 @@ +#!/bin/bash +# Copyright 2018 Desh Raj (Johns Hopkins University) + +# This script reads the extracted Bentham database files and creates +# the following files (for all the data subsets): +# text, utt2spk, images.scp. 
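The three per-split files the comment above lists all follow the same one-record-per-line Kaldi convention. A small hedged sketch of that layout (the utterance IDs, speaker IDs, paths, and transcripts below are invented; only the file formats mirror what the recipe expects):

records = [
    # (utterance id, speaker id, line-image path, transcript) -- all made up
    ("071_080_001_01_01", "071_080_001", "lines/071_080_001_01_01.png", "the penal code"),
    ("071_080_001_01_02", "071_080_001", "lines/071_080_001_01_02.png", "of the republic"),
]

with open("text", "w", encoding="utf-8") as text, \
     open("utt2spk", "w", encoding="utf-8") as utt2spk, \
     open("images.scp", "w", encoding="utf-8") as scp:
    for utt, spk, img, trans in records:
        text.write("%s %s\n" % (utt, trans))       # utt-id followed by the transcript
        utt2spk.write("%s %s\n" % (utt, spk))      # utt-id followed by its speaker id
        scp.write("%s %s\n" % (utt, img))          # utt-id followed by the image path
# spk2utt is then derived from utt2spk with utils/utt2spk_to_spk2utt.pl,
# as the script does for each split.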
+ +download_dir=$1 +save_dir=$2 +mkdir -p $save_dir/{train,val,test} +touch $save_dir/{train,val,test}/{text,images.scp,utt2spk,spk2utt} + +partition_dir=$download_dir"/gt/Partitions/" +lines_dir=$download_dir"/gt/Images/Lines/" +text_dir=$download_dir"/gt/Transcriptions/" + +function split { + echo "Creating $1 split" + split_dir=$save_dir/$1 + line_file=$partition_dir/$2 + + while read -r line; do + name="$line" + spkid=${name:0:11} + echo -n $name" " | cat - $text_dir/$name* >> $split_dir/text + echo >> $split_dir/text + echo $name $lines_dir"/"$name".png" >> $split_dir/images.scp + echo $name $spkid >> $split_dir/utt2spk + done < "$line_file" + + perl -i -ne 'print if /\S/' $split_dir/images.scp $split_dir/text $split_dir/utt2spk + utils/utt2spk_to_spk2utt.pl $split_dir/utt2spk > $split_dir/spk2utt +} + +split train TrainLines.lst +split val ValidationLines.lst +split test TestLines.lst diff --git a/egs/bentham/v1/local/download_bentham_text.sh b/egs/bentham/v1/local/download_bentham_text.sh new file mode 100755 index 00000000000..e09403718a1 --- /dev/null +++ b/egs/bentham/v1/local/download_bentham_text.sh @@ -0,0 +1,32 @@ +#!/bin/bash +# Copyright 2018 Desh Raj +# Apache 2.0 + +## Download all written works of Jeremy Bentham for the Bentham HWR task LM training + +baseurl='http://oll.libertyfund.org/titles/' +savedir=$1 + +mkdir -p $savedir + +declare -a texts=("bentham-the-works-of-jeremy-bentham-vol-1/simple" + "bentham-the-works-of-jeremy-bentham-vol-2/simple" + "bentham-the-works-of-jeremy-bentham-vol-3/simple" + "bentham-the-works-of-jeremy-bentham-vol-5-scotch-reform-real-property-codification-petitions/simple" + "bentham-the-works-of-jeremy-bentham-vol-6/simple" + "bentham-the-works-of-jeremy-bentham-vol-7-rationale-of-judicial-evidence-part-2/simple" + "bentham-the-works-of-jeremy-bentham-vol-8/simple" + "bentham-the-works-of-jeremy-bentham-vol-9-constitutional-code" + "bentham-the-works-of-jeremy-bentham-vol-10-memoirs-part-i-and-correspondence/simple" + "bentham-the-works-of-jeremy-bentham-vol-11-memoirs-of-bentham-part-ii-and-analytical-index") + +counter=1 +for i in "${texts[@]}" +do + echo "Downloading $baseurl$i" + curl -s -N {$baseurl}{$i} | sed -e 's/<[^>]*>//g' > $savedir"/bentham"$counter".txt" + ((counter++)) +done + +cat $savedir"/*.txt" > $savedir"/complete.txt" +rm $savedir"/bentham*.txt" diff --git a/egs/bentham/v1/local/extract_features.sh b/egs/bentham/v1/local/extract_features.sh new file mode 100755 index 00000000000..460e467e99c --- /dev/null +++ b/egs/bentham/v1/local/extract_features.sh @@ -0,0 +1,49 @@ +#!/bin/bash + +# Copyright 2017 Yiwen Shao +# 2018 Ashish Arora + +# Apache 2.0 +# This script runs the make features script in parallel. + +nj=4 +cmd=run.pl +feat_dim=40 +augment='no_aug' +fliplr=false +echo "$0 $@" + +. ./cmd.sh +. ./path.sh +. 
./utils/parse_options.sh || exit 1; + +data=$1 +featdir=$data/data +scp=$data/images.scp +logdir=$data/log + +mkdir -p $logdir +mkdir -p $featdir + +# make $featdir an absolute pathname +featdir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $featdir ${PWD}` + +for n in $(seq $nj); do + split_scps="$split_scps $logdir/images.$n.scp" +done + +# split images.scp +utils/split_scp.pl $scp $split_scps || exit 1; + +$cmd JOB=1:$nj $logdir/extract_features.JOB.log \ + image/ocr/make_features.py $logdir/images.JOB.scp \ + --allowed_len_file_path $data/allowed_lengths.txt \ + --num-channels 4 \ + --feat-dim $feat_dim --fliplr $fliplr --augment_type $augment \| \ + copy-feats --compress=true --compression-method=7 \ + ark:- ark,scp:$featdir/images.JOB.ark,$featdir/images.JOB.scp + +## aggregates the output scp's to get feats.scp +for n in $(seq $nj); do + cat $featdir/images.$n.scp || exit 1; +done > $data/feats.scp || exit 1 diff --git a/egs/bentham/v1/local/gen_topo.py b/egs/bentham/v1/local/gen_topo.py new file mode 100755 index 00000000000..af9e20317d8 --- /dev/null +++ b/egs/bentham/v1/local/gen_topo.py @@ -0,0 +1,93 @@ +#!/usr/bin/env python + +# Copyright 2017 (author: Chun-Chieh Chang) + +# Generate a topology file. This allows control of the number of states in the +# non-silence HMMs, and in the silence HMMs. This is a modified version of +# 'utils/gen_topo.pl'. The difference is that this creates two topologies for +# the non-silence HMMs. The number of states for punctuations is different than +# the number of states for other characters. + +from __future__ import print_function +from __future__ import division +import argparse +import string + +parser = argparse.ArgumentParser(description="Usage: steps/nnet3/chain/gen_topo.py " + " " + "e.g.: steps/nnet3/chain/gen_topo.pl 4:5:6:7:8:9:10 1:2:3\n", + epilog="See egs/swbd/s5c/local/chain/train_tdnn_a.sh for example of usage."); +parser.add_argument("num_nonsil_states", type=int, help="number of states for nonsilence phones"); +parser.add_argument("num_sil_states", type=int, help="number of states for silence phones"); +parser.add_argument("num_punctuation_states", type=int, help="number of states for punctuation"); +parser.add_argument("nonsilence_phones", + help="List of non-silence phones as integers, separated by colons, e.g. 4:5:6:7:8:9"); +parser.add_argument("silence_phones", + help="List of silence phones as integers, separated by colons, e.g. 
1:2:3"); +parser.add_argument("phone_list", help="file containing all phones and their corresponding number."); + +args = parser.parse_args() + +silence_phones = [ int(x) for x in args.silence_phones.split(":") ] +nonsilence_phones = [ int(x) for x in args.nonsilence_phones.split(":") ] +all_phones = silence_phones + nonsilence_phones + +punctuation_phones = [] +exclude = set("!(),.?;:'-\"") +with open(args.phone_list) as f: + for line in f: + line = line.strip() + phone = line.split(' ')[0] + if len(phone) == 1 and phone in exclude: + punctuation_phones.append(int(line.split(' ')[1])) +# For nonsilence phones that are not punctuations +print("") +print("") +print("") +print(" ".join([str(x) for x in nonsilence_phones if x not in punctuation_phones])) +print("") +for x in range(0, args.num_nonsil_states): + xp1 = x + 1 + print(" {0} {0} {0} 0.75 {1} 0.25 ".format(x, xp1)) +print(" {} ".format(args.num_nonsil_states)) +print("") + +# For nonsilence phones that ar punctuations +print("") +print("") +print(" ".join([str(x) for x in nonsilence_phones if x in punctuation_phones])) +print("") +for x in range(0, args.num_punctuation_states): + xp1 = x + 1 + print(" {0} {0} {0} 0.75 {1} 0.25 ".format(x, xp1)) +print(" {} ".format(args.num_punctuation_states)) +print("") + +# For silence phones +print("") +print("") +print(" ".join([str(x) for x in silence_phones])) +print("") +if(args.num_sil_states > 1): + transp = 1.0/(args.num_sil_states - 1) + + state_str = " 0 0 " + for x in range(0, (args.num_sil_states - 1)): + state_str = "{} {} {} ".format(state_str, x, transp) + state_str = state_str + "" + print(state_str) + + for x in range(1, (args.num_sil_states - 1)): + state_str = " {0} {0} ".format(x) + for y in range(1, args.num_sil_states): + state_str = "{} {} {} ".format(state_str, y, transp) + state_str = state_str + "" + print(state_str) + second_last = args.num_sil_states - 1 + print(" {0} {0} {0} 0.75 {1} 0.25 ".format(second_last, args.num_sil_states)) + print(" {} ".format(args.num_sil_states)) +else: + print(" 0 0 0 0.75 1 0.25 ") + print(" {} ".format(args.num_sil_states)) +print("") +print("") diff --git a/egs/bentham/v1/local/prepare_data.sh b/egs/bentham/v1/local/prepare_data.sh new file mode 100755 index 00000000000..bbcc9863611 --- /dev/null +++ b/egs/bentham/v1/local/prepare_data.sh @@ -0,0 +1,69 @@ +#!/bin/bash + +# Copyright 2018 Desh Raj (Johns Hopkins University) + +# Apache 2.0 + +# This script downloads the Bentham handwriting database and prepares the training +# and test data (i.e text, images.scp, utt2spk and spk2utt) by calling create_splits.sh. + +# In addition, it downloads data for all texts of Bentham for LM training purpose. + +stage=0 +download_dir=data/local/download/ +database_dir="" +text_corpus_dir="" + +mkdir -p $download_dir + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh || exit 1; + +BENTHAM_IMAGES_URL='http://transcriptorium.eu/~tsdata/BenthamR0/BenthamDatasetR0-Images.zip' +BENTHAM_GT_URL='http://transcriptorium.eu/~tsdata/BenthamR0/BenthamDatasetR0-GT.zip' +bentham_images=$database_dir"/images.zip" +bentham_gt=$database_dir"/gt.zip" +bentham_text=$download_dir"/text" + +# download and extract images and transcriptions +if [ ! -f $bentham_images ]; then + echo "Downloading images and transcriptions to $database_dir" + mkdir -p $database_dir + wget $BENTHAM_IMAGES_URL -O $bentham_images + wget $BENTHAM_GT_URL -O $bentham_gt +else + echo "Not downloading since corpus already exists" +fi + +if [ ! 
-d $download_dir/"gt" ]; then + unzip $bentham_gt -d $download_dir + mv $download_dir"/BenthamDatasetR0-GT" $download_dir"/gt" +else + echo "Local extracted corpus already exists" +fi + +# Download extra Bentham text for LM training +if [ -d $text_corpus_dir ]; then + echo "$0: Not downloading Bentham text corpus as it is already there." +else + local/download_bentham_text.sh $text_corpus_dir +fi + +# Copy extra Bentham text to local +if [ -d $bentham_text ]; then + echo "$0: Not copying as local Bentham already present." +else + mkdir -p $bentham_text + cp $text_corpus_dir/Bentham-Text/* $bentham_text + echo "$0: Done copying extra Bentham text to local." +fi + +# Creating train, val, and test splits for all directories +if [ -d data/train ]; then + echo "Data splits and files already exist. Not creating again." +else + echo "Creating train, val, and test splits and corresponding files.." + local/create_splits.sh $download_dir "data/" +fi + diff --git a/egs/bentham/v1/local/prepare_dict.sh b/egs/bentham/v1/local/prepare_dict.sh new file mode 100755 index 00000000000..22db5ae834d --- /dev/null +++ b/egs/bentham/v1/local/prepare_dict.sh @@ -0,0 +1,26 @@ +#!/usr/bin/env bash + +# Copyright 2017 Hossein Hadian +# 2017 Babak Rekabdar +# 2017 Chun Chieh Chang +# 2017 Ashish Arora + +# This script prepares the dictionary. + +set -e +dir=data/local/dict +. ./utils/parse_options.sh || exit 1; + +mkdir -p $dir + +local/prepare_lexicon.py $dir + +cut -d' ' -f2- $dir/lexicon.txt | sed 's/SIL//g' | tr ' ' '\n' | sort -u | sed '/^$/d' >$dir/nonsilence_phones.txt || exit 1; + +echo ' SIL' >> $dir/lexicon.txt + +echo SIL > $dir/silence_phones.txt + +echo SIL >$dir/optional_silence.txt + +echo -n "" >$dir/extra_questions.txt diff --git a/egs/bentham/v1/local/prepare_lexicon.py b/egs/bentham/v1/local/prepare_lexicon.py new file mode 100755 index 00000000000..3de96056c2a --- /dev/null +++ b/egs/bentham/v1/local/prepare_lexicon.py @@ -0,0 +1,34 @@ +#!/usr/bin/env python3 + +# Copyright 2017 Babak Rekabdar +# 2017 Hossein Hadian +# 2017 Chun Chieh Chang +# 2017 Ashish Arora +# Apache 2.0 + +# This script prepares lexicon for BPE. It gets the set of all words that occur in data/train/text. +# Since this lexicon is based on BPE, it replaces '|' with silence. 
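A hedged worked example of the mapping the comment above describes; the BPE tokens are made up, and the real lexicon is built from the tokens that actually occur in data/train/text:

# Illustrative only: three invented BPE tokens and the lexicon entries they yield.
# Each token is spelled out character by character, with the word-boundary
# marker '|' rewritten as the SIL phone.
for token in ["|the", "|pen", "al"]:
    spelled = " ".join("SIL" if ch == "|" else ch for ch in token)
    print("%s %s" % (token, spelled))
# -> |the SIL t h e
#    |pen SIL p e n
#    al a l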
+ +import argparse +import os + +parser = argparse.ArgumentParser(description="""Creates the list of characters and words in lexicon""") +parser.add_argument('dir', type=str, help='output path') +args = parser.parse_args() + +### main ### +lex = {} +text_path = os.path.join('data', 'train', 'text') +with open(text_path, 'r', encoding='utf-8') as f: + for line in f: + line_vect = line.strip().split(' ') + for i in range(1, len(line_vect)): + characters = list(line_vect[i]) + characters = " ".join([ 'SIL' if char == '|' else char for char in characters]) + characters = list(characters) + characters = "".join([ '' if char == '#' else char for char in characters]) + lex[line_vect[i]] = characters + +with open(os.path.join(args.dir, 'lexicon.txt'), 'w', encoding='utf-8') as fp: + for key in sorted(lex): + fp.write(key + " " + lex[key] + "\n") diff --git a/egs/bentham/v1/local/score.sh b/egs/bentham/v1/local/score.sh new file mode 100755 index 00000000000..1d84815fc69 --- /dev/null +++ b/egs/bentham/v1/local/score.sh @@ -0,0 +1,6 @@ + +#!/bin/bash + + +steps/scoring/score_kaldi_wer.sh "$@" +steps/scoring/score_kaldi_cer.sh --stage 2 "$@" diff --git a/egs/bentham/v1/local/train_lm.sh b/egs/bentham/v1/local/train_lm.sh new file mode 100755 index 00000000000..48632a90769 --- /dev/null +++ b/egs/bentham/v1/local/train_lm.sh @@ -0,0 +1,141 @@ +#!/bin/bash + +# Copyright 2016 Vincent Nguyen +# 2016 Johns Hopkins University (author: Daniel Povey) +# 2017 Ashish Arora +# 2017 Hossein Hadian +# 2018 Desh Raj +# Apache 2.0 +# +# This script trains an LM on the Bentham text corpus and training transcriptions. +# It is based on the example scripts distributed with PocoLM + +# It will check if pocolm is installed and if not will proceed with installation + +set -e +stage=0 +vocab_size=50000 + +echo "$0 $@" # Print the command line for logging +. ./utils/parse_options.sh || exit 1; + +dir=data/local/local_lm +lm_dir=${dir}/data +bentham_text_dir=data/local/download/text/ + +mkdir -p $dir +. ./path.sh || exit 1; # for KALDI_ROOT +export PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH +( # First make sure the pocolm toolkit is installed. + cd $KALDI_ROOT/tools || exit 1; + if [ -d pocolm ]; then + echo Not installing the pocolm toolkit since it is already there. + else + echo "$0: Please install the PocoLM toolkit with: " + echo " cd ../../../tools; extras/install_pocolm.sh; cd -" + exit 1; + fi +) || exit 1; + +bypass_metaparam_optim_opt= +# If you want to bypass the metaparameter optimization steps with specific metaparameters +# un-comment the following line, and change the numbers to some appropriate values. +# You can find the values from output log of train_lm.py. +# These example numbers of metaparameters is for 4-gram model (with min-counts) +# running with train_lm.py. +# The dev perplexity should be close to the non-bypassed model. 
+#bypass_metaparam_optim_opt= +# Note: to use these example parameters, you may need to remove the .done files +# to make sure the make_lm_dir.py be called and tain only 3-gram model +#for order in 3; do +#rm -f ${lm_dir}/${num_word}_${order}.pocolm/.done + +if [ $stage -le 0 ]; then + mkdir -p ${dir}/data + mkdir -p ${dir}/data/text + + echo "$0: Getting the Data sources" + + rm ${dir}/data/text/* 2>/dev/null || true + + # Using Bentham text with last 5000 lines for dev + + cat $bentham_text_dir/complete.txt | \ + sed '/^\s*$/d' | \ + utils/lang/bpe/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ + | sed 's/@@//g' > ${dir}/bentham.txt + tail -n +5000 ${dir}/bentham.txt > ${dir}/data/text/bentham.txt + + # use the validation data as the dev set. + # Note: the name 'dev' is treated specially by pocolm, it automatically + # becomes the dev set. + head -5000 ${dir}/bentham.txt > ${dir}/data/text/dev.txt + + # use the training data as an additional data source. + # we can later fold the dev data into this. + cat data/train/text | cut -d " " -f 2- > ${dir}/data/text/hwr.txt + + # for reporting perplexities, we'll use the "real" dev set. + # (the validation data is used as ${dir}/data/text/dev.txt to work + # out interpolation weights.) + # note, we can't put it in ${dir}/data/text/, because then pocolm would use + # it as one of the data sources. + cut -d " " -f 2- < data/val/text > ${dir}/data/real_dev_set.txt + + # get the wordlist from Bentham text + cat ${dir}/data/text/{bentham,hwr}.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count + head -n $vocab_size ${dir}/data/word_count | awk '{print $2}' > ${dir}/data/wordlist +fi + +order=6 + +if [ $stage -le 1 ]; then + # decide on the vocabulary. + # Note: you'd use --wordlist if you had a previously determined word-list + # that you wanted to use. + # Note: if you have more than one order, use a certain amount of words as the + # vocab and want to restrict max memory for 'sort', + echo "$0: training the unpruned LM" + min_counts='bentham=1 hwr=1' + wordlist=${dir}/data/wordlist + + lm_name="`basename ${wordlist}`_${order}" + if [ -n "${min_counts}" ]; then + lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`" + fi + unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm + + train_lm.py --wordlist=${wordlist} --num-splits=10 --warm-start-ratio=20 \ + --limit-unk-history=true \ + ${bypass_metaparam_optim_opt} \ + ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir} + + mkdir -p ${dir}/data/arpa + format_arpa_lm.py ${unpruned_lm_dir} | gzip -c > ${dir}/data/arpa/${order}gram_unpruned.arpa.gz + + get_data_prob.py ${dir}/data/real_dev_set.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity' +fi + +if [ $stage -le 2 ]; then + echo "$0: pruning the LM (to larger size)" + # Using 1 million n-grams for a big LM for rescoring purposes. + size=1000000 + prune_lm_dir.py --target-num-ngrams=$size --initial-threshold=0.02 ${unpruned_lm_dir} ${dir}/data/lm_${order}_prune_big + + get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_big 2>&1 | grep -F '[perplexity' + + mkdir -p ${dir}/data/arpa + format_arpa_lm.py ${dir}/data/lm_${order}_prune_big | gzip -c > ${dir}/data/arpa/${order}gram_big.arpa.gz +fi + +if [ $stage -le 3 ]; then + echo "$0: pruning the LM (to smaller size)" + # Using 500,000 n-grams for a smaller LM for graph building. Prune from the + # bigger-pruned LM, it'll be faster. 
+ size=500000 + prune_lm_dir.py --target-num-ngrams=$size ${dir}/data/lm_${order}_prune_big ${dir}/data/lm_${order}_prune_small + + get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_small 2>&1 | grep -F '[perplexity' + + format_arpa_lm.py ${dir}/data/lm_${order}_prune_small | gzip -c > ${dir}/data/arpa/${order}gram_small.arpa.gz +fi diff --git a/egs/bentham/v1/local/wer_output_filter b/egs/bentham/v1/local/wer_output_filter new file mode 100755 index 00000000000..24691a160a9 --- /dev/null +++ b/egs/bentham/v1/local/wer_output_filter @@ -0,0 +1,31 @@ +#!/usr/bin/env python3 +# Copyright 2017 Hossein Hadian + +# This is a filter used in scoring. It separates all +# punctuations from words. For e.g. this sentence: + +# "They have come!" he said reverently, gripping his +# hands. "Isn't it a glorious thing! Long awaited." + +# is converted to this: + +# " They have come ! " he said reverently , gripping his +# hands . " Isn ' t it a glorious thing ! Long awaited . " + +# Sample BPE-based output: +# |He |ro se |from |his |b re ak f as t - s ch oo l |b en ch + +import sys +import re + +punctuations = "!(),.?;:'-\"" +escaped_punctuations = re.escape(punctuations) + +for line in sys.stdin: + words = line.strip().split() + uttid = words[0] + transcript = ''.join(words[1:]) + transcript = transcript.replace('|', ' ') + split_transcript = " ".join(re.split("([{}])".format(escaped_punctuations), + transcript)).strip() + print("{} {}".format(uttid, split_transcript)) diff --git a/egs/bentham/v1/path.sh b/egs/bentham/v1/path.sh new file mode 100755 index 00000000000..2d17b17a84a --- /dev/null +++ b/egs/bentham/v1/path.sh @@ -0,0 +1,6 @@ +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C diff --git a/egs/bentham/v1/run_end2end.sh b/egs/bentham/v1/run_end2end.sh new file mode 100755 index 00000000000..63c034e41f6 --- /dev/null +++ b/egs/bentham/v1/run_end2end.sh @@ -0,0 +1,121 @@ +#!/bin/bash +# Copyright 2018 Ashish Arora (Johns Hopkins University) +# 2018 Desh Raj (Johns Hopkins University) + +set -e +stage=0 +nj=20 +# bentham_hwr_database points to the official database path on the JHU grid. If you have not +# already downloaded the data, you will have to first download it and then name the Images +# and Ground Truth zipped files as images.zip and gt.zip. Then, point the path below to the +# location where your zipped files are present on the grid. +bentham_hwr_database=/export/corpora5/handwriting_ocr/hwr1/ICDAR-HTR-Competition-2015 +# bentham_text_database points to the database path on the JHU grid. +# It contains all of the written works of Bentham, and can be used to train +# an LM for the HWR task. We have provided a script which downloads the data +# and saves it to the location provided below. +bentham_text_corpus=/export/corpora5/handwriting_ocr/hwr1/ICDAR-HTR-Competition-2015/Bentham-Text + +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. +. ./path.sh +. ./utils/parse_options.sh # e.g. this parses the above options + # if supplied. + + +./local/check_tools.sh + +if [ $stage -le 0 ]; then + echo "$0: Preparing data..." 
+ local/prepare_data.sh --database-dir $bentham_hwr_database \ + --text-corpus-dir $bentham_text_corpus +fi + +if [ $stage -le 1 ]; then + image/get_image2num_frames.py data/train # This will be needed for the next command + # The next command creates a "allowed_lengths.txt" file in data/train + # which will be used by local/make_features.py to enforce the images to + # have allowed lengths. The allowed lengths will be spaced by 10% difference in length. + image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train + echo "$(date) Extracting features, creating feats.scp file" + for dataset in train val test; do + local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim 40 data/$dataset + steps/compute_cmvn_stats.sh data/$dataset + done + utils/fix_data_dir.sh data/train +fi + +if [ $stage -le 2 ]; then + echo "$0: Preparing BPE..." + # getting non-silence phones. + cut -d' ' -f2- data/train/text | \ +python3 <( +cat << "END" +import os, sys, io; +infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8'); +output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8'); +phone_dict = dict(); +for line in infile: + line_vect = line.strip().split(); + for word in line_vect: + for phone in word: + phone_dict[phone] = phone; +for phone in phone_dict.keys(): + output.write(phone+ '\n'); +END + ) > data/local/phones.txt + + cut -d' ' -f2- data/train/text > data/local/train_data.txt + cat data/local/phones.txt data/local/train_data.txt | \ + utils/lang/bpe/prepend_words.py | \ + utils/lang/bpe/learn_bpe.py -s 700 > data/local/bpe.txt + for set in test train val; do + cut -d' ' -f1 data/$set/text > data/$set/ids + cut -d' ' -f2- data/$set/text | \ + utils/lang/bpe/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ + | sed 's/@@//g' > data/$set/bpe_text + mv data/$set/text data/$set/text.old + paste -d' ' data/$set/ids data/$set/bpe_text > data/$set/text + done +fi + +if [ $stage -le 3 ]; then + echo "$0: Estimating a language model for decoding..." + local/train_lm.sh +fi + +if [ $stage -le 4 ]; then + echo "$0: Preparing dictionary and lang..." + local/prepare_dict.sh + # This recipe uses byte-pair encoding, the silences are part of the words' pronunciations. + # So we set --sil-prob to 0.0 + utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.0 --position-dependent-phones false \ + data/local/dict "" data/lang/temp data/lang + silphonelist=`cat data/lang/phones/silence.csl` + nonsilphonelist=`cat data/lang/phones/nonsilence.csl` + local/gen_topo.py 8 4 4 $nonsilphonelist $silphonelist data/lang/phones.txt >data/lang/topo + utils/lang/bpe/add_final_optional_silence.sh --final-sil-prob 0.5 data/lang + + utils/format_lm.sh data/lang data/local/local_lm/data/arpa/6gram_big.arpa.gz \ + data/local/dict/lexicon.txt data/lang + utils/build_const_arpa_lm.sh data/local/local_lm/data/arpa/6gram_unpruned.arpa.gz \ + data/lang data/lang_rescore_6g +fi + +if [ $stage -le 5 ]; then + echo "$0: Calling the flat-start chain recipe..." + local/chain/run_e2e_cnn.sh +fi + +if [ $stage -le 6 ]; then + echo "$0: Aligning the training data using the e2e chain model..." + steps/nnet3/align.sh --nj 50 --cmd "$cmd" \ + --use-gpu false \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0 --acoustic-scale=1.0' \ + data/train data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train +fi + +if [ $stage -le 7 ]; then + echo "$0: Building a tree and training a regular chain model using the e2e alignments..." 
+ local/chain/run_cnn_e2eali.sh +fi diff --git a/egs/bentham/v1/steps b/egs/bentham/v1/steps new file mode 120000 index 00000000000..1b186770dd1 --- /dev/null +++ b/egs/bentham/v1/steps @@ -0,0 +1 @@ +../../wsj/s5/steps/ \ No newline at end of file diff --git a/egs/bentham/v1/utils b/egs/bentham/v1/utils new file mode 120000 index 00000000000..a3279dc8679 --- /dev/null +++ b/egs/bentham/v1/utils @@ -0,0 +1 @@ +../../wsj/s5/utils/ \ No newline at end of file diff --git a/egs/bn_music_speech/v1/local/make_annotations_bn.py b/egs/bn_music_speech/v1/local/make_annotations_bn.py index 53cebf52ea4..86bec7b16ae 100755 --- a/egs/bn_music_speech/v1/local/make_annotations_bn.py +++ b/egs/bn_music_speech/v1/local/make_annotations_bn.py @@ -9,6 +9,7 @@ # # This file is meant to be invoked by make_bn.sh. +from __future__ import print_function import sys, re, os def is_speech(line): @@ -37,7 +38,7 @@ def extract_speech(line): m = re.search('(?<=E_time=)\d+.\d+', line) end = float(m.group(0)) if start > end: - print "Skipping annotation where end time is before start time:", line + print("Skipping annotation where end time is before start time: {}".format(line)) return start, end def extract_other_type2(line): @@ -46,7 +47,7 @@ def extract_other_type2(line): m = re.search('(?<=E_time=)\d+.\d+', line) end = float(m.group(0)) if start > end: - print "Skipping annotation where end time is before start time:", line + print("Skipping annotation where end time is before start time: {}".format(line)) return start, end def extract_music(line): @@ -60,7 +61,7 @@ def extract_music(line): elif level == "O": is_on = False else: - print "Encountered bad token on line:", line + print("Encountered bad token on line: {}".format(line)) sys.exit() return time, is_on @@ -75,7 +76,7 @@ def extract_other_type1(line): elif level == "O": is_on = False else: - print "Encountered bad token on line:", line + print("Encountered bad token on line: {}".format(line)) sys.exit() return time, is_on @@ -92,11 +93,11 @@ def process_file(annos): for line in annos: if is_speech(line): speech_start, speech_end = extract_speech(line) - speech = speech + str(speech_start) + " " + str(speech_end) + "\n" + speech = "{}{} {}\n".format(speech, speech_start, speech_end) max_time = max(speech_end, max_time) elif is_other_type2(line): other_type2_start, other_type2_end = extract_other_type2(line) - other_type2 = other_type2 + str(other_type2_start) + " " + str(other_type2_end) + "\n" + other_type2 = "{}{} {}\n".format(other_type2, other_type2_start, other_type2_end) max_time = max(other_type2_end, max_time) elif is_music(line): time, is_on = extract_music(line) @@ -105,7 +106,7 @@ def process_file(annos): prev_music_time = time start_new_music_segment = False elif not is_on and not start_new_music_segment: - music = music + str(prev_music_time) + " " + str(time) + "\n" + music = "{}{} {}\n".format(music, prev_music_time, time) start_new_music_segment = True elif is_other_type1(line): time, is_on = extract_other_type1(line) @@ -114,13 +115,13 @@ def process_file(annos): prev_other_time = time start_new_other_segment = False elif not is_on and not start_new_other_segment: - other_type1 = other_type1 + str(prev_other_time) + " " + str(time) + "\n" + other_type1 = "{}{} {}\n".format(other_type1, prev_other_time, time) start_new_other_segment = True if not start_new_music_segment: - music = music + str(prev_music_time) + " " + str(max_time) + "\n" + music = "{}{} {}\n".format(music, prev_music_time, max_time) if not start_new_other_segment: - other_type1 
= other_type1 + str(prev_other_time) + " " + str(max_time) + "\n" + other_type1 = "{}{} {}\n".format(other_type1, prev_other_time, max_time) other = other_type1 + other_type2 return speech, music, other diff --git a/egs/bn_music_speech/v1/local/make_bn.py b/egs/bn_music_speech/v1/local/make_bn.py index 98836d32534..7ec9aabcbdf 100755 --- a/egs/bn_music_speech/v1/local/make_bn.py +++ b/egs/bn_music_speech/v1/local/make_bn.py @@ -20,7 +20,7 @@ for file in files: utt = str(file).replace(".sph", "") if file.endswith(".sph") and utt in utts: - wav = wav + utt + " sox " + subdir + "/" + utt + ".sph" + " -c 1 -r 16000 -t wav - |\n" + wav = "{0}{1} sox {2}/{1}.sph -c 1 -r 16000 -t wav - |\n".format(wav, utt, subdir) wav_fi = open(os.path.join(out_dir, "wav.scp"), 'w') wav_fi.write(wav) @@ -32,14 +32,14 @@ count = 1 for line in music_fi: left, right = line.rstrip().split(" ") - segments = segments + utt + "-music-" + str(count) + " " + utt + " " + left + " " + right + "\n" - utt2spk = utt2spk + utt + "-music-" + str(count) + " " + utt + "-music-" + str(count) + "\n" + segments = "{0}{1}-music-{2} {1} {3} {4}\n".format(segments, utt, count, left, right) + utt2spk = "{0}{1}-music-{2} {1}-music-{2}\n".format(utt2spk, utt, count) count += 1 count = 1 for line in speech_fi: left, right = line.rstrip().split(" ") - segments = segments + utt + "-speech-" + str(count) + " " + utt + " " + left + " " + right + "\n" - utt2spk = utt2spk + utt + "-speech-" + str(count) + " " + utt + "-speech-" + str(count) + "\n" + segments = "{0}{1}-speech-{2} {1} {3} {4}\n".format(segments, utt, count, left, right) + utt2spk = "{0}{1}-speech-{2} {1}-speech-{2}\n".format(utt2spk, utt, count) count += 1 utt2spk_fi = open(os.path.join(out_dir, "utt2spk"), 'w') utt2spk_fi.write(utt2spk) diff --git a/egs/bn_music_speech/v1/local/make_musan.py b/egs/bn_music_speech/v1/local/make_musan.py deleted file mode 100755 index b3795fe2b7d..00000000000 --- a/egs/bn_music_speech/v1/local/make_musan.py +++ /dev/null @@ -1,119 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2015 David Snyder -# Apache 2.0. -# -# This file is meant to be invoked by make_musan.sh.
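For reference, the wav.scp, segments and utt2spk entries that make_bn.py builds with the format strings above look like the following. This is only an illustrative sketch; the recording id, corpus path, segment times and count are made-up values.

utt = "19970417_1830_CNN_HDL"                       # hypothetical recording id
subdir = "/export/corpora5/LDC/LDC97S44/hub4"        # hypothetical corpus path

# wav.scp: recording id followed by a command producing 16 kHz mono wav
wav_entry = "{0} sox {1}/{0}.sph -c 1 -r 16000 -t wav - |".format(utt, subdir)

# segments / utt2spk for a music region from 12.3 s to 45.6 s
count, left, right = 1, "12.3", "45.6"
seg_entry = "{0}-music-{1} {0} {2} {3}".format(utt, count, left, right)
utt2spk_entry = "{0}-music-{1} {0}-music-{1}".format(utt, count)

print(wav_entry)
print(seg_entry)
print(utt2spk_entry)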
- -import os, sys - -def process_music_annotations(path): - utt2spk = {} - utt2vocals = {} - lines = open(path, 'r').readlines() - for line in lines: - utt, genres, vocals, musician = line.rstrip().split()[:4] - # For this application, the musican ID isn't important - utt2spk[utt] = utt - utt2vocals[utt] = vocals == "Y" - return utt2spk, utt2vocals - -def prepare_music(root_dir, use_vocals): - utt2vocals = {} - utt2spk = {} - utt2wav = {} - num_good_files = 0 - num_bad_files = 0 - music_dir = os.path.join(root_dir, "music") - for root, dirs, files in os.walk(music_dir): - for file in files: - file_path = os.path.join(root, file) - if file.endswith(".wav"): - utt = str(file).replace(".wav", "") - utt2wav[utt] = file_path - elif str(file) == "ANNOTATIONS": - utt2spk_part, utt2vocals_part = process_music_annotations(file_path) - utt2spk.update(utt2spk_part) - utt2vocals.update(utt2vocals_part) - utt2spk_str = "" - utt2wav_str = "" - for utt in utt2vocals: - if utt in utt2wav: - if use_vocals or not utt2vocals[utt]: - utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" - utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n" - num_good_files += 1 - else: - print("Missing file", utt) - num_bad_files += 1 - print("In music directory, processed", num_good_files, "files;", num_bad_files, "had missing wav data") - return utt2spk_str, utt2wav_str - -def prepare_speech(root_dir): - utt2spk = {} - utt2wav = {} - num_good_files = 0 - num_bad_files = 0 - speech_dir = os.path.join(root_dir, "speech") - for root, dirs, files in os.walk(speech_dir): - for file in files: - file_path = os.path.join(root, file) - if file.endswith(".wav"): - utt = str(file).replace(".wav", "") - utt2wav[utt] = file_path - utt2spk[utt] = utt - utt2spk_str = "" - utt2wav_str = "" - for utt in utt2spk: - if utt in utt2wav: - utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" - utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n" - num_good_files += 1 - else: - print("Missing file", utt) - num_bad_files += 1 - print("In speech directory, processed", num_good_files, "files;", num_bad_files, "had missing wav data") - return utt2spk_str, utt2wav_str - -def prepare_noise(root_dir): - utt2spk = {} - utt2wav = {} - num_good_files = 0 - num_bad_files = 0 - noise_dir = os.path.join(root_dir, "noise") - for root, dirs, files in os.walk(noise_dir): - for file in files: - file_path = os.path.join(root, file) - if file.endswith(".wav"): - utt = str(file).replace(".wav", "") - utt2wav[utt] = file_path - utt2spk[utt] = utt - utt2spk_str = "" - utt2wav_str = "" - for utt in utt2spk: - if utt in utt2wav: - utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" - utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n" - num_good_files += 1 - else: - print("Missing file", utt) - num_bad_files += 1 - print("In noise directory, processed", num_good_files, "files;", num_bad_files, "had missing wav data") - return utt2spk_str, utt2wav_str - -def main(): - in_dir = sys.argv[1] - out_dir = sys.argv[2] - use_vocals = sys.argv[3] == "Y" - utt2spk_music, utt2wav_music = prepare_music(in_dir, use_vocals) - utt2spk_speech, utt2wav_speech = prepare_speech(in_dir) - utt2spk_noise, utt2wav_noise = prepare_noise(in_dir) - utt2spk = utt2spk_speech + utt2spk_music + utt2spk_noise - utt2wav = utt2wav_speech + utt2wav_music + utt2wav_noise - wav_fi = open(os.path.join(out_dir, "wav.scp"), 'w') - wav_fi.write(utt2wav) - utt2spk_fi = open(os.path.join(out_dir, "utt2spk"), 'w') - utt2spk_fi.write(utt2spk) - - -if 
__name__=="__main__": - main() diff --git a/egs/bn_music_speech/v1/local/make_musan.sh b/egs/bn_music_speech/v1/local/make_musan.sh deleted file mode 100755 index 694940ad70f..00000000000 --- a/egs/bn_music_speech/v1/local/make_musan.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash -# Copyright 2015 David Snyder -# Apache 2.0. -# -# This script, called by ../run.sh, creates the MUSAN -# data directory. The required dataset is freely available at -# http://www.openslr.org/17/ - -set -e -in_dir=$1 -data_dir=$2 -use_vocals='Y' - -mkdir -p local/musan.tmp - -echo "Preparing ${data_dir}/musan..." -mkdir -p ${data_dir}/musan -local/make_musan.py ${in_dir} ${data_dir}/musan ${use_vocals} - -utils/fix_data_dir.sh ${data_dir}/musan - -grep "music" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_music -grep "speech" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_speech -grep "noise" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_noise -utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_music \ - ${data_dir}/musan ${data_dir}/musan_music -utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_speech \ - ${data_dir}/musan ${data_dir}/musan_speech -utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_noise \ - ${data_dir}/musan ${data_dir}/musan_noise - -utils/fix_data_dir.sh ${data_dir}/musan_music -utils/fix_data_dir.sh ${data_dir}/musan_speech -utils/fix_data_dir.sh ${data_dir}/musan_noise - -rm -rf local/musan.tmp - diff --git a/egs/bn_music_speech/v1/local/print_scores.py b/egs/bn_music_speech/v1/local/print_scores.py index c2b587cdcad..e563afb63d7 100755 --- a/egs/bn_music_speech/v1/local/print_scores.py +++ b/egs/bn_music_speech/v1/local/print_scores.py @@ -11,6 +11,7 @@ # those strings to determine if it is a target or nontarget # utterance. We arbitrarily pick music to be the target class. +from __future__ import print_function import sys utt2score = open(sys.argv[1], 'r').readlines() for i in range(0, len(utt2score)): @@ -19,4 +20,4 @@ type = "target" else: type = "nontarget" - print score, type + print(score, type) diff --git a/egs/bn_music_speech/v1/local/refine_annotations_bn.py b/egs/bn_music_speech/v1/local/refine_annotations_bn.py index 52ac87c8640..31cb1803f57 100755 --- a/egs/bn_music_speech/v1/local/refine_annotations_bn.py +++ b/egs/bn_music_speech/v1/local/refine_annotations_bn.py @@ -10,6 +10,7 @@ # designated length are created. # # This file is meant to be invoked from make_bn.sh. 
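The from __future__ imports being added throughout these scripts make the Python 2 sources behave like Python 3 for printing and division. A minimal sketch of the effect (the values are arbitrary):

from __future__ import print_function, division

score = 3.2
print("{} {}".format(score, "target"))  # print is a function under both interpreters
print(7 / 2)    # 3.5 under both Python 2.7 and Python 3 (true division)
print(7 // 2)   # 3, when floor division is actually wanted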
+from __future__ import division import sys, os def seg_to_string(seg): @@ -23,7 +24,7 @@ def seg_to_string(seg): def process_segs(raw_segs): segs = [] for seg in raw_segs: - lower, upper = map(float, seg.rstrip().split(" ")) + lower, upper = [float(i) for i in seg.rstrip().split(" ")] segs.append((lower, upper)) return segs @@ -60,8 +61,8 @@ def resegment(music, speech, other, frame_length, min_seg): start_frame = 0 for i in range(1, len(frame2classes)): if curr_class != frame2classes[i]: - start = float(start_frame) / frame_length - end = float(i) / frame_length + start = float(start_frame)/frame_length + end = float(i)/frame_length if end - start > min_seg: if curr_class == "music": new_music.append((start, end)) diff --git a/egs/bn_music_speech/v1/run.sh b/egs/bn_music_speech/v1/run.sh index 6cc0531e9d7..08d5c022a9d 100755 --- a/egs/bn_music_speech/v1/run.sh +++ b/egs/bn_music_speech/v1/run.sh @@ -20,7 +20,7 @@ vaddir=`pwd`/mfcc local/make_bn.sh /export/corpora5/LDC/LDC97S44 \ /export/corpora/LDC/LDC97T22 data -local/make_musan.sh /export/corpora/JHU/musan data +steps/data/make_musan.sh --sampling-rate 16000 /export/corpora/JHU/musan data steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 30 --cmd "$train_cmd" \ data/musan_speech exp/make_mfcc $mfccdir diff --git a/egs/callhome_diarization/v1/diarization/VB_diarization.py b/egs/callhome_diarization/v1/diarization/VB_diarization.py new file mode 100755 index 00000000000..62676d64510 --- /dev/null +++ b/egs/callhome_diarization/v1/diarization/VB_diarization.py @@ -0,0 +1,353 @@ +#!/usr/bin/env python3 +# Copyright 2013-2017 Lukas Burget (burget@fit.vutbr.cz) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# +# Revision History +# L. Burget 16/07/13 01:00AM - original version +# L. Burget 20/06/17 12:07AM - np.asarray replaced by .toarray() +# - minor bug fix in initializing q +# - minor bug fix in ELBO calculation +# - few more optimizations + +import numpy as np +from scipy.sparse import coo_matrix +import scipy.linalg as spl +#import numexpr as ne # the dependency on this modul can be avoided by replacing +# # logsumexp_ne and exp_ne with logsumexp and np.exp + +#[q sp Li] = +def VB_diarization(X, m, iE, w, V, sp=None, q=None, + maxSpeakers = 10, maxIters = 10, + epsilon = 1e-4, loopProb = 0.99, statScale = 1.0, + alphaQInit = 1.0, downsample = None, VtiEV = None, ref=None, + plot=False, sparsityThr=0.001, llScale=1.0, minDur=1): + + """ + This a generalized version of speaker diarization described in: + + Kenny, P. Bayesian Analysis of Speaker Diarization with Eigenvoice Priors, + Montreal, CRIM, May 2008. + + Kenny, P., Reynolds, D., and Castaldo, F. Diarization of Telephone + Conversations using Factor Analysis IEEE Journal of Selected Topics in Signal + Processing, December 2010. + + The generalization introduced in this implementation lies in using an HMM + instead of the simple mixture model when modeling generation of segments + (or even frames) from speakers. 
HMM limits the probability of switching + between speakers when changing frames, which makes it possible to use + the model on frame-by-frame bases without any need to iterate between + 1) clustering speech segments and 2) re-segmentation (i.e. as it was done in + the paper above). + + Inputs: + X - T x D array, where columns are D dimensional feature vectors for T frames + m - C x D array of GMM component means + iE - C x D array of GMM component inverse covariance matrix diagonals + w - C dimensional column vector of GMM component weights + V - R x C x D array of eigenvoices + maxSpeakers - maximum number of speakers expected in the utterance + maxIters - maximum number of algorithm iterations + epsilon - stop iterating, if obj. fun. improvement is less than epsilon + loopProb - probability of not switching speakers between frames + statScale - scale sufficient statiscits collected using UBM + llScale - scale UBM likelihood (i.e. llScale < 1.0 make atribution of + frames to UBM componets more uncertain) + sparsityThr - set occupations smaller that this threshold to 0.0 (saves memory + as the posteriors are represented by sparse matrix) + alphaQInit - Dirichlet concentraion parameter for initializing q + downsample - perform diarization on input downsampled by this factor + VtiEV - C x (R**2+R)/2 matrix normally calculated by VB_diarization when + VtiEV is None. However, it can be pre-calculated using function + precalculate_VtiEV(V) and used across calls of VB_diarization. + minDur - minimum number of frames between speaker turns imposed by linear + chains of HMM states corresponding to each speaker. All the states + in a chain share the same output distribution + ref - T dim. integer vector with reference speaker ID (0:maxSpeakers) + per frame + plot - if set to True, plot per-frame speaker posteriors. + + Outputs: + q - S x T matrix of posteriors attribution each frame to one of S possible + speakers, where S is given by opts.maxSpeakers + sp - S dimensional column vector of ML learned speaker priors. Ideally, these + should allow to estimate # of speaker in the utterance as the + probabilities of the redundant speaker should converge to zero. + Li - values of auxiliary function (and DER and frame cross-entropy between q + and reference if 'ref' is provided) over iterations. + """ + + # The references to equations corresponds to the technical report: + # Kenny, P. Bayesian Analysis of Speaker Diarization with Eigenvoice Priors, + # Montreal, CRIM, May 2008. + + D=X.shape[1] # feature dimensionality + C=len(w) # number of mixture components + R=V.shape[0] # subspace rank + nframes=X.shape[0] + + if VtiEV is None: + VtiEV = precalculate_VtiEV(V, iE) + + V = V.reshape(V.shape[0],-1) + + if sp is None: + sp = np.ones(maxSpeakers)/maxSpeakers + else: + maxSpeakers = len(sp) + + if q is None: + # initialize q from flat Dirichlet prior with concentrsaion parameter alphaQInit + q = np.random.gamma(alphaQInit, size=(nframes, maxSpeakers)) + q = q / q.sum(1, keepdims=True) + + # calculate UBM mixture frame posteriors (i.e. 
per-frame zero order statistics) + ll = (X**2).dot(-0.5*iE.T) + X.dot(iE.T*m.T)-0.5*((iE * m**2 - np.log(iE)).sum(1) - 2*np.log(w) + D*np.log(2*np.pi)) + ll *= llScale + G = logsumexp(ll, axis=1) + NN = np.exp(ll - G[:,np.newaxis]) * statScale + NN[NN 0 and L - Li[-2][0] < epsilon: + if L - Li[-1][0] < 0: print('WARNING: Value of auxiliary function has decreased!') + break + + if downsample is not None: + #upsample resulting q to match number of frames in the input utterance + q = downsampler.T.dot(q) + + return q, sp, Li + + +def precalculate_VtiEV(V, iE): + tril_ind = np.tril_indices(V.shape[0]) + VtiEV = np.empty((V.shape[1],len(tril_ind[0])), V.dtype) + for c in range(V.shape[1]): + VtiEV[c,:] = np.dot(V[:,c,:]*iE[np.newaxis,c,:], V[:,c,:].T)[tril_ind] + return VtiEV + + +# Initialize q (per-frame speaker posteriors) from a reference +# (vector of per-frame zero based integer speaker IDs) +def frame_labels2posterior_mx(labels, maxSpeakers): + #initialize from reference + #pmx = np.zeros((len(labels), labels.max()+1)) + pmx = np.zeros((len(labels), maxSpeakers)) + pmx[np.arange(len(labels)), labels] = 1 + return pmx + +# Calculates Diarization Error Rate (DER) or per-frame cross-entropy between +# reference (vector of per-frame zero based integer speaker IDs) and q (per-frame +# speaker posteriors). If expected=False, q is converted into hard labels before +# calculating DER. If expected=TRUE, posteriors in q are used to calculated +# "expected" DER. +def DER(q, ref, expected=True, xentropy=False): + from itertools import permutations + + if not expected: + # replce probabiities in q by zeros and ones + hard_labels = q.argmax(1) + q = np.zeros_like(q) + q[range(len(q)), hard_labels] = 1 + + err_mx = np.empty((ref.max()+1, q.shape[1])) + for s in range(err_mx.shape[0]): + tmpq = q[ref == s,:] + err_mx[s] = (-np.log(tmpq) if xentropy else tmpq).sum(0) + + if err_mx.shape[0] < err_mx.shape[1]: + err_mx = err_mx.T + + # try all alignments (permutations) of reference and detected speaker + #could be written in more efficient way using dynamic programing + acc = [err_mx[perm[:err_mx.shape[1]], range(err_mx.shape[1])].sum() + for perm in permutations(range(err_mx.shape[0]))] + if xentropy: + return min(acc)/float(len(ref)) + else: + return (len(ref) - max(acc))/float(len(ref)) + + +############################################################################### +# Module private functions +############################################################################### +def logsumexp(x, axis=0): + xmax = x.max(axis) + x = xmax + np.log(np.sum(np.exp(x - np.expand_dims(xmax, axis)), axis)) + infs = np.isinf(xmax) + if np.ndim(x) > 0: + x[infs] = xmax[infs] + elif infs: + x = xmax + return x + + +# The folowing two functions are only versions optimized for speed using numexpr +# module and can be replaced by logsumexp and np.exp functions to avoid +# the dependency on the module. 
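The module-private logsumexp above underpins the per-frame posterior computation; subtracting the per-row maximum before exponentiating is what keeps the values finite. A small self-contained check of that behaviour (the input values are arbitrary and chosen to break a naive implementation):

import numpy as np

x = np.array([[1000.0, 1000.1],
              [-1000.0, -999.9]])

xmax = x.max(axis=1)
stable = xmax + np.log(np.exp(x - xmax[:, np.newaxis]).sum(axis=1))
naive = np.log(np.exp(x).sum(axis=1))  # overflows / underflows

print(stable)  # ~[1000.744, -999.256], both finite
print(naive)   # [inf, -inf] (with overflow warnings)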
+def logsumexp_ne(x, axis=0): + xmax = np.array(x).max(axis=axis) + xmax_e = np.expand_dims(xmax, axis) + x = ne.evaluate("sum(exp(x - xmax_e), axis=%d)" % axis) + x = ne.evaluate("xmax + log(x)") + infs = np.isinf(xmax) + if np.ndim(x) > 0: + x[infs] = xmax[infs] + elif infs: + x = xmax + return x + + +def exp_ne(x, out=None): + return ne.evaluate("exp(x)", out=None) + + +# Convert vector with lower-triangular coefficients into symetric matrix +def tril_to_sym(tril): + R = np.sqrt(len(tril)*2).astype(int) + tril_ind = np.tril_indices(R) + S = np.empty((R,R)) + S[tril_ind] = tril + S[tril_ind[::-1]] = tril + return S + + +def logdet(A): + return 2*np.sum(np.log(np.diag(spl.cholesky(A)))) + + +def forward_backward(lls, tr, ip): + """ + Inputs: + lls - matrix of per-frame log HMM state output probabilities + tr - transition probability matrix + ip - vector of initial state probabilities (i.e. statrting in the state) + Outputs: + sp - matrix of per-frame state occupation posteriors + tll - total (forward) log-likelihood + lfw - log forward probabilities + lfw - log backward probabilities + """ + ltr = np.log(tr) + lfw = np.empty_like(lls) + lbw = np.empty_like(lls) + lfw[:] = -np.inf + lbw[:] = -np.inf + lfw[0] = lls[0] + np.log(ip) + lbw[-1] = 0.0 + + for ii in range(1,len(lls)): + lfw[ii] = lls[ii] + logsumexp(lfw[ii-1] + ltr.T, axis=1) + + for ii in reversed(range(len(lls)-1)): + lbw[ii] = logsumexp(ltr + lls[ii+1] + lbw[ii+1], axis=1) + + tll = logsumexp(lfw[-1]) + sp = np.exp(lfw + lbw - tll) + return sp, tll, lfw, lbw diff --git a/egs/callhome_diarization/v1/diarization/VB_resegmentation.py b/egs/callhome_diarization/v1/diarization/VB_resegmentation.py new file mode 100755 index 00000000000..e507c088563 --- /dev/null +++ b/egs/callhome_diarization/v1/diarization/VB_resegmentation.py @@ -0,0 +1,221 @@ +#!/usr/bin/env python3 + +# Copyright 2019 Zili Huang + +# This script is evoked by diarization/VB_resegmentation.sh. It prepares the necessary +# inputs for the VB system and creates the output RTTM file. The inputs include data directory +# (data_dir), the rttm file to initialize the VB system(init_rttm_filename), the directory to +# output the rttm prediction(output_dir), path to diagonal UBM model(dubm_model) and path to +# i-vector extractor model(ie_model). + +import numpy as np +import VB_diarization +import kaldi_io +import argparse +from convert_VB_model import load_dubm, load_ivector_extractor + +def get_utt_list(utt2spk_filename): + with open(utt2spk_filename, 'r') as fh: + content = fh.readlines() + utt_list = [line.split()[0] for line in content] + print("{} utterances in total".format(len(utt_list))) + return utt_list + +# prepare utt2num_frames dictionary +def get_utt2num_frames(utt2num_frames_filename): + utt2num_frames = {} + with open(utt2num_frames_filename, 'r') as fh: + content = fh.readlines() + for line in content: + line = line.strip('\n') + line_split = line.split() + utt2num_frames[line_split[0]] = int(line_split[1]) + return utt2num_frames + +# prepare utt2feats dictionary +def get_utt2feats(utt2feats_filename): + utt2feats = {} + with open(utt2feats_filename, 'r') as fh: + content = fh.readlines() + for line in content: + line = line.strip('\n') + line_split = line.split(None, 1) + utt2feats[line_split[0]] = line_split[1] + return utt2feats + +def create_ref(uttname, utt2num_frames, full_rttm_filename): + num_frames = utt2num_frames[uttname] + + # We use 0 to denote silence frames and 1 to denote overlapping frames. 
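The loopProb and sp parameters documented above define a speaker-loop HMM: stay with the current speaker with probability loopProb, otherwise pick a speaker from the priors sp. For the simplest case minDur=1 that transition structure can be sketched as below; the speaker count and values are illustrative and not taken from the code.

import numpy as np

S = 3                      # hypothetical number of speaker states
loopProb = 0.99            # probability of not switching speakers between frames
sp = np.ones(S) / S        # flat speaker priors, as used at initialization

tr = loopProb * np.eye(S) + (1.0 - loopProb) * np.tile(sp, (S, 1))
ip = sp                    # initial state probabilities

print(tr.sum(axis=1))      # each row sums to 1
print(tr[0])               # ~[0.9933, 0.0033, 0.0033]: switching is rare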
+ ref = np.zeros(num_frames) + speaker_dict = {} + num_spk = 0 + + with open(full_rttm_filename, 'r') as fh: + content = fh.readlines() + for line in content: + line = line.strip('\n') + line_split = line.split() + uttname_line = line_split[1] + if uttname != uttname_line: + continue + start_time, duration = int(float(line_split[3]) * 100), int(float(line_split[4]) * 100) + end_time = start_time + duration + spkname = line_split[7] + if spkname not in speaker_dict.keys(): + spk_idx = num_spk + 2 + speaker_dict[spkname] = spk_idx + num_spk += 1 + + for i in range(start_time, end_time): + if i < 0: + raise ValueError("Time index less than 0") + elif i >= num_frames: + print("Time index exceeds number of frames") + break + else: + if ref[i] == 0: + ref[i] = speaker_dict[spkname] + else: + ref[i] = 1 # The overlapping speech is marked as 1. + return ref.astype(int) + +# create output rttm file +def create_rttm_output(uttname, predicted_label, output_dir, channel): + num_frames = len(predicted_label) + + start_idx = 0 + seg_list = [] + + last_label = predicted_label[0] + for i in range(num_frames): + if predicted_label[i] == last_label: # The speaker label remains the same. + continue + else: # The speaker label is different. + if last_label != 0: # Ignore the silence. + seg_list.append([start_idx, i, last_label]) + start_idx = i + last_label = predicted_label[i] + if last_label != 0: + seg_list.append([start_idx, num_frames, last_label]) + + with open("{}/{}_predict.rttm".format(output_dir, uttname), 'w') as fh: + for i in range(len(seg_list)): + start_frame = (seg_list[i])[0] + end_frame = (seg_list[i])[1] + label = (seg_list[i])[2] + duration = end_frame - start_frame + fh.write("SPEAKER {} {} {:.2f} {:.2f} {} \n".format(uttname, channel, start_frame / 100.0, duration / 100.0, label)) + return 0 + +def main(): + parser = argparse.ArgumentParser(description='VB Resegmentation Wrapper') + parser.add_argument('data_dir', type=str, help='Subset data directory') + parser.add_argument('init_rttm_filename', type=str, + help='The rttm file to initialize the VB system, usually the AHC cluster result') + parser.add_argument('output_dir', type=str, help='Output directory') + parser.add_argument('dubm_model', type=str, help='Path of the diagonal UBM model') + parser.add_argument('ie_model', type=str, help='Path of the i-vector extractor model') + + parser.add_argument('--max-speakers', type=int, default=10, + help='Maximum number of speakers expected in the utterance (default: 10)') + parser.add_argument('--max-iters', type=int, default=10, + help='Maximum number of algorithm iterations (default: 10)') + parser.add_argument('--downsample', type=int, default=25, + help='Perform diarization on input downsampled by this factor (default: 25)') + parser.add_argument('--alphaQInit', type=float, default=100.0, + help='Dirichlet concentraion parameter for initializing q') + parser.add_argument('--sparsityThr', type=float, default=0.001, + help='Set occupations smaller that this threshold to 0.0 (saves memory as \ + the posteriors are represented by sparse matrix)') + parser.add_argument('--epsilon', type=float, default=1e-6, + help='Stop iterating, if obj. fun. improvement is less than epsilon') + parser.add_argument('--minDur', type=int, default=1, + help='Minimum number of frames between speaker turns imposed by linear \ + chains of HMM states corresponding to each speaker. 
All the states \ + in a chain share the same output distribution') + parser.add_argument('--loopProb', type=float, default=0.9, + help='Probability of not switching speakers between frames') + parser.add_argument('--statScale', type=float, default=0.2, + help='Scale sufficient statiscits collected using UBM') + parser.add_argument('--llScale', type=float, default=1.0, + help='Scale UBM likelihood (i.e. llScale < 1.0 make atribution of \ + frames to UBM componets more uncertain)') + parser.add_argument('--channel', type=int, default=0, + help='Channel information in the rttm file') + parser.add_argument('--initialize', type=int, default=1, + help='Whether to initalize the speaker posterior') + + args = parser.parse_args() + print(args) + + utt_list = get_utt_list("{}/utt2spk".format(args.data_dir)) + utt2num_frames = get_utt2num_frames("{}/utt2num_frames".format(args.data_dir)) + + # Load the diagonal UBM and i-vector extractor + dubm_para = load_dubm(args.dubm_model) + ie_para = load_ivector_extractor(args.ie_model) + + # Check the diagonal UBM and i-vector extractor model + assert '' in dubm_para and '' in dubm_para and '' in dubm_para + DUBM_WEIGHTS, DUBM_MEANS_INVVARS, DUBM_INV_VARS = dubm_para[''], dubm_para[''], dubm_para[''] + assert 'M' in ie_para + IE_M = np.transpose(ie_para['M'], (2, 0, 1)) + + m = DUBM_MEANS_INVVARS / DUBM_INV_VARS + iE = DUBM_INV_VARS + w = DUBM_WEIGHTS + V = IE_M + + # Load the MFCC features + feats_dict = get_utt2feats("{}/feats.scp".format(args.data_dir)) + + for utt in utt_list: + # Get the alignments from the clustering result. + # In init_ref, 0 denotes the silence silence frames + # 1 denotes the overlapping speech frames, the speaker + # label starts from 2. + init_ref = create_ref(utt, utt2num_frames, args.init_rttm_filename) + + # load MFCC features + X = kaldi_io.read_mat(feats_dict[utt]).astype(np.float64) + assert len(init_ref) == len(X) + + # Keep only the voiced frames (0 denotes the silence + # frames, 1 denotes the overlapping speech frames). + mask = (init_ref >= 2) + X_voiced = X[mask] + init_ref_voiced = init_ref[mask] - 2 + + if X_voiced.shape[0] == 0: + print("Warning: {} has no voiced frames in the initialization file".format(utt)) + continue + + # Initialize the posterior of each speaker based on the clustering result. + if args.initialize: + q = VB_diarization.frame_labels2posterior_mx(init_ref_voiced, args.max_speakers) + else: + q = None + + # VB resegmentation + + # q - S x T matrix of posteriors attribution each frame to one of S possible + # speakers, where S is given by opts.maxSpeakers + # sp - S dimensional column vector of ML learned speaker priors. Ideally, these + # should allow to estimate # of speaker in the utterance as the + # probabilities of the redundant speaker should converge to zero. + # Li - values of auxiliary function (and DER and frame cross-entropy between q + # and reference if 'ref' is provided) over iterations. 
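The label convention used here (0 for silence, 1 for overlap, speakers numbered from 2) and the mask/offset bookkeeping around the VB call can be illustrated with a tiny self-contained example; the frame labels and posteriors below are made up.

import numpy as np

# Hypothetical per-frame labels: 0 = silence, 1 = overlap, speakers start at 2.
init_ref = np.array([0, 0, 2, 2, 3, 1, 3, 3, 0])

mask = init_ref >= 2                  # keep only single-speaker voiced frames
init_ref_voiced = init_ref[mask] - 2  # zero-based speaker ids for the VB code

# After resegmentation, q_out has one row per voiced frame; mapping back:
q_out = np.array([[0.9, 0.1]] * mask.sum())     # stand-in posteriors
predicted = np.zeros(len(init_ref), dtype=int)  # frames outside the mask stay 0
predicted[mask] = q_out.argmax(axis=1) + 2
print(predicted)                                # [0 0 2 2 2 0 2 2 0]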
+ q_out, sp_out, L_out = VB_diarization.VB_diarization(X_voiced, m, iE, w, V, sp=None, q=q, maxSpeakers=args.max_speakers, maxIters=args.max_iters, VtiEV=None, + downsample=args.downsample, alphaQInit=args.alphaQInit, sparsityThr=args.sparsityThr, epsilon=args.epsilon, minDur=args.minDur, + loopProb=args.loopProb, statScale=args.statScale, llScale=args.llScale, ref=None, plot=False) + predicted_label_voiced = np.argmax(q_out, 1) + 2 + predicted_label = (np.zeros(len(mask))).astype(int) + predicted_label[mask] = predicted_label_voiced + + # Create the output rttm file + create_rttm_output(utt, predicted_label, args.output_dir, args.channel) + return 0 + +if __name__ == "__main__": + main() diff --git a/egs/callhome_diarization/v1/diarization/VB_resegmentation.sh b/egs/callhome_diarization/v1/diarization/VB_resegmentation.sh new file mode 100755 index 00000000000..765c4eee8b8 --- /dev/null +++ b/egs/callhome_diarization/v1/diarization/VB_resegmentation.sh @@ -0,0 +1,115 @@ +#!/bin/bash + +# Copyright 2019 Zili Huang + +# This script is a wrapper for Variational Bayes resegmentation. +# It shows how to use the code from Brno University of Technology +# to do resegmentation. + +# Begin configuration section. +nj=20 +cmd=run.pl +stage=0 +max_speakers=10 +max_iters=10 +downsample=25 +alphaQInit=100.0 +sparsityThr=0.001 +epsilon=1e-6 +minDur=1 +loopProb=0.9 +statScale=0.2 +llScale=1.0 +channel=0 +initialize=1 +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [ $# != 5 ]; then + echo "Usage: diarization/VB_resegmentation.sh " + echo "Variational Bayes Re-segmenatation" + echo "Options: " + echo " --cmd (utils/run.pl|utils/queue.pl ) # How to run jobs." + echo " --nj # Number of parallel jobs to run." + echo " --max-speakers # Maximum number of speakers" + echo " # expected in the utterance" + echo " # (default: 10)" + echo " --max-iters # Maximum number of algorithm" + echo " # iterations (default: 10)" + echo " --downsample # Perform diarization on input" + echo " # downsampled by this factor" + echo " # (default: 25)" + echo " --alphaQInit # Dirichlet concentraion" + echo " # parameter for initializing q" + echo " --sparsityThr # Set occupations smaller that" + echo " # this threshold to 0.0 (saves" + echo " # memory as the posteriors are" + echo " # represented by sparse matrix)" + echo " --epsilon # Stop iterating, if obj. fun." + echo " # improvement is less than" + echo " # epsilon" + echo " --minDur # Minimum number of frames" + echo " # between speaker turns imposed" + echo " # by linear chains of HMM" + echo " # state corresponding to each" + echo " # speaker. All the states in" + echo " # a chain share the same output" + echo " # distribution" + echo " --loopProb # Probability of not switching" + echo " # speakers between frames" + echo " --statScale # Scale sufficient statistics" + echo " # collected using UBM" + echo " --llScale # Scale UBM likelihood (i.e." 
+ echo " # llScale < 1.0 make" + echo " # attribution of frames to UBM" + echo " # componets more uncertain)" + echo " --channel # Channel information in the rttm file" + echo " --initialize # Whether to initalize the" + echo " # speaker posterior (if not)" + echo " # the speaker posterior will be" + echo " # randomly initilized" + + exit 1; +fi + +data_dir=$1 +init_rttm_filename=$2 +output_dir=$3 +dubm_model=$4 +ie_model=$5 + +mkdir -p $output_dir/tmp + +sdata=$data_dir/split$nj; +utils/split_data.sh $data_dir $nj || exit 1; + +if [ $stage -le 0 ]; then + # Dump the diagonal UBM model into txt format. + "$train_cmd" $output_dir/log/convert_diag_ubm.log \ + gmm-global-copy --binary=false \ + $dubm_model \ + $output_dir/tmp/dubm.tmp || exit 1; + + # Dump the ivector extractor model into txt format. + "$train_cmd" $output_dir/log/convert_ie.log \ + ivector-extractor-copy --binary=false \ + $ie_model \ + $output_dir/tmp/ie.tmp || exit 1; +fi + +if [ $stage -le 1 ]; then + # VB resegmentation + $cmd JOB=1:$nj $output_dir/log/VB_resegmentation.JOB.log \ + python3 diarization/VB_resegmentation.py --max-speakers $max_speakers \ + --max-iters $max_iters --downsample $downsample --alphaQInit $alphaQInit \ + --sparsityThr $sparsityThr --epsilon $epsilon --minDur $minDur \ + --loopProb $loopProb --statScale $statScale --llScale $llScale \ + --channel $channel --initialize $initialize \ + $sdata/JOB $init_rttm_filename $output_dir/tmp $output_dir/tmp/dubm.tmp $output_dir/tmp/ie.tmp || exit 1; + + cat $output_dir/tmp/*.rttm > $output_dir/rttm/VB_rttm +fi diff --git a/egs/callhome_diarization/v1/diarization/cluster.sh b/egs/callhome_diarization/v1/diarization/cluster.sh index 4f46b3ba5ef..5e5c6e9dbe5 100755 --- a/egs/callhome_diarization/v1/diarization/cluster.sh +++ b/egs/callhome_diarization/v1/diarization/cluster.sh @@ -14,6 +14,9 @@ stage=0 nj=10 cleanup=true threshold=0.5 +max_spk_fraction=1.0 +first_pass_max_utterances=32767 +rttm_channel=0 read_costs=false reco2num_spk= # End configuration section. @@ -35,6 +38,17 @@ if [ $# != 2 ]; then echo " --threshold # Cluster stopping criterion. Clusters with scores greater" echo " # than this value will be merged until all clusters" echo " # exceed this value." + echo " --max-spk-fraction # Clusters with total fraction of utterances greater than" + echo " # this value will not be merged. This is active only when" + echo " # reco2num-spk is supplied and" + echo " # 1.0 / num-spk <= max-spk-fraction <= 1.0." + echo " --first-pass-max-utterances # If the number of utterances is larger than first-pass-max-utterances," + echo " # then clustering is done in two passes. In the first pass, input points" + echo " # are divided into contiguous subsets of size first-pass-max-utterances" + echo " # and each subset is clustered separately. In the second pass, the first" + echo " # pass clusters are merged into the final set of clusters." + echo " --rttm-channel # The value passed into the RTTM channel field. Only affects" + echo " # the format of the RTTM file." echo " --read-costs # If true, interpret input scores as costs, i.e. similarity" echo " # is indicated by smaller values. 
If enabled, clusters will" echo " # be merged until all cluster scores are less than the" @@ -75,8 +89,10 @@ if [ $stage -le 0 ]; then echo "$0: clustering scores" $cmd JOB=1:$nj $dir/log/agglomerative_cluster.JOB.log \ agglomerative-cluster --threshold=$threshold --read-costs=$read_costs \ - --reco2num-spk-rspecifier=$reco2num_spk scp:"$feats" \ - ark,t:$sdata/JOB/spk2utt ark,t:$dir/labels.JOB || exit 1; + --reco2num-spk-rspecifier=$reco2num_spk \ + --max-spk-fraction=$max_spk_fraction \ + --first-pass-max-utterances=$first_pass_max_utterances \ + scp:"$feats" ark,t:$sdata/JOB/spk2utt ark,t:$dir/labels.JOB || exit 1; fi if [ $stage -le 1 ]; then @@ -86,7 +102,7 @@ fi if [ $stage -le 2 ]; then echo "$0: computing RTTM" - diarization/make_rttm.py $srcdir/segments $dir/labels $dir/rttm || exit 1; + diarization/make_rttm.py --rttm-channel $rttm_channel $srcdir/segments $dir/labels $dir/rttm || exit 1; fi if $cleanup ; then diff --git a/egs/callhome_diarization/v1/diarization/convert_VB_model.py b/egs/callhome_diarization/v1/diarization/convert_VB_model.py new file mode 100755 index 00000000000..b1f25b0dbfd --- /dev/null +++ b/egs/callhome_diarization/v1/diarization/convert_VB_model.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python + +# Copyright 2019 Zili Huang +# Apache 2.0 + +# This script loads diagonal UBM and ivector extractor from text file. + +import os +import numpy as np + +def load_dubm(dubm_text): + assert os.path.exists(dubm_text) + + para_dict = {} + state = 0 + data_array = [] + + with open(dubm_text, 'r') as fh: + content = fh.readlines() + + for line in content: + line = line.strip('\n') + line_split = line.split() + if state == 0: + if len(line_split) == 1: + continue + elif len(line_split) == 2 and line_split[1] == "[": # Start of a multi-line matrix like and + para_name = line_split[0] + state = 1 + data_array = [] + elif len(line_split) >= 3 and line_split[1] == "[" and line_split[-1] == "]": # Single line vector like + para_name = line_split[0] + data_list = [] + for i in range(2, len(line_split) - 1): + data_list.append(float(line_split[i])) + data_list = np.array(data_list) + para_dict[para_name] = data_list + else: + raise ValueError("Condition not defined.") + elif state == 1: + if line_split[-1] == "]": # End of a multi-line matrix like and + data_list = [] + for i in range(len(line_split) - 1): + data_list.append(float(line_split[i])) + data_list = np.array(data_list) + data_array.append(data_list) + data_array = np.array(data_array) + para_dict[para_name] = data_array + state = 0 + else: + data_list = [] + for i in range(len(line_split)): + data_list.append(float(line_split[i])) + data_list = np.array(data_list) + data_array.append(data_list) + else: + raise ValueError("Condition not defined.") + return para_dict # the diagonal ubm parameter includes , , , + +def load_ivector_extractor(ie_text): + assert os.path.exists(ie_text) + + para_dict = {} + state = 0 + data_3dmatrix = [] + + with open(ie_text, 'r') as fh: + content = fh.readlines() + + for line in content: + line = line.strip('\n') + if line == " [": + break + if state == 0: + if not line.startswith(""): + continue + else: + state = 1 + data_matrix = [] + elif state == 1: + line_split = line.split() + if line_split[0] == "[": + data_matrix = [] + continue + elif line_split[-1] == "]": + data_array = [] + for i in range(len(line_split)-1): + data_array.append(float(line_split[i])) + data_matrix.append(data_array) + data_3dmatrix.append(data_matrix) + else: + data_array = [] + for i in range(len(line_split)): + 
data_array.append(float(line_split[i])) + data_matrix.append(data_array) + else: + raise ValueError("Condition not defined.") + para_dict['M'] = np.array(data_3dmatrix) + return para_dict # the ivector extractor parameter is a 3d matrix of shape [num-gaussian, feat-dim, ivec-dim] diff --git a/egs/callhome_diarization/v1/diarization/extract_ivectors.sh b/egs/callhome_diarization/v1/diarization/extract_ivectors.sh index 370a37b873e..d7bb389bad5 100755 --- a/egs/callhome_diarization/v1/diarization/extract_ivectors.sh +++ b/egs/callhome_diarization/v1/diarization/extract_ivectors.sh @@ -29,6 +29,10 @@ min_post=0.025 # Minimum posterior to use (posteriors below this are pruned out) posterior_scale=1.0 # This scale helps to control for successve features being highly # correlated. E.g. try 0.1 or 0.3. apply_cmn=true # If true, apply sliding window cepstral mean normalization +apply_deltas=true # If true, copy the delta options from the i-vector extractor directory. + # If false, we won't add deltas in this step. For speaker diarization, + # we sometimes need to write features to disk that already have various + # post-processing applied so adding deltas is no longer needed in this stage. # End configuration section. echo "$0 $@" # Print the command line for logging @@ -57,6 +61,12 @@ if [ $# != 3 ]; then echo " --min-post # Pruning threshold for posteriors" echo " --apply-cmn # if true, apply sliding window cepstral mean" echo " # normalization to features" + echo " --apply-deltas # If true, copy the delta options from the i-vector" + echo " # extractor directory. If false, we won't add deltas" + echo " # in this step. For speaker diarization, we sometimes" + echo " # need to write features to disk that already have" + echo " # various post-processing applied so adding deltas is" + echo " # no longer needed in this stage." exit 1; fi @@ -82,7 +92,7 @@ if [ $stage -le 0 ]; then fi utils/data/get_uniform_subsegments.py \ --max-segment-duration=$window \ - --overlap-duration=$(echo "$window-$period" | bc) \ + --overlap-duration=$(perl -e "print $window-$period") \ --max-remaining-duration=$min_segment \ --constant-duration=True \ $segments > $dir/subsegments @@ -95,7 +105,11 @@ mkdir -p $dir/log sub_sdata=$sub_data/split$nj; utils/split_data.sh $sub_data $nj || exit 1; -delta_opts=`cat $srcdir/delta_opts 2>/dev/null` +if $apply_deltas; then + delta_opts=`cat $srcdir/delta_opts 2>/dev/null` +else + delta_opts="--delta-order=0" +fi ## Set up features. if $apply_cmn; then diff --git a/egs/callhome_diarization/v1/diarization/make_rttm.py b/egs/callhome_diarization/v1/diarization/make_rttm.py index 1705411069f..fc32eafd530 100755 --- a/egs/callhome_diarization/v1/diarization/make_rttm.py +++ b/egs/callhome_diarization/v1/diarization/make_rttm.py @@ -34,9 +34,7 @@ import argparse import sys - -sys.path.append('steps/libs') -import common as common_lib +import codecs def get_args(): @@ -51,6 +49,9 @@ def get_args(): help="Input labels file") parser.add_argument("rttm_file", type=str, help="Output RTTM file") + parser.add_argument("--rttm-channel", type=int, default=0, + help="The value passed into the RTTM channel field. 
\ + Only affects the format of the RTTM file.") args = parser.parse_args() return args @@ -60,14 +61,14 @@ def main(): # File containing speaker labels per segment seg2label = {} - with common_lib.smart_open(args.labels) as labels_file: + with codecs.open(args.labels, 'r', 'utf-8') as labels_file: for line in labels_file: seg, label = line.strip().split() seg2label[seg] = label # Segments file reco2segs = {} - with common_lib.smart_open(args.segments) as segments_file: + with codecs.open(args.segments, 'r', 'utf-8') as segments_file: for line in segments_file: seg, reco, start, end = line.strip().split() try: @@ -80,7 +81,7 @@ def main(): # Cut up overlapping segments so they are contiguous contiguous_segs = [] - for reco in reco2segs: + for reco in sorted(reco2segs): segs = reco2segs[reco].strip().split() new_segs = "" for i in range(1, len(segs)-1): @@ -114,14 +115,14 @@ def main(): new_segs += " " + start + "," + end + "," + label merged_segs.append(reco + new_segs) - with common_lib.smart_open(args.rttm_file, 'w') as rttm_writer: + with codecs.open(args.rttm_file, 'w', 'utf-8') as rttm_writer: for reco_line in merged_segs: segs = reco_line.strip().split() reco = segs[0] for i in range(1, len(segs)): start, end, label = segs[i].strip().split(',') - print("SPEAKER {0} 0 {1:7.3f} {2:7.3f} {3} ".format( - reco, float(start), float(end)-float(start), label), file=rttm_writer) + print("SPEAKER {0} {1} {2:7.3f} {3:7.3f} {4} ".format( + reco, args.rttm_channel, float(start), float(end)-float(start), label), file=rttm_writer) if __name__ == '__main__': main() diff --git a/egs/callhome_diarization/v1/diarization/nnet3/xvector/extract_xvectors.sh b/egs/callhome_diarization/v1/diarization/nnet3/xvector/extract_xvectors.sh index d7591a6a3a8..8d579138c73 100755 --- a/egs/callhome_diarization/v1/diarization/nnet3/xvector/extract_xvectors.sh +++ b/egs/callhome_diarization/v1/diarization/nnet3/xvector/extract_xvectors.sh @@ -102,7 +102,7 @@ if [ $stage -le 0 ]; then fi utils/data/get_uniform_subsegments.py \ --max-segment-duration=$window \ - --overlap-duration=$(echo "$window-$period" | bc) \ + --overlap-duration=$(perl -e "print ($window-$period);") \ --max-remaining-duration=$min_segment \ --constant-duration=True \ $segments > $dir/subsegments diff --git a/egs/callhome_diarization/v1/diarization/train_ivector_extractor_diag.sh b/egs/callhome_diarization/v1/diarization/train_ivector_extractor_diag.sh new file mode 100755 index 00000000000..9254012f3b0 --- /dev/null +++ b/egs/callhome_diarization/v1/diarization/train_ivector_extractor_diag.sh @@ -0,0 +1,151 @@ +#!/bin/bash + +# Copyright 2013 Daniel Povey +# 2014 David Snyder +# 2019 Zili Huang +# Apache 2.0. + +# This script trains the i-vector extractor for VB resegmentation. It is very similar to +# sid/train_ivector_extractor.sh except that the UBM is assumed to be diagonal in this script. + +# Begin configuration section. +nj=10 # this is the number of separate queue jobs we run, but each one + # contains num_processes sub-jobs.. the real number of threads we + # run is nj * num_processes * num_threads, and the number of + # separate pieces of data is nj * num_processes. +num_threads=4 +num_processes=4 # each job runs this many processes, each with --num-threads threads +cmd="run.pl" +stage=-4 +num_gselect=20 # Gaussian-selection using diagonal model: number of Gaussians to select +ivector_dim=400 # dimension of the extracted i-vector +use_weights=false # set to true to turn on the regression of log-weights on the ivector. 
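make_rttm.py above now writes the value passed via --rttm-channel into the second field of each SPEAKER record. A sketch of one record in the standard NIST RTTM layout (recording id, channel, times and speaker label are invented):

reco, channel, start, end, label = "iaaa", 1, 0.0, 7.44, "3"
line = "SPEAKER {0} {1} {2:7.3f} {3:7.3f} <NA> <NA> {4} <NA> <NA>".format(
    reco, channel, start, end - start, label)
print(line)   # SPEAKER iaaa 1   0.000   7.440 <NA> <NA> 3 <NA> <NA>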
+num_iters=10 +min_post=0.025 # Minimum posterior to use (posteriors below this are pruned out) +num_samples_for_weights=3 # smaller than the default for speed (relates to a sampling method) +cleanup=true +apply_cmn=true # If true, apply sliding window cepstral mean normalization +posterior_scale=1.0 # This scale helps to control for successve features being highly + # correlated. E.g. try 0.1 or 0.3 +sum_accs_opt= +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + + +if [ $# != 3 ]; then + echo "Usage: $0 " + echo " e.g.: $0 exp/ubm_2048_male/final.dubm data/train_male exp/extractor_male" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --num-iters <#iters|10> # Number of iterations of E-M" + echo " --nj # Number of jobs (also see num-processes and num-threads)" + echo " --num-processes # Number of processes for each queue job (relates" + echo " # to summing accs in memory)" + echo " --num-threads # Number of threads for each process (can't be usefully" + echo " # increased much above 4)" + echo " --stage # To control partial reruns" + echo " --num-gselect # Number of Gaussians to select using" + echo " # diagonal model." + echo " --sum-accs-opt # Option e.g. '-l hostname=a15' to localize" + echo " # sum-accs process to nfs server." + echo " --apply-cmn # if true, apply sliding window cepstral mean" + echo " # normalization to features" + exit 1; +fi + +gmm_model=$1 +data=$2 +dir=$3 +srcdir=$(dirname $gmm_model) + +for f in $gmm_model $data/feats.scp ; do + [ ! -f $f ] && echo "No such file $f" && exit 1; +done + +# Set various variables. +mkdir -p $dir/log +nj_full=$[$nj*$num_processes] +sdata=$data/split$nj_full; +utils/split_data.sh $data $nj_full || exit 1; + +delta_opts=`cat $srcdir/delta_opts 2>/dev/null` +if [ -f $srcdir/delta_opts ]; then + cp $srcdir/delta_opts $dir/ 2>/dev/null +fi + +parallel_opts="--num-threads $[$num_threads*$num_processes]" +## Set up features. +if $apply_cmn; then + feats="ark,s,cs:add-deltas $delta_opts scp:$sdata/JOB/feats.scp ark:- | apply-cmvn-sliding --norm-vars=false --center=true --cmn-window=300 ark:- ark:- | select-voiced-frames ark:- scp,s,cs:$sdata/JOB/vad.scp ark:- |" +else + feats="ark,s,cs:add-deltas $delta_opts scp:$sdata/JOB/feats.scp ark:- | select-voiced-frames ark:- scp,s,cs:$sdata/JOB/vad.scp ark:- |" +fi + +# Initialize the i-vector extractor using the FGMM input +if [ $stage -le -2 ]; then + cp $gmm_model $dir/final.dubm || exit 1; + $cmd $dir/log/init.log \ + ivector-extractor-init --ivector-dim=$ivector_dim --use-weights=$use_weights \ + "gmm-global-to-fgmm $dir/final.dubm -|" $dir/0.ie || exit 1 +fi + +# Do Gaussian selection and posterior extracion + +if [ $stage -le -1 ]; then + echo $nj_full > $dir/num_jobs + echo "$0: doing Gaussian selection and posterior computation" + $cmd JOB=1:$nj_full $dir/log/gselect.JOB.log \ + gmm-global-get-post --n=$num_gselect --min-post=$min_post $dir/final.dubm "$feats" ark:- \| \ + scale-post ark:- $posterior_scale "ark:|gzip -c >$dir/post.JOB.gz" || exit 1; +else + if ! 
[ $nj_full -eq $(cat $dir/num_jobs) ]; then + echo "Num-jobs mismatch $nj_full versus $(cat $dir/num_jobs)" + exit 1 + fi +fi + +x=0 +while [ $x -lt $num_iters ]; do + if [ $stage -le $x ]; then + rm $dir/.error 2>/dev/null + + Args=() # bash array of training commands for 1:nj, that put accs to stdout. + for j in $(seq $nj_full); do + Args[$j]=`echo "ivector-extractor-acc-stats --num-threads=$num_threads --num-samples-for-weights=$num_samples_for_weights $dir/$x.ie '$feats' 'ark,s,cs:gunzip -c $dir/post.JOB.gz|' -|" | sed s/JOB/$j/g` + done + + echo "Accumulating stats (pass $x)" + for g in $(seq $nj); do + start=$[$num_processes*($g-1)+1] + $cmd $parallel_opts $dir/log/acc.$x.$g.log \ + ivector-extractor-sum-accs --parallel=true "${Args[@]:$start:$num_processes}" \ + $dir/acc.$x.$g || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "Error accumulating stats on iteration $x" && exit 1; + accs="" + for j in $(seq $nj); do + accs+="$dir/acc.$x.$j " + done + echo "Summing accs (pass $x)" + $cmd $sum_accs_opt $dir/log/sum_acc.$x.log \ + ivector-extractor-sum-accs $accs $dir/acc.$x || exit 1; + echo "Updating model (pass $x)" + nt=$[$num_threads*$num_processes] # use the same number of threads that + # each accumulation process uses, since we + # can be sure the queue will support this many. + $cmd $parallel_opts $dir/log/update.$x.log \ + ivector-extractor-est --num-threads=$nt $dir/$x.ie $dir/acc.$x $dir/$[$x+1].ie || exit 1; + rm $dir/acc.$x.* + $cleanup && rm $dir/acc.$x $dir/$x.ie + fi + x=$[$x+1] +done +$cleanup && rm -f $dir/post.*.gz +rm -f $dir/final.ie +ln -s $x.ie $dir/final.ie diff --git a/egs/callhome_diarization/v1/local/make_callhome.sh b/egs/callhome_diarization/v1/local/make_callhome.sh index caa8f679f22..21411fb6194 100755 --- a/egs/callhome_diarization/v1/local/make_callhome.sh +++ b/egs/callhome_diarization/v1/local/make_callhome.sh @@ -70,4 +70,9 @@ utils/filter_scp.pl $data_dir/callhome1/wav.scp $data_dir/callhome/reco2num_spk utils/filter_scp.pl $data_dir/callhome2/wav.scp $data_dir/callhome/reco2num_spk \ > $data_dir/callhome2/reco2num_spk +rm $data_dir/callhome/segments || exit 1; +awk '{print $1, $1}' $data_dir/callhome/wav.scp > $data_dir/callhome/utt2spk +utils/utt2spk_to_spk2utt.pl $data_dir/callhome/utt2spk > $data_dir/callhome/spk2utt +utils/fix_data_dir.sh $data_dir/callhome + rm -rf $tmp_dir 2> /dev/null diff --git a/egs/callhome_diarization/v1/local/make_musan.py b/egs/callhome_diarization/v1/local/make_musan.py deleted file mode 100755 index b3f6652ba40..00000000000 --- a/egs/callhome_diarization/v1/local/make_musan.py +++ /dev/null @@ -1,119 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2015 David Snyder -# Apache 2.0. -# -# This file is meant to be invoked by make_musan.sh. 
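In train_ivector_extractor_diag.sh above, the data is split into nj * num_processes pieces and each of the nj queue jobs sums num_processes accumulators in memory. The grouping can be sketched in Python as follows (0-based indexing here, whereas the bash arrays above are 1-based; the job counts are examples):

nj, num_processes = 4, 4
nj_full = nj * num_processes          # number of pieces the data was split into

args = ["acc-from-split-{}".format(j) for j in range(1, nj_full + 1)]
for g in range(1, nj + 1):
    start = num_processes * (g - 1)
    group = args[start:start + num_processes]
    print("queue job", g, "sums", group)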
- -import os, sys - -def process_music_annotations(path): - utt2spk = {} - utt2vocals = {} - lines = open(path, 'r').readlines() - for line in lines: - utt, genres, vocals, musician = line.rstrip().split()[:4] - # For this application, the musican ID isn't important - utt2spk[utt] = utt - utt2vocals[utt] = vocals == "Y" - return utt2spk, utt2vocals - -def prepare_music(root_dir, use_vocals): - utt2vocals = {} - utt2spk = {} - utt2wav = {} - num_good_files = 0 - num_bad_files = 0 - music_dir = os.path.join(root_dir, "music") - for root, dirs, files in os.walk(music_dir): - for file in files: - file_path = os.path.join(root, file) - if file.endswith(".wav"): - utt = str(file).replace(".wav", "") - utt2wav[utt] = file_path - elif str(file) == "ANNOTATIONS": - utt2spk_part, utt2vocals_part = process_music_annotations(file_path) - utt2spk.update(utt2spk_part) - utt2vocals.update(utt2vocals_part) - utt2spk_str = "" - utt2wav_str = "" - for utt in utt2vocals: - if utt in utt2wav: - if use_vocals or not utt2vocals[utt]: - utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" - utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r 8k -t wav - |\n" - num_good_files += 1 - else: - print("Missing file", utt) - num_bad_files += 1 - print("In music directory, processed", num_good_files, "files;", num_bad_files, "had missing wav data") - return utt2spk_str, utt2wav_str - -def prepare_speech(root_dir): - utt2spk = {} - utt2wav = {} - num_good_files = 0 - num_bad_files = 0 - speech_dir = os.path.join(root_dir, "speech") - for root, dirs, files in os.walk(speech_dir): - for file in files: - file_path = os.path.join(root, file) - if file.endswith(".wav"): - utt = str(file).replace(".wav", "") - utt2wav[utt] = file_path - utt2spk[utt] = utt - utt2spk_str = "" - utt2wav_str = "" - for utt in utt2spk: - if utt in utt2wav: - utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" - utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r 8k -t wav - |\n" - num_good_files += 1 - else: - print("Missing file", utt) - num_bad_files += 1 - print("In speech directory, processed", num_good_files, "files;", num_bad_files, "had missing wav data") - return utt2spk_str, utt2wav_str - -def prepare_noise(root_dir): - utt2spk = {} - utt2wav = {} - num_good_files = 0 - num_bad_files = 0 - noise_dir = os.path.join(root_dir, "noise") - for root, dirs, files in os.walk(noise_dir): - for file in files: - file_path = os.path.join(root, file) - if file.endswith(".wav"): - utt = str(file).replace(".wav", "") - utt2wav[utt] = file_path - utt2spk[utt] = utt - utt2spk_str = "" - utt2wav_str = "" - for utt in utt2spk: - if utt in utt2wav: - utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" - utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r 8k -t wav - |\n" - num_good_files += 1 - else: - print("Missing file", utt) - num_bad_files += 1 - print("In noise directory, processed", num_good_files, "files;", num_bad_files, "had missing wav data") - return utt2spk_str, utt2wav_str - -def main(): - in_dir = sys.argv[1] - out_dir = sys.argv[2] - use_vocals = sys.argv[3] == "Y" - utt2spk_music, utt2wav_music = prepare_music(in_dir, use_vocals) - utt2spk_speech, utt2wav_speech = prepare_speech(in_dir) - utt2spk_noise, utt2wav_noise = prepare_noise(in_dir) - utt2spk = utt2spk_speech + utt2spk_music + utt2spk_noise - utt2wav = utt2wav_speech + utt2wav_music + utt2wav_noise - wav_fi = open(os.path.join(out_dir, "wav.scp"), 'w') - wav_fi.write(utt2wav) - utt2spk_fi = 
open(os.path.join(out_dir, "utt2spk"), 'w') - utt2spk_fi.write(utt2spk) - - -if __name__=="__main__": - main() diff --git a/egs/callhome_diarization/v1/local/make_musan.sh b/egs/callhome_diarization/v1/local/make_musan.sh deleted file mode 100755 index 694940ad70f..00000000000 --- a/egs/callhome_diarization/v1/local/make_musan.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash -# Copyright 2015 David Snyder -# Apache 2.0. -# -# This script, called by ../run.sh, creates the MUSAN -# data directory. The required dataset is freely available at -# http://www.openslr.org/17/ - -set -e -in_dir=$1 -data_dir=$2 -use_vocals='Y' - -mkdir -p local/musan.tmp - -echo "Preparing ${data_dir}/musan..." -mkdir -p ${data_dir}/musan -local/make_musan.py ${in_dir} ${data_dir}/musan ${use_vocals} - -utils/fix_data_dir.sh ${data_dir}/musan - -grep "music" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_music -grep "speech" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_speech -grep "noise" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_noise -utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_music \ - ${data_dir}/musan ${data_dir}/musan_music -utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_speech \ - ${data_dir}/musan ${data_dir}/musan_speech -utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_noise \ - ${data_dir}/musan ${data_dir}/musan_noise - -utils/fix_data_dir.sh ${data_dir}/musan_music -utils/fix_data_dir.sh ${data_dir}/musan_speech -utils/fix_data_dir.sh ${data_dir}/musan_noise - -rm -rf local/musan.tmp - diff --git a/egs/callhome_diarization/v1/local/make_swbd2_phase1.pl b/egs/callhome_diarization/v1/local/make_swbd2_phase1.pl new file mode 100755 index 00000000000..71b26b55de5 --- /dev/null +++ b/egs/callhome_diarization/v1/local/make_swbd2_phase1.pl @@ -0,0 +1,106 @@ +#!/usr/bin/perl +use warnings; #sed replacement for -w perl parameter +# +# Copyright 2017 David Snyder +# Apache 2.0 + +if (@ARGV != 2) { + print STDERR "Usage: $0 \n"; + print STDERR "e.g. $0 /export/corpora3/LDC/LDC98S75 data/swbd2_phase1_train\n"; + exit(1); +} +($db_base, $out_dir) = @ARGV; + +if (system("mkdir -p $out_dir")) { + die "Error making directory $out_dir"; +} + +open(CS, "<$db_base/doc/callstat.tbl") || die "Could not open $db_base/doc/callstat.tbl"; +open(GNDR, ">$out_dir/spk2gender") || die "Could not open the output file $out_dir/spk2gender"; +open(SPKR, ">$out_dir/utt2spk") || die "Could not open the output file $out_dir/utt2spk"; +open(WAV, ">$out_dir/wav.scp") || die "Could not open the output file $out_dir/wav.scp"; + +@badAudio = ("3", "4"); + +$tmp_dir = "$out_dir/tmp"; +if (system("mkdir -p $tmp_dir") != 0) { + die "Error making directory $tmp_dir"; +} + +if (system("find $db_base -name '*.sph' > $tmp_dir/sph.list") != 0) { + die "Error getting list of sph files"; +} + +open(WAVLIST, "<$tmp_dir/sph.list") or die "cannot open wav list"; + +%wavs = (); +while() { + chomp; + $sph = $_; + @t = split("/",$sph); + @t1 = split("[./]",$t[$#t]); + $uttId = $t1[0]; + $wavs{$uttId} = $sph; +} + +while () { + $line = $_ ; + @A = split(",", $line); + @A1 = split("[./]",$A[0]); + $wav = $A1[0]; + if (/$wav/i ~~ @badAudio) { + # do nothing + print "Bad Audio = $wav"; + } else { + $spkr1= "sw_" . $A[2]; + $spkr2= "sw_" . 
$A[3]; + $gender1 = $A[5]; + $gender2 = $A[6]; + if ($gender1 eq "M") { + $gender1 = "m"; + } elsif ($gender1 eq "F") { + $gender1 = "f"; + } else { + die "Unknown Gender in $line"; + } + if ($gender2 eq "M") { + $gender2 = "m"; + } elsif ($gender2 eq "F") { + $gender2 = "f"; + } else { + die "Unknown Gender in $line"; + } + if (-e "$wavs{$wav}") { + $uttId = $spkr1 ."_" . $wav ."_1"; + if (!$spk2gender{$spkr1}) { + $spk2gender{$spkr1} = $gender1; + print GNDR "$spkr1"," $gender1\n"; + } + print WAV "$uttId"," sph2pipe -f wav -p -c 1 $wavs{$wav} |\n"; + print SPKR "$uttId"," $spkr1","\n"; + + $uttId = $spkr2 . "_" . $wav ."_2"; + if (!$spk2gender{$spkr2}) { + $spk2gender{$spkr2} = $gender2; + print GNDR "$spkr2"," $gender2\n"; + } + print WAV "$uttId"," sph2pipe -f wav -p -c 2 $wavs{$wav} |\n"; + print SPKR "$uttId"," $spkr2","\n"; + } else { + print STDERR "Missing $wavs{$wav} for $wav\n"; + } + } +} + +close(WAV) || die; +close(SPKR) || die; +close(GNDR) || die; +if (system("utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) { + die "Error creating spk2utt file in directory $out_dir"; +} +if (system("utils/fix_data_dir.sh $out_dir") != 0) { + die "Error fixing data dir $out_dir"; +} +if (system("utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) { + die "Error validating directory $out_dir"; +} diff --git a/egs/callhome_diarization/v1/run.sh b/egs/callhome_diarization/v1/run.sh index acc48bd24f9..f4652c0c0ef 100755 --- a/egs/callhome_diarization/v1/run.sh +++ b/egs/callhome_diarization/v1/run.sh @@ -188,7 +188,7 @@ if [ $stage -le 6 ]; then der=$(grep -oP 'DIARIZATION\ ERROR\ =\ \K[0-9]+([.][0-9]+)?' \ exp/tuning/${dataset}_t${threshold}) - if [ $(echo $der'<'$best_der | bc -l) -eq 1 ]; then + if [ $(perl -e "print ($der < $best_der ? 1 : 0);") -eq 1 ]; then best_der=$der best_threshold=$threshold fi diff --git a/egs/callhome_diarization/v2/run.sh b/egs/callhome_diarization/v2/run.sh index 4f730d4753c..85a2c7fdf2b 100755 --- a/egs/callhome_diarization/v2/run.sh +++ b/egs/callhome_diarization/v2/run.sh @@ -19,6 +19,8 @@ vaddir=`pwd`/mfcc data_root=/export/corpora5/LDC stage=0 nnet_dir=exp/xvector_nnet_1a/ +num_components=1024 # the number of UBM components (used for VB resegmentation) +ivector_dim=400 # the dimension of i-vector (used for VB resegmentation) # Prepare datasets if [ $stage -le 0 ]; then @@ -53,7 +55,7 @@ if [ $stage -le 1 ]; then # callhome1 and callhome2. Each partition is treated like a held-out # dataset, and used to estimate various quantities needed to perform # diarization on the other part (and vice versa). - for name in train callhome1 callhome2; do + for name in train callhome1 callhome2 callhome; do steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 40 \ --cmd "$train_cmd" --write-utt2num-frames true \ data/$name exp/make_mfcc $mfccdir @@ -115,7 +117,7 @@ if [ $stage -le 2 ]; then # Make a reverberated version of the SWBD+SRE list. Note that we don't add any # additive noise here. - python steps/data/reverberate_data_dir.py \ + steps/data/reverberate_data_dir.py \ "${rvb_opts[@]}" \ --speech-rvb-probability 1 \ --pointsource-noise-addition-probability 0 \ @@ -130,7 +132,7 @@ if [ $stage -le 2 ]; then # Prepare the MUSAN corpus, which consists of music, speech, and noise # suitable for augmentation. - local/make_musan.sh /export/corpora/JHU/musan data + steps/data/make_musan.sh --sampling-rate 8000 /export/corpora/JHU/musan data # Get the duration of the MUSAN recordings. This will be used by the # script augment_data_dir.py. 
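The tuning loops in these run.sh scripts pull the DER out of md-eval.pl output with grep -oP and now compare floating-point values via perl instead of bc. An equivalent sketch of that bookkeeping in Python (the log line and the threshold sweep values are invented for illustration):

import re

log_line = " OVERALL SPEAKER DIARIZATION ERROR = 7.12 percent of scored speaker time"
m = re.search(r'DIARIZATION ERROR = ([0-9]+(?:\.[0-9]+)?)', log_line)
der = float(m.group(1))
print(der)   # 7.12

best_der, best_threshold = 100.0, None
for threshold, der in [(-0.3, 8.1), (-0.2, 7.5), (-0.1, 7.9)]:
    if der < best_der:
        best_der, best_threshold = der, threshold
print(best_der, best_threshold)   # 7.5 -0.2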
@@ -140,11 +142,11 @@ if [ $stage -le 2 ]; then done # Augment with musan_noise - python steps/data/augment_data_dir.py --utt-suffix "noise" --fg-interval 1 --fg-snrs "15:10:5:0" --fg-noise-dir "data/musan_noise" data/train data/train_noise + steps/data/augment_data_dir.py --utt-suffix "noise" --fg-interval 1 --fg-snrs "15:10:5:0" --fg-noise-dir "data/musan_noise" data/train data/train_noise # Augment with musan_music - python steps/data/augment_data_dir.py --utt-suffix "music" --bg-snrs "15:10:8:5" --num-bg-noises "1" --bg-noise-dir "data/musan_music" data/train data/train_music + steps/data/augment_data_dir.py --utt-suffix "music" --bg-snrs "15:10:8:5" --num-bg-noises "1" --bg-noise-dir "data/musan_music" data/train data/train_music # Augment with musan_speech - python steps/data/augment_data_dir.py --utt-suffix "babble" --bg-snrs "20:17:15:13" --num-bg-noises "3:4:5:6:7" --bg-noise-dir "data/musan_speech" data/train data/train_babble + steps/data/augment_data_dir.py --utt-suffix "babble" --bg-snrs "20:17:15:13" --num-bg-noises "3:4:5:6:7" --bg-noise-dir "data/musan_speech" data/train data/train_babble # Combine reverb, noise, music, and babble into one directory. utils/combine_data.sh data/train_aug data/train_reverb data/train_noise data/train_music data/train_babble @@ -297,7 +299,7 @@ if [ $stage -le 10 ]; then der=$(grep -oP 'DIARIZATION\ ERROR\ =\ \K[0-9]+([.][0-9]+)?' \ $nnet_dir/tuning/${dataset}_t${threshold}) - if [ $(echo $der'<'$best_der | bc -l) -eq 1 ]; then + if [ $(perl -e "print ($der < $best_der ? 1 : 0);") -eq 1 ]; then best_der=$der best_threshold=$threshold fi @@ -356,3 +358,47 @@ if [ $stage -le 11 ]; then # Compare to 8.69% in ../v1/run.sh echo "Using the oracle number of speakers, DER: $der%" fi + +# Variational Bayes resegmentation using the code from Brno University of Technology +# Please see https://speech.fit.vutbr.cz/software/vb-diarization-eigenvoice-and-hmm-priors +# for details +if [ $stage -le 12 ]; then + utils/subset_data_dir.sh data/train 32000 data/train_32k + # Train the diagonal UBM. + sid/train_diag_ubm.sh --cmd "$train_cmd --mem 20G" \ + --nj 40 --num-threads 8 --subsample 1 --delta-order 0 --apply-cmn false \ + data/train_32k $num_components exp/diag_ubm_$num_components + + # Train the i-vector extractor. The UBM is assumed to be diagonal. + diarization/train_ivector_extractor_diag.sh \ + --cmd "$train_cmd --mem 35G" \ + --ivector-dim $ivector_dim --num-iters 5 --apply-cmn false \ + --num-threads 1 --num-processes 1 --nj 40 \ + exp/diag_ubm_$num_components/final.dubm data/train \ + exp/extractor_diag_c${num_components}_i${ivector_dim} +fi + +if [ $stage -le 13 ]; then + output_rttm_dir=exp/VB/rttm + mkdir -p $output_rttm_dir || exit 1; + cat $nnet_dir/xvectors_callhome1/plda_scores/rttm \ + $nnet_dir/xvectors_callhome2/plda_scores/rttm > $output_rttm_dir/x_vector_rttm + init_rttm_file=$output_rttm_dir/x_vector_rttm + + # VB resegmentation. In this script, I use the x-vector result to + # initialize the VB system. You can also use i-vector result or random + # initize the VB system. The following script uses kaldi_io. 
+ # You could use `sh ../../../tools/extras/install_kaldi_io.sh` to install it + diarization/VB_resegmentation.sh --nj 20 --cmd "$train_cmd --mem 10G" \ + --initialize 1 data/callhome $init_rttm_file exp/VB \ + exp/diag_ubm_$num_components/final.dubm exp/extractor_diag_c${num_components}_i${ivector_dim}/final.ie || exit 1; + + # Compute the DER after VB resegmentation + mkdir -p exp/VB/results || exit 1; + md-eval.pl -1 -c 0.25 -r data/callhome/fullref.rttm -s $output_rttm_dir/VB_rttm 2> exp/VB/log/VB_DER.log \ + > exp/VB/results/VB_DER.txt + der=$(grep -oP 'DIARIZATION\ ERROR\ =\ \K[0-9]+([.][0-9]+)?' \ + exp/VB/results/VB_DER.txt) + # After VB resegmentation, DER: 6.48% + echo "After VB resegmentation, DER: $der%" +fi diff --git a/egs/callhome_egyptian/s5/local/callhome_prepare_dict.sh b/egs/callhome_egyptian/s5/local/callhome_prepare_dict.sh index 62bca974e53..d9faa97f266 100755 --- a/egs/callhome_egyptian/s5/local/callhome_prepare_dict.sh +++ b/egs/callhome_egyptian/s5/local/callhome_prepare_dict.sh @@ -54,9 +54,8 @@ cat $dir/silence_phones.txt| awk '{printf("%s ", $1);} END{printf "\n";}' > \ $dir/extra_questions.txt || exit 1; # Add prons for laughter, noise, oov -for w in `grep -v sil $dir/silence_phones.txt`; do -sed -i "/\[$w\]/d" $tmpdir/lexicon.3 -done +w=$(grep -v sil $dir/silence_phones.txt | tr '\n' '|') +perl -i -ne "print unless /\[(${w%?})\]/" $tmpdir/lexicon.3 for w in `grep -v sil $dir/silence_phones.txt`; do echo "[$w] $w" diff --git a/egs/callhome_egyptian/s5/local/convert_symtable_to_utf.py b/egs/callhome_egyptian/s5/local/convert_symtable_to_utf.py index f5b69a1ff86..7192ff7a1cc 100644 --- a/egs/callhome_egyptian/s5/local/convert_symtable_to_utf.py +++ b/egs/callhome_egyptian/s5/local/convert_symtable_to_utf.py @@ -1,3 +1,4 @@ +from __future__ import print_function #!/usr/bin/env py # Converts a romanized ECA word list (symbol table) to @@ -7,9 +8,9 @@ import codecs if len(sys.argv) < 3: - print "USAGE: local/convert_symtable_to_utf.py [SYMTABLE] [ECA-LEXICON]" - print "E.g., local/convert_symtable_to_utf.py data/lang/words.txt \ - /export/corpora/LDC/LDC99L22" + print("USAGE: local/convert_symtable_to_utf.py [SYMTABLE] [ECA-LEXICON]") + print("E.g., local/convert_symtable_to_utf.py data/lang/words.txt \ + /export/corpora/LDC/LDC99L22") sys.exit(1) # Note that the ECA lexicon's default encoding is ISO-8859-6, not UTF8 diff --git a/egs/callhome_egyptian/s5/local/ctm.sh b/egs/callhome_egyptian/s5/local/ctm.sh index 14056b7a44b..64a7cf0d4f6 100755 --- a/egs/callhome_egyptian/s5/local/ctm.sh +++ b/egs/callhome_egyptian/s5/local/ctm.sh @@ -18,9 +18,9 @@ fi steps/get_ctm.sh $data_dir $lang_dir $decode_dir # Make sure that channel markers match -#sed -i "s:\s.*_fsp-([AB]): \1:g" data/dev/stm -#ls exp/tri5a/decode_dev/score_*/dev.ctm | xargs -I {} sed -i -r 's:fsp\s1\s:fsp A :g' {} -#ls exp/tri5a/decode_dev/score_*/dev.ctm | xargs -I {} sed -i -r 's:fsp\s2\s:fsp B :g' {} +#perl -i -pe "s:\s.*_fsp-([AB]): \1:g" data/dev/stm +#ls exp/tri5a/decode_dev/score_*/dev.ctm | xargs -I {} perl -i -pe 's:fsp\s1\s:fsp A :g' {} +#ls exp/tri5a/decode_dev/score_*/dev.ctm | xargs -I {} perl -i -pe 's:fsp\s2\s:fsp B :g' {} # Get the environment variables . 
/export/babel/data/software/env.sh diff --git a/egs/callhome_egyptian/s5/local/splits/get_conversation.py b/egs/callhome_egyptian/s5/local/splits/get_conversation.py index c999d3e597e..80f66174e2b 100755 --- a/egs/callhome_egyptian/s5/local/splits/get_conversation.py +++ b/egs/callhome_egyptian/s5/local/splits/get_conversation.py @@ -1,5 +1,6 @@ #!/usr/bin/env python +from __future__ import print_function import os import re @@ -37,14 +38,14 @@ evaltest[pathComponents[12]] = numberOfConversations testConv = testConv + numberOfConversations -print "==============Train===============" -print train -print "Total Conversations in train = " + str(trainConv) -print "==============Dev===============" -print devtest -print "Total Conversations in dev = " + str(devConv) -print "==============Test===============" -print evaltest -print "Total Conversations in test = " + str(testConv) -print "=================================" -print "Total Conversations in Corpus = " + str(trainConv + devConv + testConv) +print("==============Train===============") +print(train) +print("Total Conversations in train = {}".format(trainConv)) +print("==============Dev===============") +print(devtest) +print("Total Conversations in dev = {}".format(devConv)) +print("==============Test===============") +print(evaltest) +print("Total Conversations in test = {}".format(testConv)) +print("=================================") +print("Total Conversations in Corpus = {}".format(trainConv + devConv + testConv)) diff --git a/egs/casia_hwdb/v1/cmd.sh b/egs/casia_hwdb/v1/cmd.sh new file mode 100755 index 00000000000..3c8eb9f93a5 --- /dev/null +++ b/egs/casia_hwdb/v1/cmd.sh @@ -0,0 +1,13 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export cmd="queue.pl" diff --git a/egs/casia_hwdb/v1/image b/egs/casia_hwdb/v1/image new file mode 120000 index 00000000000..6a4b3afeb09 --- /dev/null +++ b/egs/casia_hwdb/v1/image @@ -0,0 +1 @@ +../../cifar/v1/image \ No newline at end of file diff --git a/egs/casia_hwdb/v1/local/augment_data.sh b/egs/casia_hwdb/v1/local/augment_data.sh new file mode 100755 index 00000000000..1f13ed15ded --- /dev/null +++ b/egs/casia_hwdb/v1/local/augment_data.sh @@ -0,0 +1,37 @@ +#!/bin/bash +# Copyright 2018 Hossein Hadian +# 2018 Ashish Arora + +# Apache 2.0 +# This script performs data augmentation. + +nj=4 +cmd=run.pl +feat_dim=40 +fliplr=false +verticle_shift=0 +echo "$0 $@" + +. ./cmd.sh +. ./path.sh +. 
./utils/parse_options.sh || exit 1; + +srcdir=$1 +outdir=$2 +datadir=$3 + +mkdir -p $datadir/augmentations +echo "copying $srcdir to $datadir/augmentations/aug1, allowed length, creating feats.scp" + +for set in aug1; do + image/copy_data_dir.sh --spk-prefix $set- --utt-prefix $set- \ + $srcdir $datadir/augmentations/$set + cat $srcdir/allowed_lengths.txt > $datadir/augmentations/$set/allowed_lengths.txt + local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim $feat_dim \ + --vertical-shift $verticle_shift \ + --fliplr $fliplr --augment 'random_scale' $datadir/augmentations/$set +done + +echo " combine original data and data from different augmentations" +utils/combine_data.sh --extra-files images.scp $outdir $srcdir $datadir/augmentations/aug1 +cat $srcdir/allowed_lengths.txt > $outdir/allowed_lengths.txt diff --git a/egs/casia_hwdb/v1/local/chain/compare_wer.sh b/egs/casia_hwdb/v1/local/chain/compare_wer.sh new file mode 100755 index 00000000000..ab880c1adb5 --- /dev/null +++ b/egs/casia_hwdb/v1/local/chain/compare_wer.sh @@ -0,0 +1,67 @@ +#!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. local/chain/compare_wer.sh exp/chain/cnn{1a,1b} + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora + +if [ $# == 0 ]; then + echo "Usage: $0: [ ... ]" + echo "e.g.: $0 exp/chain/cnn{1a,1b}" + exit 1 +fi + +echo "# $0 $*" +used_epochs=false + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +echo -n "# WER " +for x in $*; do + wer=$(cat $x/decode_test/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "# CER " +for x in $*; do + cer=$(cat $x/decode_test/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. +fi + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train prob (xent) " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob (xent) " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo diff --git a/egs/casia_hwdb/v1/local/chain/run_cnn_e2eali_1b.sh b/egs/casia_hwdb/v1/local/chain/run_cnn_e2eali_1b.sh new file mode 100755 index 00000000000..300c8ae8e31 --- /dev/null +++ b/egs/casia_hwdb/v1/local/chain/run_cnn_e2eali_1b.sh @@ -0,0 +1,231 @@ +#!/bin/bash + +# e2eali_1b is the same as chainali_1a but uses the e2e chain model to get the +# lattice alignments and to build a tree + +# ./local/chain/compare_wer.sh exp_yomdle_chinese/chain/e2e_cnn_1a exp_yomdle_chinese/chain/cnn_e2eali_1b +# System e2e_cnn_1a cnn_e2eali_1b +# CER 15.44 13.57 +# Final train prob 0.0616 -0.0512 +# Final valid prob 0.0390 -0.0718 +# Final train prob (xent) -0.6199 +# Final valid prob (xent) -0.7448 + +set -e -o pipefail + +data_dir=data +exp_dir=exp + +stage=0 + +nj=30 +train_set=train +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. 
+affix=_1b #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +frame_subsampling_factor=4 +# training chunk-options +chunk_width=340,300,200,100 +num_leaves=2000 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +tdnn_dim=450 +# training options +srand=0 +remove_egs=true +lang_test=lang_test +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --acoustic-scale 1.0 \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + ${train_data_dir} $data_dir/lang $e2echain_model_dir $lat_dir + echo "" >$lat_dir/splice_opts + +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + fi + + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves ${train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + cnn_opts="l2-regularize=0.075" + tdnn_opts="l2-regularize=0.075" + output_opts="l2-regularize=0.1" + common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=32" + common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=128" + common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=512" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=180 name=input + + conv-relu-batchnorm-layer name=cnn1 height-in=60 height-out=60 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=60 height-out=60 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn3 height-in=60 height-out=30 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn4 height-in=30 height-out=30 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=30 height-out=30 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn6 height-in=30 height-out=15 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn7 height-in=15 height-out=15 time-offsets=-4,0,4 $common3 + conv-relu-batchnorm-layer name=cnn8 height-in=15 height-out=15 time-offsets=-4,0,4 $common3 + conv-relu-batchnorm-layer name=cnn9 height-in=15 height-out=15 time-offsets=-4,0,4 $common3 + relu-batchnorm-layer name=tdnn1 input=Append(-8,-4,0,4,8) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + 
relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=1500" \ + --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.alignment-subsampling-factor=1 \ + --chain.left-tolerance 3 \ + --chain.right-tolerance 3 \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=6 \ + --trainer.frames-per-iter=1000000 \ + --trainer.optimization.num-jobs-initial=4 \ + --trainer.optimization.num-jobs-final=8 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=16,8 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0 --constrained false" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=wait \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. 
+ + utils/mkgraph.sh \ + --self-loop-scale 1.0 $data_dir/$lang_test \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph $data_dir/test $dir/decode_test || exit 1; +fi diff --git a/egs/casia_hwdb/v1/local/chain/run_flatstart_cnn1a.sh b/egs/casia_hwdb/v1/local/chain/run_flatstart_cnn1a.sh new file mode 100755 index 00000000000..023fbff1c14 --- /dev/null +++ b/egs/casia_hwdb/v1/local/chain/run_flatstart_cnn1a.sh @@ -0,0 +1,170 @@ +#!/bin/bash +# Copyright 2017 Hossein Hadian + +# This script does end2end chain training (i.e. from scratch) + +# ./local/chain/compare_wer.sh exp_yomdle_chinese/chain/e2e_cnn_1a exp_yomdle_chinese/chain/cnn_e2eali_1b +# System e2e_cnn_1a cnn_e2eali_1b +# CER 15.44 13.57 +# Final train prob 0.0616 -0.0512 +# Final valid prob 0.0390 -0.0718 +# Final train prob (xent) -0.6199 +# Final valid prob (xent) -0.7448 + +set -e + +data_dir=data +exp_dir=exp + +# configs for 'chain' +stage=0 +nj=30 +train_stage=-10 +get_egs_stage=-10 +affix=1a + +# training options +tdnn_dim=450 +num_epochs=4 +num_jobs_initial=4 +num_jobs_final=8 +minibatch_size=150=64,32/300=32,16/600=16,8/1200=8,4 +common_egs_dir= +l2_regularize=0.00005 +frames_per_iter=1000000 +cmvn_opts="--norm-means=false --norm-vars=false" +train_set=train +lang_test=lang_test + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 1 ]; then + steps/nnet3/chain/e2e/prepare_e2e.sh --nj $nj --cmd "$cmd" \ + --shared-phones true \ + --type mono \ + $data_dir/$train_set $lang $treedir + $cmd $treedir/log/make_phone_lm.log \ + cat $data_dir/$train_set/text \| \ + steps/nnet3/chain/e2e/text_to_phones.py $data_dir/lang \| \ + utils/sym2int.pl -f 2- $data_dir/lang/phones.txt \| \ + chain-est-phone-lm --num-extra-lm-states=1500 \ + ark:- $treedir/phone_lm.fst +fi + +if [ $stage -le 2 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') + + cnn_opts="l2-regularize=0.075" + tdnn_opts="l2-regularize=0.075" + output_opts="l2-regularize=0.1" + + common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=32" + common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=128" + common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=512" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=180 name=input + conv-relu-batchnorm-layer name=cnn1 height-in=60 height-out=60 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=60 height-out=30 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=30 height-out=30 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=30 height-out=30 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=30 height-out=15 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 height-in=15 height-out=15 time-offsets=-4,0,4 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=15 height-out=15 time-offsets=-4,0,4 $common3 + relu-batchnorm-layer name=tdnn1 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $output_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs +fi + +if [ $stage -le 3 ]; then + # no need to store the egs in a shared storage because we always + # remove them. Anyway, it takes only 5 minutes to generate them. 
+ + steps/nnet3/chain/e2e/train_e2e.py --stage $train_stage \ + --cmd "$cmd" \ + --feat.cmvn-opts "$cmvn_opts" \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize $l2_regularize \ + --chain.apply-deriv-weights false \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--num_egs_diagnostic 100 --num_utts_subset 400" \ + --chain.frame-subsampling-factor 4 \ + --chain.alignment-subsampling-factor 4 \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter $frames_per_iter \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.momentum 0 \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.shrink-value 1.0 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --use-gpu=wait \ + --feat-dir $data_dir/${train_set} \ + --tree-dir $treedir \ + --dir $dir || exit 1; +fi + +if [ $stage -le 4 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 $data_dir/$lang_test \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 5 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$cmd" \ + $dir/graph $data_dir/test $dir/decode_test || exit 1; +fi + +echo "Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/casia_hwdb/v1/local/extract_database.sh b/egs/casia_hwdb/v1/local/extract_database.sh new file mode 100755 index 00000000000..1af3713d586 --- /dev/null +++ b/egs/casia_hwdb/v1/local/extract_database.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# Copyright 2018 Chun-Chieh Chang + +# The original format of the dataset given is GEDI and page images. +# This script is written to create line images from page images. +# It also creates csv files from the GEDI files. + +database_train=/export/corpora5/handwriting_ocr/CASIA_HWDB/Offline/ +database_competition=/export/corpora5/handwriting_ocr/CASIA_HWDB/Offline/ +cangjie_url=https://raw.githubusercontent.com/wanleung/libcangjie/master/tables/cj5-cc.txt +download_dir=download + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh || exit 1; + +mkdir -p ${download_dir}/{Train,Test} +for task in 0 1 2; do + for datasplit in Train Test; do + unzip -q -d ${download_dir}/${datasplit} ${database_train}/CASIA-HWDB2.${task}/${datasplit}_Dgr.zip + done +done + +unzip -q -d ${download_dir}/Competition ${database_competition}/competition-dgr.zip + +echo "Downloading table for CangJie." 
+wget -P $download_dir/ $cangjie_url || exit 1; +sed -ie '1,8d' $download_dir/cj5-cc.txt diff --git a/egs/casia_hwdb/v1/local/extract_features.sh b/egs/casia_hwdb/v1/local/extract_features.sh new file mode 100755 index 00000000000..f75837ae5b3 --- /dev/null +++ b/egs/casia_hwdb/v1/local/extract_features.sh @@ -0,0 +1,45 @@ +#!/bin/bash +# Copyright 2017 Yiwen Shao +# 2018 Ashish Arora + +nj=4 +cmd=run.pl +feat_dim=40 +fliplr=false +augment='no_aug' +num_channels=3 +echo "$0 $@" + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh || exit 1; + +data=$1 +featdir=$data/data +scp=$data/images.scp +logdir=$data/log + +mkdir -p $logdir +mkdir -p $featdir + +# make $featdir an absolute pathname +featdir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $featdir ${PWD}` + +for n in $(seq $nj); do + split_scps="$split_scps $logdir/images.$n.scp" +done + +# split images.scp +utils/split_scp.pl $scp $split_scps || exit 1; + +$cmd JOB=1:$nj $logdir/extract_features.JOB.log \ + image/ocr/make_features.py $logdir/images.JOB.scp \ + --allowed_len_file_path $data/allowed_lengths.txt \ + --feat-dim $feat_dim --num-channels $num_channels --fliplr $fliplr --augment_type $augment \| \ + copy-feats --compress=true --compression-method=7 \ + ark:- ark,scp:$featdir/images.JOB.ark,$featdir/images.JOB.scp + +## aggregates the output scp's to get feats.scp +for n in $(seq $nj); do + cat $featdir/images.$n.scp || exit 1; +done > $data/feats.scp || exit 1 diff --git a/egs/casia_hwdb/v1/local/gen_topo.py b/egs/casia_hwdb/v1/local/gen_topo.py new file mode 100755 index 00000000000..f64dcc5eec1 --- /dev/null +++ b/egs/casia_hwdb/v1/local/gen_topo.py @@ -0,0 +1,91 @@ +#!/usr/bin/env python + +# Copyright 2017 (author: Chun-Chieh Chang) + +# Generate a topology file. This allows control of the number of states in the +# non-silence HMMs, and in the silence HMMs. This is a modified version of +# 'utils/gen_topo.pl'. The difference is that this creates two topologies for +# the non-silence HMMs. The number of states for punctuations is different than +# the number of states for other characters. + +from __future__ import print_function +import argparse +import string + +parser = argparse.ArgumentParser(description="Usage: steps/nnet3/chain/gen_topo.py " + " " + "e.g.: steps/nnet3/chain/gen_topo.pl 4:5:6:7:8:9:10 1:2:3\n", + epilog="See egs/swbd/s5c/local/chain/train_tdnn_a.sh for example of usage."); +parser.add_argument("num_nonsil_states", type=int, help="number of states for nonsilence phones"); +parser.add_argument("num_sil_states", type=int, help="number of states for silence phones"); +parser.add_argument("num_cj5_states", type=int, help="number of states for punctuation"); +parser.add_argument("nonsilence_phones", type=str, + help="List of non-silence phones as integers, separated by colons, e.g. 4:5:6:7:8:9"); +parser.add_argument("silence_phones", type=str, + help="List of silence phones as integers, separated by colons, e.g. 
1:2:3"); +parser.add_argument("phone_list", type=str, help="file containing all phones and their corresponding number."); + +args = parser.parse_args() + +silence_phones = [ int(x) for x in args.silence_phones.split(":") ] +nonsilence_phones = [ int(x) for x in args.nonsilence_phones.split(":") ] +all_phones = silence_phones + nonsilence_phones + +cj5_phones = [] +with open(args.phone_list) as f: + for line in f: + line = line.strip() + phone = line.split(' ')[0] + if "cj5" in phone: + cj5_phones.append(int(line.split(' ')[1])) +# For nonsilence phones that are not punctuations +print("") +print("") +print("") +print(" ".join([str(x) for x in nonsilence_phones if x not in cj5_phones])) +print("") +for x in range(0, args.num_nonsil_states): + xp1 = x + 1 + print(" " + str(x) + " " + str(x) + " " + str(x) + " 0.75 " + str(xp1) + " 0.25 ") +print(" " + str(args.num_nonsil_states) + " ") +print("") + +# For nonsilence phones that are cj5 +print("") +print("") +print(" ".join([str(x) for x in nonsilence_phones if x in cj5_phones])) +print("") +for x in range(0, args.num_cj5_states): + xp1 = x + 1 + print(" " + str(x) + " " + str(x) + " " + str(x) + " 0.75 " + str(xp1) + " 0.25 ") +print(" " + str(args.num_cj5_states) + " ") +print("") + +# For silence phones +print("") +print("") +print(" ".join([str(x) for x in silence_phones])) +print("") +if(args.num_sil_states > 1): + transp = 1.0 / (args.num_sil_states - 1) + + state_str = " 0 0 " + for x in range(0, (args.num_sil_states - 1)): + state_str = state_str + " " + str(x) + " " + str(transp) + " " + state_str = state_str + "" + print(state_str) + + for x in range(1, (args.num_sil_states - 1)): + state_str = " " + str(x) + " " + str(x) + " " + for y in range(1, args.num_sil_states): + state_str = state_str + " " + str(y) + " " + str(transp) + " " + state_str = state_str + "" + print(state_str) + second_last = args.num_sil_states - 1 + print(" " + str(second_last) + " " + str(second_last) + " " + str(second_last) + " 0.75 " + str(args.num_sil_states) + " 0.25 ") + print(" " + str(args.num_sil_states) + " ") +else: + print(" 0 0 0 0.75 1 0.25 ") + print(" " + str(args.num_sil_states) + " ") +print("") +print("") diff --git a/egs/casia_hwdb/v1/local/normalize_text.py b/egs/casia_hwdb/v1/local/normalize_text.py new file mode 100755 index 00000000000..80c4e3ad3ab --- /dev/null +++ b/egs/casia_hwdb/v1/local/normalize_text.py @@ -0,0 +1,17 @@ +#!/usr/bin/env python3 +# Copyright 2018 Chun-Chieh Chang + +# This script reads in text and outputs the normalized version + +import io +import re +import sys +import unicodedata + +sys.stdin = io.TextIOWrapper(sys.stdin.buffer, encoding="utf8") +sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf8") +for line in sys.stdin: + line = line.strip() + line = unicodedata.normalize('NFC', line) + line = re.sub(r'\s', ' ', line) + sys.stdout.write(line + '\n') diff --git a/egs/casia_hwdb/v1/local/prepare_dict.sh b/egs/casia_hwdb/v1/local/prepare_dict.sh new file mode 100755 index 00000000000..cf2ecb1ce9b --- /dev/null +++ b/egs/casia_hwdb/v1/local/prepare_dict.sh @@ -0,0 +1,33 @@ +#!/usr/bin/env bash + +# Copyright 2017 Hossein Hadian +# 2017 Chun Chieh Chang +# 2017 Ashish Arora + +# This script prepares the dictionary. + +set -e +dir=data/local/dict +data_dir=data + +. 
./utils/parse_options.sh || exit 1; + +base_dir=$(echo "$DIRECTORY" | cut -d "/" -f2) + +mkdir -p $dir + +cut -d' ' -f1 download/cj5-cc.txt | ./utils/lang/bpe/learn_bpe.py -s 300 > $dir/bpe.out +cut -d' ' -f1 download/cj5-cc.txt | ./utils/lang/bpe/apply_bpe.py -c $dir/bpe.out | sed 's/@@//g' > $dir/bpe_text +cut -d' ' -f2- download/cj5-cc.txt | sed 's/ //g' > $dir/ids +paste -d' ' $dir/bpe_text $dir/ids > $dir/cj5-cc.txt +local/prepare_lexicon.py --data-dir $data_dir $dir + +cut -d' ' -f2- $dir/lexicon.txt | sed 's/SIL//g' | tr ' ' '\n' | sort -u | sed '/^$/d' >$dir/nonsilence_phones.txt || exit 1; + +echo ' SIL' >> $dir/lexicon.txt + +echo SIL > $dir/silence_phones.txt + +echo SIL >$dir/optional_silence.txt + +echo -n "" >$dir/extra_questions.txt diff --git a/egs/casia_hwdb/v1/local/prepare_lexicon.py b/egs/casia_hwdb/v1/local/prepare_lexicon.py new file mode 100755 index 00000000000..224c199ecef --- /dev/null +++ b/egs/casia_hwdb/v1/local/prepare_lexicon.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python3 + +# Copyright 2018 Ashish Arora +# Chun-Chieh Chang + +import argparse +import os + +parser = argparse.ArgumentParser(description="""Creates the list of characters and words in lexicon""") +parser.add_argument('dir', type=str, help='output path') +parser.add_argument('--data-dir', type=str, default='data', help='Path to text file') +args = parser.parse_args() + +### main ### +radical = ['日', '月', '金', '木', '水', '火', '土', '竹', '戈', '十', '大', '中', '一', '弓', '人', '心', '手','口','尸','廿','山','女','田','卜'] +lex = {} +text_path = os.path.join(args.data_dir, 'train', 'text') +text_fh = open(text_path, 'r', encoding='utf-8') + +# Used specially for Chinese. +# Uses the ChangJie keyboard input method to create subword units for Chinese. +cj5_table = {} +with open(os.path.join(args.dir, 'cj5-cc.txt'), 'r', encoding='utf-8') as f: + for line in f: + line_vect = line.strip().split() + if not line_vect[0].startswith('yyy') and not line_vect[0].startswith('z'): + cj5_table[line_vect[-1]] = "cj5_" + " cj5_".join(line_vect[:-1]) +# lex[line_vect[1]] = "cj5_" + " cj5_".join(list(line_vect[0])) + +with open(text_path, 'r', encoding='utf-8') as f: + for line in f: + line_vect = line.strip().split() + for i in range(1, len(line_vect)): + characters = list(line_vect[i]) + # Put SIL instead of "|". Because every "|" in the beginning of the words is for initial-space of that word + characters = " ".join([ 'SIL' if char == '|' else char if char in radical else cj5_table[char] if char in cj5_table else char for char in characters]) + characters = characters.replace('#','') + lex[line_vect[i]] = characters + +with open(os.path.join(args.dir, 'lexicon.txt'), 'w', encoding='utf-8') as fp: + for key in sorted(lex): + fp.write(key + " " + lex[key] + "\n") diff --git a/egs/casia_hwdb/v1/local/process_data.py b/egs/casia_hwdb/v1/local/process_data.py new file mode 100755 index 00000000000..8548ac2c58e --- /dev/null +++ b/egs/casia_hwdb/v1/local/process_data.py @@ -0,0 +1,464 @@ +#!/usr/bin/env python3 + +# Copyright 2018 Ashish Arora +# 2018 Chun Chieh Chang + +""" This script reads the extracted Farsi OCR (yomdle and slam) database files + and creates the following files (for the data subset selected via --dataset): + text, utt2spk, images.scp. + Eg. local/process_data.py data/download/ data/local/splits/train.txt data/train + Eg. 
text file: english_phone_books_0001_1 To sum up, then, it would appear that + utt2spk file: english_phone_books_0001_0 english_phone_books_0001 + images.scp file: english_phone_books_0001_0 \ + data/download/truth_line_image/english_phone_books_0001_0.png +""" + +import argparse +import numpy as np +import os +import re +import struct +import sys +import unicodedata +from collections import namedtuple +from math import atan2, cos, sin, pi, degrees, sqrt +from PIL import Image +from scipy import misc +from scipy.spatial import ConvexHull + +parser = argparse.ArgumentParser(description="Creates text, utt2spk, and images.scp files") +parser.add_argument('database_path', type=str, help='Path to data') +parser.add_argument('out_dir', type=str, help='directory to output files') +parser.add_argument('--padding', type=int, default=100, help='Padding so BBox does not exceed image area') +args = parser.parse_args() + +""" +bounding_box is a named tuple which contains: + area (float): area of the rectangle + length_parallel (float): length of the side that is parallel to unit_vector + length_orthogonal (float): length of the side that is orthogonal to unit_vector + rectangle_center(int, int): coordinates of the rectangle center + (use rectangle_corners to get the corner points of the rectangle) + unit_vector (float, float): direction of the length_parallel side. + (it's orthogonal vector can be found with the orthogonal_vector function + unit_vector_angle (float): angle of the unit vector to be in radians. + corner_points [(float, float)]: set that contains the corners of the rectangle +""" + +bounding_box_tuple = namedtuple('bounding_box_tuple', 'area ' + 'length_parallel ' + 'length_orthogonal ' + 'rectangle_center ' + 'unit_vector ' + 'unit_vector_angle ' + 'corner_points') + + +def unit_vector(pt0, pt1): + """ Given two points pt0 and pt1, return a unit vector that + points in the direction of pt0 to pt1. + Returns + ------- + (float, float): unit vector + """ + dis_0_to_1 = sqrt((pt0[0] - pt1[0])**2 + (pt0[1] - pt1[1])**2) + return (pt1[0] - pt0[0]) / dis_0_to_1, \ + (pt1[1] - pt0[1]) / dis_0_to_1 + + +def orthogonal_vector(vector): + """ Given a vector, returns a orthogonal/perpendicular vector of equal length. + Returns + ------ + (float, float): A vector that points in the direction orthogonal to vector. + """ + return -1 * vector[1], vector[0] + +def bounding_area(index, hull): + """ Given index location in an array and convex hull, it gets two points + hull[index] and hull[index+1]. From these two points, it returns a named + tuple that mainly contains area of the box that bounds the hull. This + bounding box orintation is same as the orientation of the lines formed + by the point hull[index] and hull[index+1]. + Returns + ------- + a named tuple that contains: + area: area of the rectangle + length_parallel: length of the side that is parallel to unit_vector + length_orthogonal: length of the side that is orthogonal to unit_vector + rectangle_center: coordinates of the rectangle center + unit_vector: direction of the length_parallel side. 
+ (it's orthogonal vector can be found with the orthogonal_vector function) + """ + unit_vector_p = unit_vector(hull[index], hull[index+1]) + unit_vector_o = orthogonal_vector(unit_vector_p) + + dis_p = tuple(np.dot(unit_vector_p, pt) for pt in hull) + dis_o = tuple(np.dot(unit_vector_o, pt) for pt in hull) + + min_p = min(dis_p) + min_o = min(dis_o) + len_p = max(dis_p) - min_p + len_o = max(dis_o) - min_o + + return {'area': len_p * len_o, + 'length_parallel': len_p, + 'length_orthogonal': len_o, + 'rectangle_center': (min_p + float(len_p) / 2, min_o + float(len_o) / 2), + 'unit_vector': unit_vector_p} + +def to_xy_coordinates(unit_vector_angle, point): + """ Given angle from horizontal axis and a point from origin, + returns converted unit vector coordinates in x, y coordinates. + angle of unit vector should be in radians. + Returns + ------ + (float, float): converted x,y coordinate of the unit vector. + """ + angle_orthogonal = unit_vector_angle + pi / 2 + return point[0] * cos(unit_vector_angle) + point[1] * cos(angle_orthogonal), \ + point[0] * sin(unit_vector_angle) + point[1] * sin(angle_orthogonal) + +def rotate_points(center_of_rotation, angle, points): + """ Rotates a point cloud around the center_of_rotation point by angle + input + ----- + center_of_rotation (float, float): angle of unit vector to be in radians. + angle (float): angle of rotation to be in radians. + points [(float, float)]: Points to be a list or tuple of points. Points to be rotated. + Returns + ------ + [(float, float)]: Rotated points around center of rotation by angle + """ + rot_points = [] + ang = [] + for pt in points: + diff = tuple([pt[d] - center_of_rotation[d] for d in range(2)]) + diff_angle = atan2(diff[1], diff[0]) + angle + ang.append(diff_angle) + diff_length = sqrt(sum([d**2 for d in diff])) + rot_points.append((center_of_rotation[0] + diff_length * cos(diff_angle), + center_of_rotation[1] + diff_length * sin(diff_angle))) + + return rot_points + +def rectangle_corners(rectangle): + """ Given rectangle center and its inclination, returns the corner + locations of the rectangle. + Returns + ------ + [(float, float)]: 4 corner points of rectangle. + """ + corner_points = [] + for i1 in (.5, -.5): + for i2 in (i1, -1 * i1): + corner_points.append((rectangle['rectangle_center'][0] + i1 * rectangle['length_parallel'], + rectangle['rectangle_center'][1] + i2 * rectangle['length_orthogonal'])) + + return rotate_points(rectangle['rectangle_center'], rectangle['unit_vector_angle'], corner_points) + +def get_orientation(origin, p1, p2): + """ + Given origin and two points, return the orientation of the Point p1 with + regards to Point p2 using origin. + Returns + ------- + integer: Negative if p1 is clockwise of p2. + """ + difference = ( + ((p2[0] - origin[0]) * (p1[1] - origin[1])) + - ((p1[0] - origin[0]) * (p2[1] - origin[1])) + ) + return difference + +def compute_hull(points): + """ + Given input list of points, return a list of points that + made up the convex hull. 
+ Returns + ------- + [(float, float)]: convexhull points + """ + hull_points = [] + start = points[0] + min_x = start[0] + for p in points[1:]: + if p[0] < min_x: + min_x = p[0] + start = p + point = start + hull_points.append(start) + + far_point = None + while far_point is not start: + p1 = None + for p in points: + if p is point: + continue + else: + p1 = p + break + + far_point = p1 + + for p2 in points: + if p2 is point or p2 is p1: + continue + else: + direction = get_orientation(point, far_point, p2) + if direction > 0: + far_point = p2 + + hull_points.append(far_point) + point = far_point + return hull_points + + +def minimum_bounding_box(points): + """ Given a list of 2D points, it returns the minimum area rectangle bounding all + the points in the point cloud. + Returns + ------ + returns a namedtuple that contains: + area: area of the rectangle + length_parallel: length of the side that is parallel to unit_vector + length_orthogonal: length of the side that is orthogonal to unit_vector + rectangle_center: coordinates of the rectangle center + unit_vector: direction of the length_parallel side. RADIANS + unit_vector_angle: angle of the unit vector + corner_points: set that contains the corners of the rectangle + """ + if len(points) <= 2: raise ValueError('More than two points required.') + + hull_ordered = [points[index] for index in ConvexHull(points).vertices] + hull_ordered.append(hull_ordered[0]) + #hull_ordered = compute_hull(points) + hull_ordered = tuple(hull_ordered) + + min_rectangle = bounding_area(0, hull_ordered) + for i in range(1, len(hull_ordered)-1): + rectangle = bounding_area(i, hull_ordered) + if rectangle['area'] < min_rectangle['area']: + min_rectangle = rectangle + + min_rectangle['unit_vector_angle'] = atan2(min_rectangle['unit_vector'][1], min_rectangle['unit_vector'][0]) + min_rectangle['rectangle_center'] = to_xy_coordinates(min_rectangle['unit_vector_angle'], min_rectangle['rectangle_center']) + + return bounding_box_tuple( + area = min_rectangle['area'], + length_parallel = min_rectangle['length_parallel'], + length_orthogonal = min_rectangle['length_orthogonal'], + rectangle_center = min_rectangle['rectangle_center'], + unit_vector = min_rectangle['unit_vector'], + unit_vector_angle = min_rectangle['unit_vector_angle'], + corner_points = set(rectangle_corners(min_rectangle))) + +def get_center(im): + """ Given image, returns the location of center pixel + Returns + ------- + (int, int): center of the image + """ + center_x = float(im.size[0]) / 2 + center_y = float(im.size[1]) / 2 + return int(center_x), int(center_y) + +def get_horizontal_angle(unit_vector_angle): + """ Given an angle in radians, returns angle of the unit vector in + first or fourth quadrant. + Returns + ------ + (float): updated angle of the unit vector to be in radians. + It is only in first or fourth quadrant. + """ + if unit_vector_angle > pi / 2 and unit_vector_angle <= pi: + unit_vector_angle = unit_vector_angle - pi + elif unit_vector_angle > -pi and unit_vector_angle < -pi / 2: + unit_vector_angle = unit_vector_angle + pi + + return unit_vector_angle + +def get_smaller_angle(bounding_box): + """ Given a rectangle, returns its smallest absolute angle from horizontal axis. + Returns + ------ + (float): smallest angle of the rectangle to be in radians. 
+ """ + unit_vector = bounding_box.unit_vector + unit_vector_angle = bounding_box.unit_vector_angle + ortho_vector = orthogonal_vector(unit_vector) + ortho_vector_angle = atan2(ortho_vector[1], ortho_vector[0]) + + unit_vector_angle_updated = get_horizontal_angle(unit_vector_angle) + ortho_vector_angle_updated = get_horizontal_angle(ortho_vector_angle) + + if abs(unit_vector_angle_updated) < abs(ortho_vector_angle_updated): + return unit_vector_angle_updated + else: + return ortho_vector_angle_updated + +def rotated_points(bounding_box, center): + """ Given the rectangle, returns corner points of rotated rectangle. + It rotates the rectangle around the center by its smallest angle. + Returns + ------- + [(int, int)]: 4 corner points of rectangle. + """ + p1, p2, p3, p4 = bounding_box.corner_points + x1, y1 = p1 + x2, y2 = p2 + x3, y3 = p3 + x4, y4 = p4 + center_x, center_y = center + rotation_angle_in_rad = -get_smaller_angle(bounding_box) + x_dash_1 = (x1 - center_x) * cos(rotation_angle_in_rad) - (y1 - center_y) * sin(rotation_angle_in_rad) + center_x + x_dash_2 = (x2 - center_x) * cos(rotation_angle_in_rad) - (y2 - center_y) * sin(rotation_angle_in_rad) + center_x + x_dash_3 = (x3 - center_x) * cos(rotation_angle_in_rad) - (y3 - center_y) * sin(rotation_angle_in_rad) + center_x + x_dash_4 = (x4 - center_x) * cos(rotation_angle_in_rad) - (y4 - center_y) * sin(rotation_angle_in_rad) + center_x + + y_dash_1 = (y1 - center_y) * cos(rotation_angle_in_rad) + (x1 - center_x) * sin(rotation_angle_in_rad) + center_y + y_dash_2 = (y2 - center_y) * cos(rotation_angle_in_rad) + (x2 - center_x) * sin(rotation_angle_in_rad) + center_y + y_dash_3 = (y3 - center_y) * cos(rotation_angle_in_rad) + (x3 - center_x) * sin(rotation_angle_in_rad) + center_y + y_dash_4 = (y4 - center_y) * cos(rotation_angle_in_rad) + (x4 - center_x) * sin(rotation_angle_in_rad) + center_y + return x_dash_1, y_dash_1, x_dash_2, y_dash_2, x_dash_3, y_dash_3, x_dash_4, y_dash_4 + +def pad_image(image): + """ Given an image, returns a padded image around the border. + This routine save the code from crashing if bounding boxes that are + slightly outside the page boundary. + Returns + ------- + image: page image + """ + offset = int(args.padding // 2) + padded_image = Image.new('RGB', (image.size[0] + int(args.padding), image.size[1] + int(args.padding)), "white") + padded_image.paste(im = image, box = (offset, offset)) + return padded_image + +def update_minimum_bounding_box_input(bounding_box_input): + """ Given list of 2D points, returns list of 2D points shifted by an offset. 
+ Returns + ------ + points [(float, float)]: points, a list or tuple of 2D coordinates + """ + updated_minimum_bounding_box_input = [] + offset = int(args.padding // 2) + for point in bounding_box_input: + x, y = point + new_x = x + offset + new_y = y + offset + word_coordinate = (new_x, new_y) + updated_minimum_bounding_box_input.append(word_coordinate) + + return updated_minimum_bounding_box_input + +### main ### +print("Processing '{}' data...".format(args.out_dir)) + +text_file = os.path.join(args.out_dir, 'text') +text_fh = open(text_file, 'w', encoding='utf-8') +utt2spk_file = os.path.join(args.out_dir, 'utt2spk') +utt2spk_fh = open(utt2spk_file, 'w', encoding='utf-8') +image_file = os.path.join(args.out_dir, 'images.scp') +image_fh = open(image_file, 'w', encoding='utf-8') + +for filename in sorted(os.listdir(args.database_path)): + if filename.endswith('.dgr'): + with open(os.path.join(args.database_path, filename), 'rb') as f: + iHdSize = struct.unpack('i', f.read(4))[0] + szFormatCode = struct.unpack(''.join('c' for x in range(0,8)), f.read(8)) + szFormatCode = "".join([x.decode('utf8') for x in szFormatCode]) + szIllustr = f.read(iHdSize - 36) + szCodeType = struct.unpack(''.join(['c' for x in range(0,20)]), f.read(20)) + szCodeType = "".join([x.decode('utf8') for x in szCodeType]) + sCodeLen = struct.unpack('h', f.read(2))[0] + sBitApp = struct.unpack('h', f.read(2))[0] + iImgHei = struct.unpack('i', f.read(4))[0] + iImgWid = struct.unpack('i', f.read(4))[0] + pDocImg = Image.new('L', (iImgWid, iImgHei), (255)) + iLineNum = struct.unpack('i', f.read(4))[0] + text_dict = {} + image_dict = {} + for i in range(0, iLineNum): + iWordNum = struct.unpack('i', f.read(4))[0] + for j in range(0, iWordNum): + pWordLabel = f.read(sCodeLen).decode('gb18030', errors='ignore') + sTop = struct.unpack('h', f.read(2))[0] + sLeft = struct.unpack('h', f.read(2))[0] + sHei = struct.unpack('h', f.read(2))[0] + sWid = struct.unpack('h', f.read(2))[0] + if i in text_dict: + text_dict[i] += [pWordLabel] + else: + text_dict[i] = [pWordLabel] + if i in image_dict: + image_dict[i] += [[sTop, sLeft, sHei, sWid]] + else: + image_dict[i] = [[sTop, sLeft, sHei, sWid]] + pTmpData = struct.unpack("{}B".format(sHei * sWid), f.read(sHei * sWid)) + character = misc.toimage(np.array(pTmpData).reshape(sHei, sWid)) + pDocImg.paste(character, (sLeft, sTop)) + pDocImg.save(os.path.join(args.out_dir, 'data', 'images', os.path.splitext(filename)[0] + '.png'), 'png') + + im_page = pad_image(pDocImg) + for i in range(0, iLineNum): + text = "" + points = [] + for j, char in enumerate(text_dict[i]): + text += char + points.append([image_dict[i][j][1], image_dict[i][j][0]]) + points.append([image_dict[i][j][1] + image_dict[i][j][3], image_dict[i][j][0]]) + points.append([image_dict[i][j][1], image_dict[i][j][0] + image_dict[i][j][2]]) + points.append([image_dict[i][j][1] + image_dict[i][j][3], image_dict[i][j][0] + image_dict[i][j][2]]) + updated_mbb_input = update_minimum_bounding_box_input(points) + bounding_box = minimum_bounding_box(updated_mbb_input) + p1, p2, p3, p4 = bounding_box.corner_points + x1, y1 = p1 + x2, y2 = p2 + x3, y3 = p3 + x4, y4 = p4 + min_x = int(min(x1, x2, x3, x4)) + min_y = int(min(y1, y2, y3, y4)) + max_x = int(max(x1, x2, x3, x4)) + max_y = int(max(y1, y2, y3, y4)) + box = (min_x, min_y, max_x, max_y) + region_initial = im_page.crop(box) + rot_points = [] + p1_new = (x1 - min_x, y1 - min_y) + p2_new = (x2 - min_x, y2 - min_y) + p3_new = (x3 - min_x, y3 - min_y) + p4_new = (x4 - min_x, y4 - min_y) 
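# The corner points are shifted into the crop's local frame (relative to
# min_x, min_y) so the rotation below is performed about the cropped region's
# own center rather than about the full page image.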
+ rot_points.append(p1_new) + rot_points.append(p2_new) + rot_points.append(p3_new) + rot_points.append(p4_new) + cropped_bounding_box = bounding_box_tuple(bounding_box.area, + bounding_box.length_parallel, + bounding_box.length_orthogonal, + bounding_box.length_orthogonal, + bounding_box.unit_vector, + bounding_box.unit_vector_angle, + set(rot_points)) + + rotation_angle_in_rad = get_smaller_angle(cropped_bounding_box) + img2 = region_initial.rotate(degrees(rotation_angle_in_rad), resample=Image.BICUBIC) + x_dash_1, y_dash_1, x_dash_2, y_dash_2, x_dash_3, y_dash_3, x_dash_4, y_dash_4 = rotated_points( + cropped_bounding_box, get_center(region_initial)) + + + min_x = int(min(x_dash_1, x_dash_2, x_dash_3, x_dash_4)) + min_y = int(min(y_dash_1, y_dash_2, y_dash_3, y_dash_4)) + max_x = int(max(x_dash_1, x_dash_2, x_dash_3, x_dash_4)) + max_y = int(max(y_dash_1, y_dash_2, y_dash_3, y_dash_4)) + box = (min_x, min_y, max_x, max_y) + region_final = img2.crop(box) + text = text.replace('\x00', '') + text = unicodedata.normalize('NFC', text) + image_id = os.path.splitext(filename)[0] + '_' + str(i).zfill(3) + image_filepath = os.path.join(args.out_dir, 'data', 'images', os.path.splitext(filename)[0] + '_' + str(i).zfill(3) + '.png') + writer_id = os.path.splitext(filename)[0].split('-')[0] + region_final.save(image_filepath, 'png') + + text_fh.write(image_id + ' ' + text + '\n') + utt2spk_fh.write(image_id + ' ' + writer_id + '\n') + image_fh.write(image_id + ' ' + image_filepath + '\n') diff --git a/egs/casia_hwdb/v1/local/score.sh b/egs/casia_hwdb/v1/local/score.sh new file mode 100755 index 00000000000..f2405205f02 --- /dev/null +++ b/egs/casia_hwdb/v1/local/score.sh @@ -0,0 +1,5 @@ +#!/bin/bash + + +steps/scoring/score_kaldi_wer.sh --max-lmwt 10 "$@" +steps/scoring/score_kaldi_cer.sh --max-lmwt 10 --stage 2 "$@" diff --git a/egs/casia_hwdb/v1/local/train_lm.sh b/egs/casia_hwdb/v1/local/train_lm.sh new file mode 100755 index 00000000000..bc738f217da --- /dev/null +++ b/egs/casia_hwdb/v1/local/train_lm.sh @@ -0,0 +1,110 @@ +#!/bin/bash + +# Copyright 2016 Vincent Nguyen +# 2016 Johns Hopkins University (author: Daniel Povey) +# 2017 Ashish Arora +# 2017 Hossein Hadian +# Apache 2.0 +# +# This script trains a LM on the YOMDLE training transcriptions. +# It is based on the example scripts distributed with PocoLM + +# It will check if pocolm is installed and if not will proceed with installation + +set -e +stage=0 +dir=data/local/local_lm +data_dir=data + +echo "$0 $@" # Print the command line for logging +. ./utils/parse_options.sh || exit 1; + +lm_dir=${dir}/data + + +mkdir -p $dir +. ./path.sh || exit 1; # for KALDI_ROOT +export PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH +( # First make sure the pocolm toolkit is installed. + cd $KALDI_ROOT/tools || exit 1; + if [ -d pocolm ]; then + echo Not installing the pocolm toolkit since it is already there. + else + echo "$0: Please install the PocoLM toolkit with: " + echo " cd ../../../tools; extras/install_pocolm.sh; cd -" + exit 1; + fi +) || exit 1; + +bypass_metaparam_optim_opt= +# If you want to bypass the metaparameter optimization steps with specific metaparameters +# un-comment the following line, and change the numbers to some appropriate values. +# You can find the values from output log of train_lm.py. +# These example numbers of metaparameters is for 4-gram model (with min-counts) +# running with train_lm.py. +# The dev perplexity should be close to the non-bypassed model. 
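# For illustration only (placeholder values, not tuned metaparameters), a
# bypassed run would look something like:
#   bypass_metaparam_optim_opt="--bypass-metaparameter-optimization=0.04,0.86,0.73"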
+#bypass_metaparam_optim_opt= +# Note: to use these example parameters, you may need to remove the .done files +# to make sure the make_lm_dir.py be called and tain only 3-gram model +#for order in 3; do +#rm -f ${lm_dir}/${num_word}_${order}.pocolm/.done + +if [ $stage -le 0 ]; then + mkdir -p ${dir}/data + mkdir -p ${dir}/data/text + + echo "$0: Getting the Data sources" + + rm ${dir}/data/text/* 2>/dev/null || true + + # Note: the name 'dev' is treated specially by pocolm, it automatically + # becomes the dev set. + nr=`cat $data_dir/train/text | wc -l` + nr_dev=$(($nr / 10 )) + nr_train=$(( $nr - $nr_dev )) + + # use the training data as an additional data source. + # we can later fold the dev data into this. + head -n $nr_train $data_dir/train/text | cut -d " " -f 2- > ${dir}/data/text/train.txt + tail -n $nr_dev $data_dir/train/text | cut -d " " -f 2- > ${dir}/data/text/dev.txt + + # for reporting perplexities, we'll use the "real" dev set. + # (the validation data is used as ${dir}/data/text/dev.txt to work + # out interpolation weights.) + # note, we can't put it in ${dir}/data/text/, because then pocolm would use + # it as one of the data sources. + cut -d " " -f 2- < $data_dir/test/text > ${dir}/data/real_dev_set.txt + + # get the wordlist from MADCAT text + cat ${dir}/data/text/train.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count + cat ${dir}/data/word_count | awk '{print $2}' > ${dir}/data/wordlist +fi + +order=3 + +if [ $stage -le 1 ]; then + # decide on the vocabulary. + # Note: you'd use --wordlist if you had a previously determined word-list + # that you wanted to use. + # Note: if you have more than one order, use a certain amount of words as the + # vocab and want to restrict max memory for 'sort', + echo "$0: training the unpruned LM" + min_counts='train=1' + wordlist=${dir}/data/wordlist + + lm_name="`basename ${wordlist}`_${order}" + if [ -n "${min_counts}" ]; then + lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`" + fi + unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm + train_lm.py --wordlist=${wordlist} --num-splits=5 --warm-start-ratio=1 \ + --min-counts="$min_counts" \ + --limit-unk-history=true \ + ${bypass_metaparam_optim_opt} \ + ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir} + + get_data_prob.py ${dir}/data/real_dev_set.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity' + + mkdir -p ${dir}/data/arpa + format_arpa_lm.py ${unpruned_lm_dir} | gzip -c > ${dir}/data/arpa/${order}gram_unpruned.arpa.gz +fi diff --git a/egs/casia_hwdb/v1/local/train_lm_lr.sh b/egs/casia_hwdb/v1/local/train_lm_lr.sh new file mode 100755 index 00000000000..a8b1bfb76a4 --- /dev/null +++ b/egs/casia_hwdb/v1/local/train_lm_lr.sh @@ -0,0 +1,117 @@ +#!/bin/bash + +# Copyright 2016 Vincent Nguyen +# 2016 Johns Hopkins University (author: Daniel Povey) +# 2017 Ashish Arora +# 2017 Hossein Hadian +# Apache 2.0 +# +# This script trains a LM on the YOMDLE+Extra training transcriptions. +# It is based on the example scripts distributed with PocoLM + +# It will check if pocolm is installed and if not will proceed with installation + +set -e +stage=0 +dir=data/local/local_lm +data_dir=data +extra_lm=download/extra_lm.txt +order=3 + +echo "$0 $@" # Print the command line for logging +. ./utils/parse_options.sh || exit 1; + +lm_dir=${dir}/data + + +mkdir -p $dir +. ./path.sh || exit 1; # for KALDI_ROOT +export PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH +( # First make sure the pocolm toolkit is installed. 
+ cd $KALDI_ROOT/tools || exit 1; + if [ -d pocolm ]; then + echo Not installing the pocolm toolkit since it is already there. + else + echo "$0: Please install the PocoLM toolkit with: " + echo " cd ../../../tools; extras/install_pocolm.sh; cd -" + exit 1; + fi +) || exit 1; + +bypass_metaparam_optim_opt= +# If you want to bypass the metaparameter optimization steps with specific metaparameters +# un-comment the following line, and change the numbers to some appropriate values. +# You can find the values from output log of train_lm.py. +# These example numbers of metaparameters is for 4-gram model (with min-counts) +# running with train_lm.py. +# The dev perplexity should be close to the non-bypassed model. +#bypass_metaparam_optim_opt= +# Note: to use these example parameters, you may need to remove the .done files +# to make sure the make_lm_dir.py be called and tain only 3-gram model +#for order in 3; do +#rm -f ${lm_dir}/${num_word}_${order}.pocolm/.done + +if [ $stage -le 0 ]; then + mkdir -p ${dir}/data + mkdir -p ${dir}/data/text + + echo "$0: Getting the Data sources" + + rm ${dir}/data/text/* 2>/dev/null || true + + cat ${extra_lm} | \ + local/normalize_text.py | \ + utils/lang/bpe/prepend_words.py | \ + python3 utils/lang/bpe/apply_bpe.py -c $data_dir/train/bpe.out | \ + sed 's/@@//g' > ${dir}/data/text/extra_lm.txt + + # Note: the name 'dev' is treated specially by pocolm, it automatically + # becomes the dev set. + nr=`cat $data_dir/train/text | wc -l` + nr_dev=$(($nr / 10 )) + nr_train=$(( $nr - $nr_dev )) + + # use the training data as an additional data source. + # we can later fold the dev data into this. + head -n $nr_train $data_dir/train/text | cut -d " " -f 2- > ${dir}/data/text/train.txt + tail -n $nr_dev $data_dir/train/text | cut -d " " -f 2- > ${dir}/data/text/dev.txt + + # for reporting perplexities, we'll use the "real" dev set. + # (the validation data is used as ${dir}/data/text/dev.txt to work + # out interpolation weights.) + # note, we can't put it in ${dir}/data/text/, because then pocolm would use + # it as one of the data sources. + cut -d " " -f 2- < $data_dir/test/text > ${dir}/data/real_dev_set.txt + + # get the wordlist from MADCAT text + cat ${dir}/data/text/{train,extra_lm}.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count + #cat ${dir}/data/text/extra_fa.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count + cat ${dir}/data/word_count | awk '{print $2}' > ${dir}/data/wordlist +fi + +if [ $stage -le 1 ]; then + # decide on the vocabulary. + # Note: you'd use --wordlist if you had a previously determined word-list + # that you wanted to use. 
+ # Note: if you have more than one order, use a certain amount of words as the + # vocab and want to restrict max memory for 'sort', + echo "$0: training the unpruned LM" + min_counts='extra_lm=10 train=1' + wordlist=${dir}/data/wordlist + + lm_name="`basename ${wordlist}`_${order}" + if [ -n "${min_counts}" ]; then + lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`" + fi + unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm + train_lm.py --wordlist=${wordlist} --num-splits=30 --warm-start-ratio=1 \ + --min-counts="$min_counts" \ + --limit-unk-history=true \ + ${bypass_metaparam_optim_opt} \ + ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir} + + get_data_prob.py ${dir}/data/real_dev_set.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity' + + mkdir -p ${dir}/data/arpa + format_arpa_lm.py ${unpruned_lm_dir} | gzip -c > ${dir}/data/arpa/${order}gram_unpruned.arpa.gz +fi diff --git a/egs/casia_hwdb/v1/local/wer_output_filter b/egs/casia_hwdb/v1/local/wer_output_filter new file mode 100755 index 00000000000..8702738144f --- /dev/null +++ b/egs/casia_hwdb/v1/local/wer_output_filter @@ -0,0 +1,30 @@ +#!/usr/bin/env python3 +# Copyright 2017 Hossein Hadian + +# This is a filter used in scoring. It separates all +# punctuations from words. For e.g. this sentence: + +# "They have come!" he said reverently, gripping his +# hands. "Isn't it a glorious thing! Long awaited." + +# is converted to this: + +# " They have come ! " he said reverently , gripping his +# hands . " Isn ' t it a glorious thing ! Long awaited . " + +# Sample BPE-based output: +# |He |ro se |from |his |b re ak f as t - s ch oo l |b en ch + +import io +import sys +import unicodedata + +sys.stdin = io.TextIOWrapper(sys.stdin.buffer, encoding="utf8"); +sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf8"); +for line in sys.stdin: + line = unicodedata.normalize('NFKC', line) + words = line.strip().split() + uttid = words[0] + transcript = ''.join(words[1:]) + transcript = transcript.replace('|', ' ') + print("{} {}".format(uttid, transcript)) diff --git a/egs/casia_hwdb/v1/path.sh b/egs/casia_hwdb/v1/path.sh new file mode 100644 index 00000000000..2d17b17a84a --- /dev/null +++ b/egs/casia_hwdb/v1/path.sh @@ -0,0 +1,6 @@ +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C diff --git a/egs/casia_hwdb/v1/run.sh b/egs/casia_hwdb/v1/run.sh new file mode 100755 index 00000000000..44d1f26117c --- /dev/null +++ b/egs/casia_hwdb/v1/run.sh @@ -0,0 +1,110 @@ +#!/bin/bash + +set -e +stage=0 +nj=60 + +database_train=/export/corpora5/handwriting_ocr/CASIA_HWDB/Offline/ +database_competition=/export/corpora5/handwriting_ocr/CASIA_HWDB/Offline/ +data_dir=data +exp_dir=exp + +. ./cmd.sh +. ./path.sh +. 
./utils/parse_options.sh + +if [ $stage -le -1 ]; then + mkdir download/Train + mkdir download/Test + mkdir download/Competition + local/extract_database.sh --database-train $database_train \ + --database-competition $database_competition +fi + +if [ $stage -le 0 ]; then + mkdir -p data/train/data/images + mkdir -p data/test/data/images + mkdir -p data/competition/data/images + local/process_data.py download/Train data/train + local/process_data.py download/Test data/test + local/process_data.py download/Competition data/competition + image/fix_data_dir.sh ${data_dir}/test + image/fix_data_dir.sh ${data_dir}/train + image/fix_data_dir.sh ${data_dir}/competition +fi + +mkdir -p $data_dir/{train,test}/data +if [ $stage -le 1 ]; then + echo "$0: Obtaining image groups. calling get_image2num_frames" + echo "Date: $(date)." + image/get_image2num_frames.py --feat-dim 60 $data_dir/train + image/get_allowed_lengths.py --frame-subsampling-factor 4 10 $data_dir/train + + for datasplit in train test competition; do + echo "$0: Extracting features and calling compute_cmvn_stats for dataset: $datasplit. " + echo "Date: $(date)." + local/extract_features.sh --nj $nj --cmd "$cmd" \ + --feat-dim 60 --num-channels 3 \ + $data_dir/${datasplit} + steps/compute_cmvn_stats.sh $data_dir/${datasplit} || exit 1; + done + + echo "$0: Fixing data directory for train dataset" + echo "Date: $(date)." + utils/fix_data_dir.sh $data_dir/train +fi + +#if [ $stage -le 2 ]; then +# for datasplit in train; do +# echo "$(date) stage 2: Performing augmentation, it will double training data" +# local/augment_data.sh --nj $nj --cmd "$cmd" --feat-dim 60 $data_dir/${datasplit} $data_dir/${datasplit}_aug $data_dir +# steps/compute_cmvn_stats.sh $data_dir/${datasplit}_aug || exit 1; +# done +#fi + +if [ $stage -le 3 ]; then + echo "$0: Preparing dictionary and lang..." + if [ ! -f $data_dir/train/bpe.out ]; then + cut -d' ' -f2- $data_dir/train/text | utils/lang/bpe/prepend_words.py | python3 utils/lang/bpe/learn_bpe.py -s 700 > $data_dir/train/bpe.out + for datasplit in test train; do + cut -d' ' -f1 $data_dir/$datasplit/text > $data_dir/$datasplit/ids + cut -d' ' -f2- $data_dir/$datasplit/text | utils/lang/bpe/prepend_words.py | python3 utils/lang/bpe/apply_bpe.py -c $data_dir/train/bpe.out | sed 's/@@//g' > $data_dir/$datasplit/bpe_text + mv $data_dir/$datasplit/text $data_dir/$datasplit/text.old + paste -d' ' $data_dir/$datasplit/ids $data_dir/$datasplit/bpe_text > $data_dir/$datasplit/text + done + fi + + local/prepare_dict.sh --data-dir $data_dir --dir $data_dir/local/dict + # This recipe uses byte-pair encoding, the silences are part of the words' pronunciations. + # So we set --sil-prob to 0.0 + utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.0 --position-dependent-phones false \ + $data_dir/local/dict "" $data_dir/lang/temp $data_dir/lang + utils/lang/bpe/add_final_optional_silence.sh --final-sil-prob 0.5 $data_dir/lang +fi + +if [ $stage -le 4 ]; then + echo "$0: Estimating a language model for decoding..." + local/train_lm.sh --data-dir $data_dir --dir $data_dir/local/local_lm + utils/format_lm.sh $data_dir/lang $data_dir/local/local_lm/data/arpa/3gram_unpruned.arpa.gz \ + $data_dir/local/dict/lexicon.txt $data_dir/lang_test +fi + +if [ $stage -le 5 ]; then + echo "$0: Calling the flat-start chain recipe..." + echo "Date: $(date)." 
+ local/chain/run_flatstart_cnn1a.sh --nj $nj --train-set train --data-dir $data_dir --exp-dir $exp_dir +fi + +if [ $stage -le 6 ]; then + echo "$0: Aligning the training data using the e2e chain model..." + echo "Date: $(date)." + steps/nnet3/align.sh --nj $nj --cmd "$cmd" --use-gpu false \ + --scale-opts '--transition-scale=1.0 --acoustic-scale=1.0 --self-loop-scale=1.0' \ + $data_dir/train $data_dir/lang $exp_dir/chain/e2e_cnn_1a $exp_dir/chain/e2e_ali_train +fi + +if [ $stage -le 7 ]; then + echo "$0: Building a tree and training a regular chain model using the e2e alignments..." + echo "Date: $(date)." + local/chain/run_cnn_e2eali_1b.sh --nj $nj --train-set train --data-dir $data_dir --exp-dir $exp_dir +fi diff --git a/egs/casia_hwdb/v1/steps b/egs/casia_hwdb/v1/steps new file mode 120000 index 00000000000..6e99bf5b5ad --- /dev/null +++ b/egs/casia_hwdb/v1/steps @@ -0,0 +1 @@ +../../wsj/s5/steps \ No newline at end of file diff --git a/egs/casia_hwdb/v1/utils b/egs/casia_hwdb/v1/utils new file mode 120000 index 00000000000..b240885218f --- /dev/null +++ b/egs/casia_hwdb/v1/utils @@ -0,0 +1 @@ +../../wsj/s5/utils \ No newline at end of file diff --git a/egs/chime1/s5/local/chime1_prepare_data.sh b/egs/chime1/s5/local/chime1_prepare_data.sh index e60c46ff8da..c5963b5d4ab 100755 --- a/egs/chime1/s5/local/chime1_prepare_data.sh +++ b/egs/chime1/s5/local/chime1_prepare_data.sh @@ -53,7 +53,7 @@ for x in "devel" "test"; do for sid in `seq 34`; do sid2=`printf "s%02d" $sid` ls -1 $wav_dir/*/s${sid}_*.wav \ - | perl -ape "s/(.*)\/(.*)\/s.*_(.*).wav/${sid2}_\3_\2$\t\1\/\2\/s${sid}_\3.wav/;" \ + | perl -ape "s/(.*)\/(.*)\/s.*_(.*).wav/${sid2}_\3_\2\t\1\/\2\/s${sid}_\3.wav/;" \ | sort >> $scp done fi @@ -68,7 +68,7 @@ for x in $set_list; do # Create utt2spk files # No speaker ID - perl -ape "s/(.*)\t.*/\1$\t\1/;" < "$scp" > "$data/$x/utt2spk" + perl -ape "s/(.*)\t.*/\1\t\1/;" < "$scp" > "$data/$x/utt2spk" # Use speaker ID # perl -ape "s/(s..)(.*)\\t.*/\1\2\t\1/;" < "$scp" > "$data/$x/utt2spk" diff --git a/egs/chime4/s5_1ch/RESULTS b/egs/chime4/s5_1ch/RESULTS index c0146b772b7..3e5f752a803 100644 --- a/egs/chime4/s5_1ch/RESULTS +++ b/egs/chime4/s5_1ch/RESULTS @@ -17,7 +17,22 @@ et05_simu WER: 33.30% (Average), 26.65% (BUS), 38.40% (CAFE), 34.68% (PEDESTRIAN et05_real WER: 37.54% (Average), 51.92% (BUS), 39.67% (CAFE), 34.04% (PEDESTRIAN), 24.54% (STREET) ------------------- -Advanced baseline: +GMM noisy multi-condition without enhancement using 6 channel data +exp/tri3b_tr05_multi_noisy/best_wer_isolated_1ch_track.result +------------------- +best overall dt05 WER 22.32% (language model weight = 10) +------------------- +dt05_simu WER: 23.24% (Average), 19.28% (BUS), 28.41% (CAFE), 19.16% (PEDESTRIAN), 26.12% (STREET) +------------------- +dt05_real WER: 21.40% (Average), 25.86% (BUS), 21.81% (CAFE), 16.80% (PEDESTRIAN), 21.12% (STREET) +------------------- +et05_simu WER: 32.03% (Average), 25.42% (BUS), 36.25% (CAFE), 33.34% (PEDESTRIAN), 33.10% (STREET) +------------------- +et05_real WER: 36.14% (Average), 49.28% (BUS), 38.79% (CAFE), 32.44% (PEDESTRIAN), 24.06% (STREET) +------------------- + +GMM noisy multi-condition without enhancement using 6 channel data plus enhanced data +exp/tri3b_tr05_multi_noisy/best_wer_isolated_1ch_track.result ------------------- best overall dt05 WER 22.28% (language model weight = 10) ------------------- @@ -30,6 +45,34 @@ et05_simu WER: 32.18% (Average), 25.33% (BUS), 37.37% (CAFE), 33.36% (PEDESTRIAN et05_real WER: 35.54% (Average), 49.07% (BUS), 38.94% 
(CAFE), 31.60% (PEDESTRIAN), 22.56% (STREET) ------------------- +GMM noisy multi-condition with BLSTM masking using 6 channel data +exp/tri3b_tr05_multi_noisy/best_wer_single_BLSTMmask.result +------------------- +best overall dt05 WER 28.82% (language model weight = 14) +------------------- +dt05_simu WER: 28.54% (Average), 25.46% (BUS), 33.47% (CAFE), 25.19% (PEDESTRIAN), 30.06% (STREET) +------------------- +dt05_real WER: 29.10% (Average), 33.46% (BUS), 31.80% (CAFE), 25.71% (PEDESTRIAN), 25.42% (STREET) +------------------- +et05_simu WER: 36.10% (Average), 30.97% (BUS), 40.42% (CAFE), 35.82% (PEDESTRIAN), 37.19% (STREET) +------------------- +et05_real WER: 41.84% (Average), 52.57% (BUS), 46.41% (CAFE), 39.87% (PEDESTRIAN), 28.52% (STREET) +------------------- + +GMM noisy multi-condition with BLSTM masking using 6 channel data plus enhanced data +exp/tri3b_tr05_multi_noisy/best_wer_single_BLSTMmask.result +------------------- +best overall dt05 WER 22.72% (language model weight = 13) +------------------- +dt05_simu WER: 23.37% (Average), 20.71% (BUS), 28.26% (CAFE), 19.85% (PEDESTRIAN), 24.66% (STREET) +------------------- +dt05_real WER: 22.07% (Average), 25.92% (BUS), 24.32% (CAFE), 18.47% (PEDESTRIAN), 19.58% (STREET) +------------------- +et05_simu WER: 30.41% (Average), 24.08% (BUS), 35.86% (CAFE), 30.80% (PEDESTRIAN), 30.89% (STREET) +------------------- +et05_real WER: 34.02% (Average), 44.68% (BUS), 37.19% (CAFE), 31.73% (PEDESTRIAN), 22.49% (STREET) +------------------- + DNN sMBR exp/tri4a_dnn_tr05_multi_noisy_smbr_i1lats/best_wer_isolated_1ch_track.result ------------------- @@ -45,7 +88,7 @@ et05_simu WER: 24.13% (Average), 19.65% (BUS), 27.57% (CAFE), 23.14% (PEDESTRIAN et05_real WER: 27.68% (Average), 40.40% (BUS), 28.95% (CAFE), 24.25% (PEDESTRIAN), 17.13% (STREET) ------------------- -Advanced baseline: +DNN sMBR using all 6 channel data ------------------- best overall dt05 WER 12.84% (language model weight = 12) (Number of iterations = 3) @@ -73,7 +116,7 @@ et05_simu WER: 22.32% (Average), 17.82% (BUS), 25.48% (CAFE), 21.70% (PEDESTRIAN et05_real WER: 24.92% (Average), 37.52% (BUS), 26.45% (CAFE), 21.28% (PEDESTRIAN), 14.44% (STREET) ------------------- -Advanced baseline: +5-gram rescoring using all 6 channel data ------------------- best overall dt05 WER 11.07% (language model weight = 12) ------------------- @@ -100,7 +143,7 @@ et05_simu WER: 20.84% (Average), 16.49% (BUS), 23.91% (CAFE), 20.25% (PEDESTRIAN et05_real WER: 23.70% (Average), 35.93% (BUS), 24.60% (CAFE), 19.94% (PEDESTRIAN), 14.36% (STREET) ------------------- -Advanced baseline: +RNNLM using all 6 channel data ------------------- best overall dt05 WER 9.99% (language model weight = 14) ------------------- @@ -113,30 +156,86 @@ et05_simu WER: 17.31% (Average), 12.81% (BUS), 20.32% (CAFE), 17.03% (PEDESTRIAN et05_real WER: 18.10% (Average), 26.58% (BUS), 19.97% (CAFE), 14.44% (PEDESTRIAN), 11.43% (STREET) ------------------- -TDNN -exp/chain/tdnn1d_sp/best_wer_beamformit_5mics.result +TDNN using all 6 channel data +exp/chain/tdnniso_sp/best_wer_beamformit_5mics.result +------------------- +best overall dt05 WER 9.56% (language model weight = 10) +------------------- +dt05_simu WER: 10.23% (Average), 8.86% (BUS), 13.13% (CAFE), 7.94% (PEDESTRIAN), 11.00% (STREET) +------------------- +dt05_real WER: 8.89% (Average), 11.90% (BUS), 8.54% (CAFE), 6.09% (PEDESTRIAN), 9.03% (STREET) +------------------- +et05_simu WER: 16.48% (Average), 12.87% (BUS), 18.60% (CAFE), 15.52% (PEDESTRIAN), 18.94% (STREET) 
+------------------- +et05_real WER: 16.34% (Average), 24.32% (BUS), 16.51% (CAFE), 13.43% (PEDESTRIAN), 11.11% (STREET) +------------------- + +TDNN+RNNLM using all 6 channel data +exp/chain/tdnniso_sp_smbr_lmrescore/best_wer_beamformit_5mics_rnnlm_5k_h300_w0.5_n100.result +------------------- +best overall dt05 WER 7.21% (language model weight = 11) +------------------- +dt05_simu WER: 7.78% (Average), 6.52% (BUS), 10.27% (CAFE), 5.69% (PEDESTRIAN), 8.66% (STREET) +------------------- +dt05_real WER: 6.64% (Average), 9.06% (BUS), 6.62% (CAFE), 4.26% (PEDESTRIAN), 6.61% (STREET) +------------------- +et05_simu WER: 13.54% (Average), 10.22% (BUS), 15.07% (CAFE), 12.94% (PEDESTRIAN), 15.93% (STREET) +------------------- +et05_real WER: 12.92% (Average), 20.79% (BUS), 12.35% (CAFE), 9.62% (PEDESTRIAN), 8.91% (STREET) +------------------- + +TDNN with BLSTM masking using all 6 channel data +exp/chain/tdnn1a_sp/best_wer_single_BLSTMmask.result +------------------- +best overall dt05 WER 18.00% (language model weight = 13) +------------------- +dt05_simu WER: 18.81% (Average), 15.34% (BUS), 23.58% (CAFE), 15.27% (PEDESTRIAN), 21.06% (STREET) +------------------- +dt05_real WER: 17.18% (Average), 21.12% (BUS), 19.45% (CAFE), 11.61% (PEDESTRIAN), 16.53% (STREET) +------------------- +et05_simu WER: 25.85% (Average), 20.06% (BUS), 30.13% (CAFE), 26.88% (PEDESTRIAN), 26.32% (STREET) +------------------- +et05_real WER: 27.68% (Average), 37.88% (BUS), 29.51% (CAFE), 24.74% (PEDESTRIAN), 18.60% (STREET) +------------------- + +TDNN+RNNLM with BLSTM masking using all 6 channel data +exp/chain/tdnn1a_sp/best_wer_single_BLSTMmask.result +------------------- +best overall dt05 WER 14.38% (language model weight = 14) +------------------- +dt05_simu WER: 15.62% (Average), 12.36% (BUS), 20.46% (CAFE), 12.11% (PEDESTRIAN), 17.55% (STREET) +------------------- +dt05_real WER: 13.15% (Average), 16.43% (BUS), 15.21% (CAFE), 8.59% (PEDESTRIAN), 12.37% (STREET) +------------------- +et05_simu WER: 21.61% (Average), 16.01% (BUS), 25.87% (CAFE), 22.15% (PEDESTRIAN), 22.39% (STREET) +------------------- +et05_real WER: 22.47% (Average), 32.34% (BUS), 24.08% (CAFE), 18.91% (PEDESTRIAN), 14.57% (STREET) +------------------- + +TDNN with BLSTM masking using all 6 channel data plus enhanced data +exp/chain/tdnn1a_sp/best_wer_single_BLSTMmask.result ------------------- -best overall dt05 WER 10.37% (language model weight = 9) +best overall dt05 WER 11.73% (language model weight = 12) ------------------- -dt05_simu WER: 10.79% (Average), 9.62% (BUS), 13.70% (CAFE), 8.23% (PEDESTRIAN), 11.61% (STREET) +dt05_simu WER: 13.06% (Average), 10.78% (BUS), 17.20% (CAFE), 10.15% (PEDESTRIAN), 14.10% (STREET) ------------------- -dt05_real WER: 9.95% (Average), 14.38% (BUS), 8.81% (CAFE), 6.43% (PEDESTRIAN), 10.19% (STREET) +dt05_real WER: 10.40% (Average), 13.44% (BUS), 10.72% (CAFE), 7.29% (PEDESTRIAN), 10.16% (STREET) ------------------- -et05_simu WER: 17.18% (Average), 13.75% (BUS), 19.48% (CAFE), 15.82% (PEDESTRIAN), 19.67% (STREET) +et05_simu WER: 19.48% (Average), 14.48% (BUS), 23.10% (CAFE), 19.84% (PEDESTRIAN), 20.49% (STREET) ------------------- -et05_real WER: 18.36% (Average), 30.77% (BUS), 16.17% (CAFE), 14.29% (PEDESTRIAN), 12.20% (STREET) +et05_real WER: 19.08% (Average), 27.43% (BUS), 19.76% (CAFE), 16.93% (PEDESTRIAN), 12.22% (STREET) ------------------- -TDNN+RNNLM -exp/chain/tdnn1d_sp_smbr_lmrescore/best_wer_beamformit_5mics_rnnlm_5k_h300_w0.5_n100.result +TDNN+RNNLM with BLSTM masking using all 6 channel data 
plus enhanced data +exp/chain/tdnn1a_sp/best_wer_single_BLSTMmask.result ------------------- -best overall dt05 WER 7.98% (language model weight = 10) +best overall dt05 WER 8.95% (language model weight = 13) ------------------- -dt05_simu WER: 8.40% (Average), 7.37% (BUS), 10.91% (CAFE), 6.36% (PEDESTRIAN), 8.97% (STREET) +dt05_simu WER: 10.28% (Average), 8.51% (BUS), 13.88% (CAFE), 7.58% (PEDESTRIAN), 11.17% (STREET) ------------------- -dt05_real WER: 7.56% (Average), 11.58% (BUS), 6.58% (CAFE), 4.41% (PEDESTRIAN), 7.65% (STREET) +dt05_real WER: 7.62% (Average), 10.25% (BUS), 7.86% (CAFE), 5.31% (PEDESTRIAN), 7.05% (STREET) ------------------- -et05_simu WER: 13.91% (Average), 10.87% (BUS), 15.09% (CAFE), 12.78% (PEDESTRIAN), 16.88% (STREET) +et05_simu WER: 16.18% (Average), 12.03% (BUS), 18.71% (CAFE), 16.62% (PEDESTRIAN), 17.35% (STREET) ------------------- -et05_real WER: 14.99% (Average), 26.88% (BUS), 13.32% (CAFE), 10.07% (PEDESTRIAN), 9.71% (STREET) +et05_real WER: 15.08% (Average), 22.96% (BUS), 15.45% (CAFE), 12.74% (PEDESTRIAN), 9.17% (STREET) ------------------- diff --git a/egs/chime4/s5_1ch/local/CHiME3_simulate_data_patched_parallel.m b/egs/chime4/s5_1ch/local/CHiME3_simulate_data_patched_parallel.m new file mode 100755 index 00000000000..49c1ed48018 --- /dev/null +++ b/egs/chime4/s5_1ch/local/CHiME3_simulate_data_patched_parallel.m @@ -0,0 +1,362 @@ +function CHiME3_simulate_data_patched_parallel(official,nj,chime4_dir,chime3_dir) + +% CHIME3_SIMULATE_DATA Creates simulated data for the 3rd CHiME Challenge +% +% CHiME3_simulate_data +% CHiME3_simulate_data(official) +% +% Input: +% official: boolean flag indicating whether to recreate the official +% Challenge data (default) or to create new (non-official) data +% +% If you use this software in a publication, please cite: +% +% Jon Barker, Ricard Marxer, Emmanuel Vincent, and Shinji Watanabe, The +% third 'CHiME' Speech Separation and Recognition Challenge: Dataset, +% task and baselines, submitted to IEEE 2015 Automatic Speech Recognition +% and Understanding Workshop (ASRU), 2015. 
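A minimal usage sketch (not part of the original patch): the parallel simulation function above would be launched in MATLAB batch mode from the recipe's shell scripts, following the same pattern this patch uses for stoi_estoi_sdr.m; the corpus paths and job count here are illustrative assumptions, not values taken from the recipe.

  nj=20
  chime4_dir=/export/corpora/CHiME4/CHiME3   # hypothetical location of the CHiME4 data
  chime3_dir=/export/corpora/CHiME3          # hypothetical location of the CHiME3 data
  matlab -nodisplay -nosplash -r "addpath('local'); CHiME3_simulate_data_patched_parallel(1,$nj,'$chime4_dir','$chime3_dir'); exit"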
+% +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% Copyright 2015 University of Sheffield (Jon Barker, Ricard Marxer) +% Inria (Emmanuel Vincent) +% Mitsubishi Electric Research Labs (Shinji Watanabe) +% This software is distributed under the terms of the GNU Public License +% version 3 (http://www.gnu.org/licenses/gpl.txt) +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +utils_folder = sprintf('%s/tools/utils', chime4_dir); +enhancement_folder = sprintf('%s/tools/enhancement/', chime3_dir); +addpath(utils_folder,'-end'); +addpath(enhancement_folder); +sim_folder = sprintf('%s/tools/simulation', chime4_dir); +addpath(sim_folder); +upath = sprintf('%s/data/audio/16kHz/isolated/', chime4_dir); +cpath = sprintf('%s/data/audio/16kHz/embedded/', chime4_dir); +bpath = sprintf('%s/data/audio/16kHz/backgrounds/', chime4_dir); +apath = sprintf('%s/data/annotations/', chime4_dir); +upath_ext = 'local/nn-gev/data/audio/16kHz/isolated_ext/'; +upath_simu = 'local/nn-gev/data/audio/16kHz/isolated/'; +nchan=6; + +% Define hyper-parameters +pow_thresh=-20; % threshold in dB below which a microphone is considered to fail +wlen_sub=256; % STFT window length in samples +blen_sub=4000; % average block length in samples for speech subtraction (250 ms) +ntap_sub=12; % filter length in frames for speech subtraction (88 ms) +wlen_add=1024; % STFT window length in samples for speaker localization +del=-3; % minimum delay (0 for a causal filter) + +%% Create simulated training dataset from original WSJ0 data %% +if exist('equal_filter.mat','file'), + load('equal_filter.mat'); +else + % Compute average power spectrum of booth data + nfram=0; + bth_spec=zeros(wlen_sub/2+1,1); + sets={'tr05' 'dt05'}; + for set_ind=1:length(sets), + set=sets{set_ind}; + mat=json2mat([apath set '_bth.json']); + for utt_ind=1:length(mat), + oname=[mat{utt_ind}.speaker '_' mat{utt_ind}.wsj_name '_BTH']; + fprintf('%s\n',[upath set '_bth/' oname '.CH0.wav']); + o=audioread([upath set '_bth/' oname '.CH0.wav']); + O=stft_multi(o.',wlen_sub); + nfram=nfram+size(O,2); + bth_spec=bth_spec+sum(abs(O).^2,2); + end + end + bth_spec=bth_spec/nfram; + + % Compute average power spectrum of original WSJ0 data + nfram=0; + org_spec=zeros(wlen_sub/2+1,1); + olist=dir([upath 'tr05_org/*.wav']); + for f=1:length(olist), + oname=olist(f).name; + o=audioread([upath 'tr05_org/' oname]); + O=stft_multi(o.',wlen_sub); + nfram=nfram+size(O,2); + org_spec=org_spec+sum(abs(O).^2,2); + end + org_spec=org_spec/nfram; + + % Derive equalization filter + equal_filter=sqrt(bth_spec./org_spec); + save('equal_filter.mat','equal_filter'); +end +% Read official annotations +if official, + mat=json2mat([apath 'tr05_simu.json']); +% Create new (non-official) annotations +else + mat=json2mat([apath 'tr05_org.json']); + ir_mat=json2mat([apath 'tr05_real.json']); + for utt_ind=1:length(mat), + oname=[mat{utt_ind}.speaker '_' mat{utt_ind}.wsj_name '_ORG']; + osize=audioread([upath 'tr05_org/' oname '.wav'],'size'); + dur=osize(1)/16000; + envirs={'BUS' 'CAF' 'PED' 'STR'}; + envir=envirs{randperm(4,1)}; % draw a random environment + mat{utt_ind}.environment=envir; + blist=dir([bpath '*' envir '.CH1.wav']); + dur_diff=inf(1,length(ir_mat)); + for ir_ind=1:length(ir_mat), + if strcmp(ir_mat{ir_ind}.environment,envir), + ir_dur=ir_mat{ir_ind}.end-ir_mat{ir_ind}.start; + dur_diff(ir_ind)=abs(ir_dur-dur); + end + end + ir_ind=find(isinf(dur_diff)); + ir_ind=ir_ind(1); + nfail=true; + while nfail, + 
bname=blist(randperm(length(blist),1)).name(1:end-8); % draw a random background recording + mat{utt_ind}.noise_wavfile=bname; + bsize=audioread([bpath bname '.CH1.wav'],'size'); + bdur=bsize(1)/16000; + mat{utt_ind}.noise_start=ceil(rand(1)*(bdur-dur)*16000)/16000; % draw a random time + mat{utt_ind}.noise_end=mat{utt_ind}.noise_start+dur; + nname=mat{utt_ind}.noise_wavfile; + nbeg=round(mat{utt_ind}.noise_start*16000)+1; + nend=round(mat{utt_ind}.noise_end*16000); + n=zeros(nend-nbeg+1,nchan); + for c=1:nchan, + n(:,c)=audioread([bpath nname '.CH' int2str(c) '.wav'],[nbeg nend]); + end + npow=sum(n.^2,1); + npow=10*log10(npow/max(npow)); + nfail=any(npow<=pow_thresh); % check for microphone failure + end + xfail=true; + while xfail, + dur_diff(ir_ind)=inf; + [~,ir_ind]=min(dur_diff); % pick impulse response from the same environment with the closest duration + if dur_diff(ir_ind)==inf, + keyboard; + end + mat{utt_ind}.ir_wavfile=ir_mat{ir_ind}.wavfile; + mat{utt_ind}.ir_start=ir_mat{ir_ind}.start; + mat{utt_ind}.ir_end=ir_mat{ir_ind}.end; + iname=mat{utt_ind}.ir_wavfile; + ibeg=round(mat{utt_ind}.ir_start*16000)+1; + iend=round(mat{utt_ind}.ir_end*16000); + x=zeros(iend-ibeg+1,nchan); + for c=1:nchan, + x(:,c)=audioread([cpath iname '.CH' int2str(c) '.wav'],[ibeg iend]); + end + xpow=sum(x.^2,1); + xpow=10*log10(xpow/max(xpow)); + xfail=any(xpow<=pow_thresh); % check for microphone failure + end + mat{utt_ind}=orderfields(mat{utt_ind}); + end + mat2json(mat,[apath 'tr05_simu_new.json']); +end + +p = parpool('local', nj); +% Loop over utterances +parfor utt_ind=1:length(mat), + if official, + udir=[upath_simu 'tr05_' lower(mat{utt_ind}.environment) '_simu/']; + udir_ext=[upath_ext 'tr05_' lower(mat{utt_ind}.environment) '_simu/']; + else + udir=[upath 'tr05_' lower(mat{utt_ind}.environment) '_simu_new/']; + end + if ~exist(udir,'dir'), + system(['mkdir -p ' udir]); + end + if ~exist(udir_ext,'dir'), + system(['mkdir -p ' udir_ext]); + end + oname=[mat{utt_ind}.speaker '_' mat{utt_ind}.wsj_name '_ORG']; + iname=mat{utt_ind}.ir_wavfile; + nname=mat{utt_ind}.noise_wavfile; + uname=[mat{utt_ind}.speaker '_' mat{utt_ind}.wsj_name '_' mat{utt_ind}.environment]; + ibeg=round(mat{utt_ind}.ir_start*16000)+1; + iend=round(mat{utt_ind}.ir_end*16000); + nbeg=round(mat{utt_ind}.noise_start*16000)+1; + nend=round(mat{utt_ind}.noise_end*16000); + + % Load WAV files + fprintf('%s\n',[upath 'tr05_org/' oname '.wav']); + o=audioread([upath 'tr05_org/' oname '.wav']); + [r,fs]=audioread([cpath iname '.CH0.wav'],[ibeg iend]); + fprintf('%s\n',[cpath iname '.CH0.wav'],[ibeg iend]); + x=zeros(iend-ibeg+1,nchan); + n=zeros(nend-nbeg+1,nchan); + for c=1:nchan, + fprintf('%s Place1\n',[cpath iname '.CH' int2str(c) '.wav']); + x(:,c)=audioread([cpath iname '.CH' int2str(c) '.wav'],[ibeg iend]); + n(:,c)=audioread([bpath nname '.CH' int2str(c) '.wav'],[nbeg nend]); + fprintf('%s Place2\n',[bpath nname '.CH' int2str(c) '.wav']); + end + + % Compute the STFT (short window) + O=stft_multi(o.',wlen_sub); + R=stft_multi(r.',wlen_sub); + X=stft_multi(x.',wlen_sub); + + % Estimate 88 ms impulse responses on 250 ms time blocks + A=estimate_ir(R,X,blen_sub,ntap_sub,del); + + % Derive SNR + Y=apply_ir(A,R,del); + y=istft_multi(Y,iend-ibeg+1).'; + SNR=sum(sum(y.^2))/sum(sum((x-y).^2)); + + % Equalize microphone + [~,nfram]=size(O); + O=O.*repmat(equal_filter,[1 nfram]); + o=istft_multi(O,nend-nbeg+1).'; + + % Compute the STFT (long window) + O=stft_multi(o.',wlen_add); + X=stft_multi(x.',wlen_add); + [nbin,nfram] = size(O); + 
+ % Localize and track the speaker + [~,TDOAx]=localize(X); + + % Interpolate the spatial position over the duration of clean speech + TDOA=zeros(nchan,nfram); + for c=1:nchan, + TDOA(c,:)=interp1(0:size(X,2)-1,TDOAx(c,:),(0:nfram-1)/(nfram-1)*(size(X,2)-1)); + end + + % Filter clean speech + Ysimu=zeros(nbin,nfram,nchan); + for f=1:nbin, + for t=1:nfram, + Df=sqrt(1/nchan)*exp(-2*1i*pi*(f-1)/wlen_add*fs*TDOA(:,t)); + Ysimu(f,t,:)=permute(Df*O(f,t),[2 3 1]); + end + end + ysimu=istft_multi(Ysimu,nend-nbeg+1).'; + + % Normalize level and add + ysimu=sqrt(SNR/sum(sum(ysimu.^2))*sum(sum(n.^2)))*ysimu; + xsimu=ysimu+n; + + % Write WAV file + for c=1:nchan, + audiowrite([udir uname '.CH' int2str(c) '.wav'],xsimu(:,c),fs); + audiowrite([udir_ext uname '.CH' int2str(c) '.Noise.wav'],n(:, c),fs); + audiowrite([udir_ext uname '.CH' int2str(c) '.Clean.wav'],ysimu(:, c), fs); + end +end + +%% Create simulated development and test datasets from booth recordings %% +sets={'dt05' 'et05'}; +for set_ind=1:length(sets), + set=sets{set_ind}; + + % Read official annotations + if official, + mat=json2mat([apath set '_simu.json']); + + % Create new (non-official) annotations + else + mat=json2mat([apath set '_real.json']); + clean_mat=json2mat([apath set '_bth.json']); + for utt_ind=1:length(mat), + for clean_ind=1:length(clean_mat), % match noisy utterance with same clean utterance (may be from a different speaker) + if strcmp(clean_mat{clean_ind}.wsj_name,mat{utt_ind}.wsj_name), + break; + end + end + noise_mat=mat{utt_ind}; + mat{utt_ind}=clean_mat{clean_ind}; + mat{utt_ind}.environment=noise_mat.environment; + mat{utt_ind}.noise_wavfile=noise_mat.wavfile; + dur=mat{utt_ind}.end-mat{utt_ind}.start; + noise_dur=noise_mat.end-noise_mat.start; + pbeg=round((dur-noise_dur)/2*16000)/16000; + pend=round((dur-noise_dur)*16000)/16000-pbeg; + mat{utt_ind}.noise_start=noise_mat.start-pbeg; + mat{utt_ind}.noise_end=noise_mat.end+pend; + mat{utt_ind}=orderfields(mat{utt_ind}); + end + mat2json(mat,[apath set '_simu_new.json']); + end + + % Loop over utterances + parfor utt_ind=1:length(mat), + if official, + udir=[upath_simu set '_' lower(mat{utt_ind}.environment) '_simu/']; + udir_ext=[upath_ext set '_' lower(mat{utt_ind}.environment) '_simu/']; + else + udir=[upath set '_' lower(mat{utt_ind}.environment) '_simu_new/']; + end + if ~exist(udir,'dir'), + system(['mkdir -p ' udir]); + end + if ~exist(udir_ext,'dir'), + system(['mkdir -p ' udir_ext]); + end + oname=[mat{utt_ind}.speaker '_' mat{utt_ind}.wsj_name '_BTH']; + nname=mat{utt_ind}.noise_wavfile; + uname=[mat{utt_ind}.speaker '_' mat{utt_ind}.wsj_name '_' mat{utt_ind}.environment]; + tbeg=round(mat{utt_ind}.noise_start*16000)+1; + tend=round(mat{utt_ind}.noise_end*16000); + + % Load WAV files + o=audioread([upath set '_bth/' oname '.CH0.wav']); + [r,fs]=audioread([cpath nname '.CH0.wav'],[tbeg tend]); + nsampl=length(r); + x=zeros(nsampl,nchan); + for c=1:nchan, + x(:,c)=audioread([cpath nname '.CH' int2str(c) '.wav'],[tbeg tend]); + end + + % Compute the STFT (short window) + R=stft_multi(r.',wlen_sub); + X=stft_multi(x.',wlen_sub); + + % Estimate 88 ms impulse responses on 250 ms time blocks + A=estimate_ir(R,X,blen_sub,ntap_sub,del); + + % Filter and subtract close-mic speech + Y=apply_ir(A,R,del); + y=istft_multi(Y,nsampl).'; + level=sum(sum(y.^2)); + n=x-y; + + % Compute the STFT (long window) + O=stft_multi(o.',wlen_add); + X=stft_multi(x.',wlen_add); + [nbin,nfram] = size(O); + + % Localize and track the speaker + [~,TDOAx]=localize(X); + + % Interpolate 
the spatial position over the duration of clean speech + TDOA=zeros(nchan,nfram); + for c=1:nchan, + TDOA(c,:)=interp1(0:size(X,2)-1,TDOAx(c,:),(0:nfram-1)/(nfram-1)*(size(X,2)-1)); + end + + % Filter clean speech + Ysimu=zeros(nbin,nfram,nchan); + for f=1:nbin, + for t=1:nfram, + Df=sqrt(1/nchan)*exp(-2*1i*pi*(f-1)/wlen_add*fs*TDOA(:,t)); + Ysimu(f,t,:)=permute(Df*O(f,t),[2 3 1]); + end + end + ysimu=istft_multi(Ysimu,nsampl).'; + + % Normalize level and add + ysimu=sqrt(level/sum(sum(ysimu.^2)))*ysimu; + xsimu=ysimu+n; + + % Write WAV file + for c=1:nchan, + audiowrite([udir uname '.CH' int2str(c) '.wav'],xsimu(:,c),fs); + audiowrite([udir_ext uname '.CH' int2str(c) '.Noise.wav'],n(:, c),fs); + audiowrite([udir_ext uname '.CH' int2str(c) '.Clean.wav'],ysimu(:, c), fs); + end + end +end +delete(p); +end diff --git a/egs/chime4/s5_1ch/local/chain/run_tdnn_lstm_recog.sh b/egs/chime4/s5_1ch/local/chain/run_tdnn_lstm_recog.sh deleted file mode 100755 index 9348cd6fa5a..00000000000 --- a/egs/chime4/s5_1ch/local/chain/run_tdnn_lstm_recog.sh +++ /dev/null @@ -1,223 +0,0 @@ -#!/bin/bash - -set -e -o pipefail - -stage=0 -nj=30 -train=noisy -enhan=$1 -mdir=$2 -train_set=tr05_multi_${train} -test_sets="dt05_real_$enhan dt05_simu_$enhan et05_real_$enhan et05_simu_$enhan" -gmm=tri3b_tr05_multi_${train} # this is the source gmm-dir that we'll use for alignments; it - # should have alignments for the specified training data. -nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. - -# Options which are not passed through to run_ivector_common.sh -affix=1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. -common_egs_dir= -reporting_email= - -# training chunk-options -chunk_width=140,100,160 -# we don't need extra left/right context for TDNN systems. -chunk_left_context=0 -chunk_right_context=0 - -#decode options -test_online_decoding=false # if true, it will run the last decoding stage. - -# End configuration section. -echo "$0 $@" # Print the command line for logging - - -. ./cmd.sh -. ./path.sh -. ./utils/parse_options.sh - - -if ! cuda-compiled; then - cat < \n\n" `basename $0` - echo "First argument specifies a unique name for different enhancement method" - echo "Second argument specifies acoustic and language model directory" - exit 1; -fi - -# check whether run_init is executed -if [ ! -d data/lang ]; then - echo "error, execute local/run_init.sh, first" - exit 1; -fi - -# check whether run_init is executed -if [ ! -d exp/tri3b_tr05_multi_${train} ]; then - echo "error, execute local/run_init.sh, first" - exit 1; -fi - -# check ivector extractor -if [ ! -d $mdir/exp/nnet3${nnet3_affix}/extractor ]; then - echo "error, set $mdir correctly" - exit 1; -elif [ ! -d exp/nnet3${nnet3_affix}/extractor ]; then - echo "copy $mdir/exp/nnet3${nnet3_affix}/extractor" - mkdir -p exp/nnet3${nnet3_affix} - cp -r $mdir/exp/nnet3${nnet3_affix}/extractor exp/nnet3${nnet3_affix}/ -fi - -# check tdnn-lstm graph -if [ ! -d $mdir/exp/chain${nnet3_affix}/tree_a_sp/graph_tgpr_5k ]; then - echo "error, set $mdir correctly" - exit 1; -elif [ ! -d exp/chain${nnet3_affix}/tree_a_sp/graph_tgpr_5k ]; then - echo "copy $mdir/exp/chain${nnet3_affix}/tree_a_sp/graph_tgpr_5k" - mkdir -p exp/chain${nnet3_affix}/tree_a_sp - cp -r $mdir/exp/chain${nnet3_affix}/tree_a_sp/graph_tgpr_5k exp/chain${nnet3_affix}/tree_a_sp/ -fi - -# check dir -if [ ! -d $mdir/exp/chain${nnet3_affix}/tdnn_lstm${affix}_sp ]; then - echo "error, set $mdir correctly" - exit 1; -elif [ ! 
-d exp/chain${nnet3_affix}/tdnn_lstm${affix}_sp ]; then - echo "copy $mdir/exp/chain${nnet3_affix}/tdnn_lstm${affix}_sp" - cp -r $mdir/exp/chain${nnet3_affix}/tdnn_lstm${affix}_sp exp/chain${nnet3_affix}/ - rm -rf exp/chain${nnet3_affix}/tdnn_lstm${affix}_sp/decode_* - rm -rf exp/chain${nnet3_affix}/tdnn_lstm${affix}_sp/best_* -fi - -dir=exp/chain${nnet3_affix}/tdnn_lstm${affix}_sp - -# note: you don't necessarily have to change the treedir name -# each time you do a new experiment-- only if you change the -# configuration in a way that affects the tree. -tree_dir=$mdir/exp/chain${nnet3_affix}/tree_a_sp - -# make ivector for dev and eval -if [ $stage -le 2 ]; then - for datadir in ${test_sets}; do - utils/copy_data_dir.sh data/$datadir data/${datadir}_hires - done - - # extracting hires features - for datadir in ${test_sets}; do - steps/make_mfcc.sh --nj $nj --mfcc-config conf/mfcc_hires.conf \ - --cmd "$train_cmd" data/${datadir}_hires - steps/compute_cmvn_stats.sh data/${datadir}_hires - utils/fix_data_dir.sh data/${datadir}_hires - done - - # extract iVectors for the test data, but in this case we don't need the speed - # perturbation (sp). - for data in ${test_sets}; do - nspk=$(wc -l /dev/null || true - - for data in $test_sets; do - ( - data_affix=$(echo $data | sed s/test_//) - nspk=$(wc -l /dev/null || true - - for data in $test_sets; do - ( - data_affix=$(echo $data | sed s/test_//) - nspk=$(wc -l /dev/null || true - - for data in $test_sets; do - ( - data_affix=$(echo $data | sed s/test_//) - nspk=$(wc -l exp/chain/tdnn_lstm${affix}_sp/best_wer_$enhan.result - head -n 15 exp/chain/tdnn_lstm${affix}_sp/best_wer_$enhan.result - - echo "score looped decoding results" - local/chime4_calc_wers_looped.sh exp/chain/tdnn_lstm${affix}_sp $enhan exp/chain/tree_a_sp/graph_tgpr_5k \ - > exp/chain/tdnn_lstm${affix}_sp/best_wer_looped_$enhan.result - head -n 15 exp/chain/tdnn_lstm${affix}_sp/best_wer_looped_$enhan.result -fi - -exit 0; diff --git a/egs/chime4/s5_1ch/local/chain/run_tdnn_recog.sh b/egs/chime4/s5_1ch/local/chain/run_tdnn_recog.sh deleted file mode 100755 index 38a9cc391e7..00000000000 --- a/egs/chime4/s5_1ch/local/chain/run_tdnn_recog.sh +++ /dev/null @@ -1,200 +0,0 @@ -#!/bin/bash - -set -e -o pipefail - -stage=0 -nj=30 -train=noisy -enhan=$1 -mdir=$2 -train_set=tr05_multi_${train} -test_sets="dt05_real_$enhan dt05_simu_$enhan et05_real_$enhan et05_simu_$enhan" -gmm=tri3b_tr05_multi_${train} # this is the source gmm-dir that we'll use for alignments; it - # should have alignments for the specified training data. -nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. - -# Options which are not passed through to run_ivector_common.sh -affix=1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. -common_egs_dir= -reporting_email= - -# training chunk-options -chunk_width=140,100,160 -# we don't need extra left/right context for TDNN systems. -chunk_left_context=0 -chunk_right_context=0 - -#decode options -test_online_decoding=false # if true, it will run the last decoding stage. - -# End configuration section. -echo "$0 $@" # Print the command line for logging - - -. ./cmd.sh -. ./path.sh -. ./utils/parse_options.sh - - -if ! cuda-compiled; then - cat < \n\n" `basename $0` - echo "First argument specifies a unique name for different enhancement method" - echo "Second argument specifies acoustic and language model directory" - exit 1; -fi - -# check whether run_init is executed -if [ ! 
-d data/lang ]; then - echo "error, execute local/run_init.sh, first" - exit 1; -fi - -# check whether run_init is executed -if [ ! -d exp/tri3b_tr05_multi_${train} ]; then - echo "error, execute local/run_init.sh, first" - exit 1; -fi - -# check ivector extractor -if [ ! -d $mdir/exp/nnet3${nnet3_affix}/extractor ]; then - echo "error, set $mdir correctly" - exit 1; -elif [ ! -d exp/nnet3${nnet3_affix}/extractor ]; then - echo "copy $mdir/exp/nnet3${nnet3_affix}/extractor" - mkdir -p exp/nnet3${nnet3_affix} - cp -r $mdir/exp/nnet3${nnet3_affix}/extractor exp/nnet3${nnet3_affix}/ -fi - -# check tdnn graph -if [ ! -d $mdir/exp/chain${nnet3_affix}/tree_a_sp/graph_tgpr_5k ]; then - echo "error, set $mdir correctly" - exit 1; -elif [ ! -d exp/chain${nnet3_affix}/tree_a_sp/graph_tgpr_5k ]; then - echo "copy $mdir/exp/chain${nnet3_affix}/tree_a_sp/graph_tgpr_5k" - mkdir -p exp/chain${nnet3_affix}/tree_a_sp - cp -r $mdir/exp/chain${nnet3_affix}/tree_a_sp/graph_tgpr_5k exp/chain${nnet3_affix}/tree_a_sp/ -fi - -# check dir -if [ ! -d $mdir/exp/chain${nnet3_affix}/tdnn${affix}_sp ]; then - echo "error, set $mdir correctly" - exit 1; -elif [ ! -d exp/chain${nnet3_affix}/tdnn${affix}_sp ]; then - echo "copy $mdir/exp/chain${nnet3_affix}/tdnn${affix}_sp" - cp -r $mdir/exp/chain${nnet3_affix}/tdnn${affix}_sp exp/chain${nnet3_affix}/ - rm -rf exp/chain${nnet3_affix}/tdnn${affix}_sp/decode_* - rm -rf exp/chain${nnet3_affix}/tdnn${affix}_sp/best_* -fi - -dir=exp/chain${nnet3_affix}/tdnn${affix}_sp - -# note: you don't necessarily have to change the treedir name -# each time you do a new experiment-- only if you change the -# configuration in a way that affects the tree. -tree_dir=$mdir/exp/chain${nnet3_affix}/tree_a_sp - -# make ivector for dev and eval -if [ $stage -le 2 ]; then - for datadir in ${test_sets}; do - utils/copy_data_dir.sh data/$datadir data/${datadir}_hires - done - - # extracting hires features - for datadir in ${test_sets}; do - steps/make_mfcc.sh --nj $nj --mfcc-config conf/mfcc_hires.conf \ - --cmd "$train_cmd" data/${datadir}_hires - steps/compute_cmvn_stats.sh data/${datadir}_hires - utils/fix_data_dir.sh data/${datadir}_hires - done - - # extract iVectors for the test data, but in this case we don't need the speed - # perturbation (sp). - for data in ${test_sets}; do - nspk=$(wc -l /dev/null || true - - for data in $test_sets; do - ( - data_affix=$(echo $data | sed s/test_//) - nspk=$(wc -l /dev/null || true - - for data in $test_sets; do - ( - data_affix=$(echo $data | sed s/test_//) - nspk=$(wc -l exp/chain/tdnn${affix}_sp/best_wer_$enhan.result - head -n 15 exp/chain/tdnn${affix}_sp/best_wer_$enhan.result -fi - - -exit 0; diff --git a/egs/chime4/s5_1ch/local/chain/tuning/run_tdnn_1a.sh b/egs/chime4/s5_1ch/local/chain/tuning/run_tdnn_1a.sh index aa7d07b636a..3f8b7c60090 100755 --- a/egs/chime4/s5_1ch/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/chime4/s5_1ch/local/chain/tuning/run_tdnn_1a.sh @@ -1,20 +1,20 @@ #!/bin/bash -# This was modified from wsj/local/chain/tunning/run_tdnn_1d.sh to be +# This was modified from wsj/local/chain/tunning/run_tdnn_1e.sh to be # used in Chime4. 
#This is the result using all 6 channels: -# exp/chain/tdnn1a_sp/best_wer_beamformit_5mics.result +# exp/chain/tdnn1a_sp/best_wer_blstm_gev.result # ------------------- -# best overall dt05 WER 6.04% (language model weight = 9) +# best overall dt05 WER 4.34% (language model weight = 7) # ------------------- -# dt05_simu WER: 6.25% (Average), 5.71% (BUS), 6.92% (CAFE), 5.37% (PEDESTRIAN), 7.02% (STREET) +# dt05_simu WER: 4.46% (Average), 4.12% (BUS), 5.29% (CAFE), 4.00% (PEDESTRIAN), 4.42% (STREET) # ------------------- -# dt05_real WER: 5.83% (Average), 7.48% (BUS), 5.28% (CAFE), 4.43% (PEDESTRIAN), 6.13% (STREET) +# dt05_real WER: 4.21% (Average), 4.94% (BUS), 4.07% (CAFE), 3.81% (PEDESTRIAN), 4.04% (STREET) # ------------------- -# et05_simu WER: 10.30% (Average), 7.34% (BUS), 10.37% (CAFE), 10.05% (PEDESTRIAN), 13.43% (STREET) +# et05_simu WER: 5.50% (Average), 4.78% (BUS), 5.86% (CAFE), 5.51% (PEDESTRIAN), 5.83% (STREET) # ------------------- -# et05_real WER: 9.67% (Average), 12.71% (BUS), 8.33% (CAFE), 8.20% (PEDESTRIAN), 9.45% (STREET) +# et05_real WER: 5.78% (Average), 6.82% (BUS), 5.10% (CAFE), 5.70% (PEDESTRIAN), 5.51% (STREET) # ------------------- # Final train prob -0.080 # Final valid prob -0.075 @@ -32,9 +32,7 @@ set -e -o pipefail stage=1 nj=30 train=noisy -enhan=$1 train_set=tr05_multi_${train} -test_sets="dt05_real_$enhan dt05_simu_$enhan et05_real_$enhan et05_simu_$enhan" gmm=tri3b_tr05_multi_${train} # this is the source gmm-dir that we'll use for alignments; it # should have alignments for the specified training data. num_threads_ubm=32 @@ -57,11 +55,11 @@ chunk_right_context=0 # training options srand=0 -remove_egs=false +remove_egs=true #decode options test_online_decoding=false # if true, it will run the last decoding stage. - +decode_only=false # if true, it wouldn't train a model again and will only do decoding # End configuration section. echo "$0 $@" # Print the command line for logging @@ -70,6 +68,8 @@ echo "$0 $@" # Print the command line for logging . ./path.sh . ./utils/parse_options.sh +enhan=$1 +test_sets="dt05_real_$enhan dt05_simu_$enhan et05_real_$enhan et05_simu_$enhan" if ! 
cuda-compiled; then cat < $dir/configs/network.xconfig @@ -187,18 +232,18 @@ if [ $stage -le 15 ]; then fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat # the first splicing is moved before the lda layer, so no splicing here - relu-batchnorm-layer name=tdnn1 dim=750 - relu-batchnorm-layer name=tdnn2 dim=750 input=Append(-1,0,1) - relu-batchnorm-layer name=tdnn3 dim=750 - relu-batchnorm-layer name=tdnn4 dim=750 input=Append(-1,0,1) - relu-batchnorm-layer name=tdnn5 dim=750 - relu-batchnorm-layer name=tdnn6 dim=750 input=Append(-3,0,3) - relu-batchnorm-layer name=tdnn7 dim=750 input=Append(-3,0,3) - relu-batchnorm-layer name=tdnn8 dim=750 input=Append(-6,-3,0) + relu-batchnorm-layer name=tdnn1 $opts dim=850 + relu-batchnorm-layer name=tdnn2 $opts dim=850 input=Append(-1,0,1) + relu-batchnorm-layer name=tdnn3 $opts dim=850 + relu-batchnorm-layer name=tdnn4 $opts dim=850 input=Append(-1,0,1) + relu-batchnorm-layer name=tdnn5 $opts dim=850 + relu-batchnorm-layer name=tdnn6 $opts dim=850 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn7 $opts dim=850 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn8 $opts dim=850 input=Append(-6,-3,0) ## adding the layers for chain branch - relu-batchnorm-layer name=prefinal-chain dim=750 target-rms=0.5 - output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + relu-batchnorm-layer name=prefinal-chain $opts dim=850 target-rms=0.5 + output-layer name=output $output_opts include-log-softmax=false dim=$num_targets max-change=1.5 # adding the layers for xent branch # This block prints the configs for a separate output that will be @@ -209,8 +254,8 @@ if [ $stage -le 15 ]; then # final-layer learns at a rate independent of the regularization # constant; and the 0.5 was tuned so as to make the relative progress # similar in the xent and regular final layers. 
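As a rough sketch (outside the patch): the 0.5 referred to above normally enters through a learning_rate_factor variable defined earlier in the script, on a line not visible in this hunk; assuming the usual chain-recipe default of xent_regularize=0.1, the xent output layer gets a factor of 5.0:

  xent_regularize=0.1                                                   # assumed default
  learning_rate_factor=$(echo "print(0.5/$xent_regularize)" | python3)
  echo "learning-rate-factor for output-xent: $learning_rate_factor"    # prints 5.0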
- relu-batchnorm-layer name=prefinal-xent input=tdnn8 dim=750 target-rms=0.5 - output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + relu-batchnorm-layer name=prefinal-xent $opts input=tdnn8 dim=850 target-rms=0.5 + output-layer name=output-xent $output_opts dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 EOF steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ fi @@ -221,7 +266,12 @@ if [ $stage -le 16 ]; then utils/create_split_dir.pl \ /export/b0{3,4,5,6}/$USER/kaldi-data/egs/chime4-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage fi - + + cat $train_data_dir/utt2uniq | awk -F' ' '{print $1}' > $train_data_dir/utt2uniq.tmp1 + cat $train_data_dir/utt2uniq | awk -F' ' '{print $2}' | sed -e 's/\....//g' | sed -e 's/\_CH.//g' | sed -e 's/\_enhan//g' > $train_data_dir/utt2uniq.tmp2 + paste -d" " $train_data_dir/utt2uniq.tmp1 $train_data_dir/utt2uniq.tmp2 > $train_data_dir/utt2uniq + rm -rf $train_data_dir/utt2uniq.tmp{1,2} + steps/nnet3/chain/train.py --stage=$train_stage \ --cmd="$decode_cmd" \ --feat.online-ivector-dir=$train_ivector_dir \ @@ -233,16 +283,17 @@ if [ $stage -le 16 ]; then --chain.lm-opts="--num-extra-lm-states=2000" \ --trainer.srand=$srand \ --trainer.max-param-change=2.0 \ - --trainer.num-epochs=6 \ + --trainer.num-epochs=12 \ --trainer.frames-per-iter=3000000 \ --trainer.optimization.num-jobs-initial=2 \ - --trainer.optimization.num-jobs-final=5 \ - --trainer.optimization.initial-effective-lrate=0.003 \ - --trainer.optimization.final-effective-lrate=0.0003 \ + --trainer.optimization.num-jobs-final=12 \ + --trainer.optimization.initial-effective-lrate=0.005 \ + --trainer.optimization.final-effective-lrate=0.0005 \ --trainer.optimization.shrink-value=1.0 \ - --trainer.optimization.proportional-shrink=60.0 \ --trainer.num-chunk-per-minibatch=128,64 \ --trainer.optimization.momentum=0.0 \ + --trainer.optimization.backstitch-training-scale=0.3 \ + --trainer.optimization.backstitch-training-interval=1 \ --egs.chunk-width=$chunk_width \ --egs.chunk-left-context=0 \ --egs.chunk-right-context=0 \ @@ -280,8 +331,11 @@ if [ $stage -le 18 ]; then for data in $test_sets; do ( + utils/data/modify_speaker_info.sh --seconds-per-spk-max 200 \ + data/${data}_hires data/${data}_chunked + data_affix=$(echo $data | sed s/test_//) - nspk=$(wc -l $dir/configs/network.xconfig diff --git a/egs/chime4/s5_1ch/local/chime4_calc_wers_looped.sh b/egs/chime4/s5_1ch/local/chime4_calc_wers_looped.sh index 9fe4a20f43a..84bb2cb8dbd 100755 --- a/egs/chime4/s5_1ch/local/chime4_calc_wers_looped.sh +++ b/egs/chime4/s5_1ch/local/chime4_calc_wers_looped.sh @@ -82,4 +82,4 @@ for e_d in $tasks; do | utils/int2sym.pl -f 2- $graph_dir/words.txt \ | sed s:\::g done -done \ No newline at end of file +done diff --git a/egs/chime4/s5_1ch/local/compute_pesq.sh b/egs/chime4/s5_1ch/local/compute_pesq.sh new file mode 100755 index 00000000000..1d290a4893f --- /dev/null +++ b/egs/chime4/s5_1ch/local/compute_pesq.sh @@ -0,0 +1,51 @@ +#!/bin/bash +# Copyright 2017 Johns Hopkins University (Author: Aswin Shanmugam Subramanian) +# Apache 2.0 + +# This script creates the average PESQ score of files in an enhanced directory with corresponding +# files in a reference directory. +# Expects the PESQ third party executable in "local/PESQ" +# PESQ source was dowloaded and compiled using "local/download_se_eval_tool.sh" +# Eg. 
local/compute_pesq.sh blstm_gev enhan/blstm_gev local/nn-gev/data/audio/16kHz/isolated_ext $PWD + +set -e +set -u +set -o pipefail + +if [ $# != 4 ]; then + echo "Wrong #arguments ($#, expected 4)" + echo "Usage: local/compute_pesq.sh " + exit 1; +fi + +enhancement_method=$1 +enhancement_directory=$2 +chime_rir_directory=$3 +modeldir=$4 + +expdir=$modeldir/exp/compute_pesq_${enhancement_method} +mkdir -p $expdir +pushd $expdir +ls $enhancement_directory/et05_*_simu/*.wav > $expdir/et05_files +ls $enhancement_directory/dt05_*_simu/*.wav > $expdir/dt05_files + +for set in "dt05" "et05" +do +declare -i n_files=0 +t_mos=0 +avg_mos=0 + while read filename; do + n_files=$n_files+1 + target_filename=`echo $filename | rev | cut -d"/" -f1 | rev` + speaker=`echo $target_filename | cut -d"_" -f1` + utt_id=`echo $target_filename | cut -d"_" -f2` + noise_cap=`echo $target_filename | cut -d"_" -f3 | cut -d"." -f1` + noise=`echo "$noise_cap" | awk '{ print tolower($1) }'` + temp=`$modeldir/local/PESQ +16000 ../../$chime_rir_directory/"$set"_"$noise"_simu/"$speaker"_"$utt_id"_"$noise_cap".CH5.Clean.wav $filename` + pesq_score=`echo $temp | rev | cut -d " " -f1 | rev` + t_mos=$(awk "BEGIN {print $t_mos+$pesq_score; exit}") + done <$expdir/"$set"_files +avg_mos=$(awk "BEGIN {print $t_mos/$n_files; exit}") +echo $avg_mos>"$expdir"/pesq_"$set" +done +popd diff --git a/egs/chime4/s5_1ch/local/compute_stoi_estoi_sdr.sh b/egs/chime4/s5_1ch/local/compute_stoi_estoi_sdr.sh new file mode 100755 index 00000000000..b7627560b67 --- /dev/null +++ b/egs/chime4/s5_1ch/local/compute_stoi_estoi_sdr.sh @@ -0,0 +1,40 @@ +#!/bin/bash +# Copyright 2017 Johns Hopkins University (Author: Aswin Shanmugam Subramanian) +# Apache 2.0 + +# This script creates the list of enhanced files and reference files and calls the +# matlab script "stoi_estoi_sdr.m" to get STOI, eSTOI and SDR scores +# Eg. local/compute_stoi_estoi_sdr.sh --njobs 10 blstm_gev enhan/blstm_gev local/nn-gev/data/audio/16kHz/isolated_ext + +. ./cmd.sh +. ./path.sh +set -e +set -u +set -o pipefail + +njobs=10 +cmd=run.pl + +. 
utils/parse_options.sh || exit 1; + +if [ $# != 3 ]; then + echo "Wrong #arguments ($#, expected 3)" + echo "Usage: local/compute_stoi_estoi_sdr.sh [options] " + echo "options" + echo " --njobs # number of parallel jobs" + echo " --cmd # Command to run in parallel with" + exit 1; +fi + +enhancement_method=$1 +enhancement_directory=$2 +chime_rir_directory=$3 + +expdir=exp/compute_stoi_estoi_sdr_${enhancement_method} +mkdir -p $expdir +ls $chime_rir_directory/dt05_*/*CH5.Clean.wav > $expdir/original_list +ls $enhancement_directory/dt05_*simu/*.wav > $expdir/enhanced_list +$cmd $expdir/compute_stoi_estoi_sdr_dt05.log matlab -nodisplay -nosplash -r "addpath('local'); stoi_estoi_sdr($njobs,'$enhancement_method','$expdir','dt05');exit" +ls $chime_rir_directory/et05_*/*CH5.Clean.wav > $expdir/original_list +ls $enhancement_directory/et05_*simu/*.wav > $expdir/enhanced_list +$cmd $expdir/compute_stoi_estoi_sdr_et05.log matlab -nodisplay -nosplash -r "addpath('local'); stoi_estoi_sdr($njobs,'$enhancement_method','$expdir','et05');exit" diff --git a/egs/chime4/s5_1ch/local/download_se_eval_tool.sh b/egs/chime4/s5_1ch/local/download_se_eval_tool.sh new file mode 100755 index 00000000000..ddd86a03d8a --- /dev/null +++ b/egs/chime4/s5_1ch/local/download_se_eval_tool.sh @@ -0,0 +1,13 @@ +#!/bin/bash +# Copyright 2017 Johns Hopkins University (Author: Aswin Shanmugam Subramanian) +# Apache 2.0 + +wget http://bass-db.gforge.inria.fr/bss_eval/bss_eval_sources.m -O local/bss_eval_sources.m +wget https://github.com/JacobD10/SoundZone_Tools/raw/master/stoi.m -O local/stoi.m +wget https://github.com/JacobD10/SoundZone_Tools/raw/master/estoi.m -O local/estoi.m +wget 'https://www.itu.int/rec/dologin_pub.asp?lang=e&id=T-REC-P.862-200102-I!!SOFT-ZST-E&type=items' -O PESQ.zip +unzip PESQ.zip -d local/PESQ_sources +cd local/PESQ_sources/P862/Software/source +gcc *.c -lm -o PESQ +cd ../../../../../ +mv local/PESQ_sources/P862/Software/source/PESQ local/ diff --git a/egs/chime4/s5_1ch/local/fix_read_sim_from_different_directory.patch b/egs/chime4/s5_1ch/local/fix_read_sim_from_different_directory.patch new file mode 100644 index 00000000000..46121357c5e --- /dev/null +++ b/egs/chime4/s5_1ch/local/fix_read_sim_from_different_directory.patch @@ -0,0 +1,244 @@ +diff --git a/beamform.py b/beamform.py +index 02eeed8..070c76d 100644 +--- a/beamform.py ++++ b/beamform.py +@@ -6,9 +6,10 @@ from chainer import Variable + from chainer import cuda + from chainer import serializers + from tqdm import tqdm ++import sys + +-from chime_data import gen_flist_simu, \ +- gen_flist_real, get_audio_data, get_audio_data_with_context ++from chime_data import gen_flist_simu, gen_flist_2ch,\ ++ gen_flist_real, get_audio_data, get_audio_data_1ch, get_audio_data_with_context + from fgnt.beamforming import gev_wrapper_on_masks + from fgnt.signal_processing import audiowrite, stft, istft + from fgnt.utils import Timer +@@ -20,6 +21,8 @@ parser.add_argument('flist', + help='Name of the flist to process (e.g. 
tr05_simu)') + parser.add_argument('chime_dir', + help='Base directory of the CHiME challenge.') ++parser.add_argument('sim_dir', ++ help='Base directory of the CHiME challenge simulated data.') + parser.add_argument('output_dir', + help='The directory where the enhanced wav files will ' + 'be stored.') +@@ -29,6 +32,10 @@ parser.add_argument('model_type', + help='Type of model (BLSTM or FW)') + parser.add_argument('--gpu', '-g', default=-1, type=int, + help='GPU ID (negative value indicates CPU)') ++parser.add_argument('--single', '-s', default=0, type=int, ++ help='0 for multi-channel and channel number (1-6) for single channel') ++parser.add_argument('--track', '-t', default=6, type=int, ++ help='1, 2 or 6 depending on the data used') + args = parser.parse_args() + + # Prepare model +@@ -48,11 +55,35 @@ xp = np if args.gpu < 0 else cuda.cupy + stage = args.flist[:2] + scenario = args.flist.split('_')[-1] + ++if stage == 'tr' and (args.track == 1 or args.track == 2): ++ print("No train data for 1ch track and 2ch track"); ++ sys.exit(0); ++ + # CHiME data handling + if scenario == 'simu': +- flist = gen_flist_simu(args.chime_dir, stage) ++ if args.track == 6: ++ flist = gen_flist_simu(args.chime_dir, args.sim_dir, stage) ++ elif args.track == 2: ++ flist = gen_flist_2ch(args.chime_dir, stage, scenario) ++ elif args.track == 1: ++ flist = list() ++ for env in ['caf', 'bus', 'str', 'ped']: ++ flist_temp = os.listdir(os.path.join(args.chime_dir, 'audio', '16kHz', 'isolated_1ch_track', '{}05_{}_{}'.format(stage, env, scenario))) ++ flist_ext = [i for i in flist_temp if i.endswith('.wav')] ++ flist_with_dir = [os.path.join(args.chime_dir, 'audio', '16kHz', 'isolated_1ch_track', '{}05_{}_{}'.format(stage, env, scenario), i) for i in flist_ext] ++ flist = flist + flist_with_dir + elif scenario == 'real': +- flist = gen_flist_real(args.chime_dir, stage) ++ if args.track == 6: ++ flist = gen_flist_real(args.chime_dir, stage) ++ elif args.track == 2: ++ flist = gen_flist_2ch(args.chime_dir, stage, scenario) ++ elif args.track == 1: ++ flist = list() ++ for env in ['caf', 'bus', 'str', 'ped']: ++ flist_temp = os.listdir(os.path.join(args.chime_dir, 'audio', '16kHz', 'isolated_1ch_track', '{}05_{}_{}'.format(stage, env, scenario))) ++ flist_ext = [i for i in flist_temp if i.endswith('.wav')] ++ flist_with_dir = [os.path.join(args.chime_dir, 'audio', '16kHz', 'isolated_1ch_track', '{}05_{}_{}'.format(stage, env, scenario), i) for i in flist_ext] ++ flist = flist + flist_with_dir + else: + raise ValueError('Unknown flist {}'.format(args.flist)) + +@@ -67,12 +98,19 @@ t_beamform = 0 + # Beamform loop + for cur_line in tqdm(flist): + with Timer() as t: +- if scenario == 'simu': ++ if args.track == 6: ++ if scenario == 'simu': ++ audio_data = get_audio_data(cur_line) ++ context_samples = 0 ++ elif scenario == 'real': ++ audio_data, context_samples = get_audio_data_with_context( ++ cur_line[0], cur_line[1], cur_line[2]) ++ elif args.track == 2: + audio_data = get_audio_data(cur_line) + context_samples = 0 +- elif scenario == 'real': +- audio_data, context_samples = get_audio_data_with_context( +- cur_line[0], cur_line[1], cur_line[2]) ++ elif args.track == 1: ++ audio_data = get_audio_data_1ch(cur_line) ++ context_samples = 0 + t_io += t.msecs + Y = stft(audio_data, time_dim=1).transpose((1, 0, 2)) + Y_var = Variable(np.abs(Y).astype(np.float32), True) +@@ -85,28 +123,45 @@ for cur_line in tqdm(flist): + t_net += t.msecs + + with Timer() as t: +- N_mask = np.median(N_masks.data, axis=1) +- X_mask = 
np.median(X_masks.data, axis=1) +- Y_hat = gev_wrapper_on_masks(Y, N_mask, X_mask) ++ if args.single >= 1 or args.track == 1: ++ Y_hat = X_masks.data * Y ++ elif args.single == 0: ++ N_mask = np.median(N_masks.data, axis=1) ++ X_mask = np.median(X_masks.data, axis=1) ++ Y_hat = gev_wrapper_on_masks(Y, N_mask, X_mask) + t_beamform += t.msecs + +- if scenario == 'simu': +- wsj_name = cur_line.split('/')[-1].split('_')[1] +- spk = cur_line.split('/')[-1].split('_')[0] +- env = cur_line.split('/')[-1].split('_')[-1] +- elif scenario == 'real': +- wsj_name = cur_line[3] +- spk = cur_line[0].split('/')[-1].split('_')[0] +- env = cur_line[0].split('/')[-1].split('_')[-1] ++ if args.track == 1: ++ env = cur_line.split('/')[-1].split('_')[2].split('.')[0] ++ filename = os.path.join(args.output_dir, '{}05_{}_{}'.format(stage, env.lower(), scenario), os.path.basename(cur_line)) ++ else: ++ if scenario == 'simu' or args.track == 2: ++ wsj_name = cur_line.split('/')[-1].split('_')[1] ++ spk = cur_line.split('/')[-1].split('_')[0] ++ env = cur_line.split('/')[-1].split('_')[-1] ++ elif scenario == 'real': ++ wsj_name = cur_line[3] ++ spk = cur_line[0].split('/')[-1].split('_')[0] ++ env = cur_line[0].split('/')[-1].split('_')[-1] + +- filename = os.path.join( +- args.output_dir, +- '{}05_{}_{}'.format(stage, env.lower(), scenario), +- '{}_{}_{}.wav'.format(spk, wsj_name, env.upper()) +- ) +- with Timer() as t: +- audiowrite(istft(Y_hat)[context_samples:], filename, 16000, True, True) +- t_io += t.msecs ++ filename = os.path.join( ++ args.output_dir, ++ '{}05_{}_{}'.format(stage, env.lower(), scenario), ++ '{}_{}_{}.wav'.format(spk, wsj_name, env.upper()) ++ ) ++ if args.track == 1: ++ with Timer() as t: ++ audiowrite(istft(Y_hat[:,0,:])[int(context_samples):], filename, 16000, True, True) ++ t_io += t.msecs ++ elif args.single == 0: ++ with Timer() as t: ++ audiowrite(istft(Y_hat)[int(context_samples):], filename, 16000, True, True) ++ t_io += t.msecs ++ elif args.single >= 1: ++ ch = args.single ++ with Timer() as t: ++ audiowrite(istft(Y_hat[:,ch-1,:])[int(context_samples):], filename, 16000, True, True) ++ t_io += t.msecs + + print('Finished') + print('Timings: I/O: {:.2f}s | Net: {:.2f}s | Beamformer: {:.2f}s'.format( +diff --git a/beamform.sh b/beamform.sh +index 3c7de5a..aaae10d 100755 +--- a/beamform.sh ++++ b/beamform.sh +@@ -1,5 +1,5 @@ + #!/usr/bin/env bash + + for flist in tr05_simu tr05_real dt05_simu dt05_real et05_simu et05_real; do +- python beamform.py $flist "$@" +-done +\ No newline at end of file ++ $HOME/miniconda3/bin/python local/nn-gev/beamform.py $flist "$@" ++done +diff --git a/chime_data.py b/chime_data.py +index 0072e1b..641d9d3 100644 +--- a/chime_data.py ++++ b/chime_data.py +@@ -11,7 +11,7 @@ from fgnt.signal_processing import stft + from fgnt.utils import mkdir_p + + +-def gen_flist_simu(chime_data_dir, stage, ext=False): ++def gen_flist_simu(chime_data_dir, dest_dir, stage, ext=False): + with open(os.path.join( + chime_data_dir, 'annotations', + '{}05_{}.json'.format(stage, 'simu'))) as fid: +@@ -21,7 +21,7 @@ def gen_flist_simu(chime_data_dir, stage, ext=False): + else: + isolated_dir = 'isolated' + flist = [os.path.join( +- chime_data_dir, 'audio', '16kHz', isolated_dir, ++ dest_dir, 'audio', '16kHz', isolated_dir, + '{}05_{}_{}'.format(stage, a['environment'].lower(), 'simu'), + '{}_{}_{}'.format(a['speaker'], a['wsj_name'], a['environment'])) + for a in annotations] +@@ -39,11 +39,33 @@ def gen_flist_real(chime_data_dir, stage): + return flist_tuples + + ++def 
gen_flist_2ch(chime_data_dir, stage, scenario): ++ with open(os.path.join( ++ chime_data_dir, 'annotations', ++ '{}05_{}.json'.format(stage, scenario))) as fid: ++ annotations = json.load(fid) ++ flist = [os.path.join( ++ chime_data_dir, 'audio', '16kHz', 'isolated_2ch_track', ++ '{}05_{}_{}'.format(stage, a['environment'].lower(), scenario), ++ '{}_{}_{}'.format(a['speaker'], a['wsj_name'], a['environment'])) ++ for a in annotations] ++ return flist ++ ++ ++def get_audio_data_1ch(filename): ++ audio_data = list() ++ audio_data.append(audioread(filename)[None, :]) ++ audio_data = np.concatenate(audio_data, axis=0) ++ audio_data = audio_data.astype(np.float32) ++ return audio_data ++ ++ + def get_audio_data(file_template, postfix='', ch_range=range(1, 7)): + audio_data = list() + for ch in ch_range: +- audio_data.append(audioread( +- file_template + '.CH{}{}.wav'.format(ch, postfix))[None, :]) ++ if os.path.isfile(file_template + '.CH{}{}.wav'.format(ch, postfix)): ++ audio_data.append(audioread( ++ file_template + '.CH{}{}.wav'.format(ch, postfix))[None, :]) + audio_data = np.concatenate(audio_data, axis=0) + audio_data = audio_data.astype(np.float32) + return audio_data +@@ -65,7 +87,7 @@ def get_audio_data_with_context(embedded_template, t_start, t_end, + + def prepare_training_data(chime_data_dir, dest_dir): + for stage in ['tr', 'dt']: +- flist = gen_flist_simu(chime_data_dir, stage, ext=True) ++ flist = gen_flist_simu(chime_data_dir, dest_dir, stage, ext=True) + export_flist = list() + mkdir_p(os.path.join(dest_dir, stage)) + for f in tqdm.tqdm(flist, desc='Generating data for {}'.format(stage)): diff --git a/egs/chime4/s5_1ch/local/real_noisy_chime4_data_prep.sh b/egs/chime4/s5_1ch/local/real_noisy_chime4_data_prep.sh index edbbfd41e69..0173b022176 100755 --- a/egs/chime4/s5_1ch/local/real_noisy_chime4_data_prep.sh +++ b/egs/chime4/s5_1ch/local/real_noisy_chime4_data_prep.sh @@ -68,10 +68,14 @@ if $eval_flag; then cp $trans_dir/et05_real.dot_all et05_real.dot fi -# make a scp file from file list +# make a scp temporary file from file list for x in $list_set; do - cat $x.flist | awk -F'[/]' '{print $NF}'| sed -e 's/\.wav/_REAL/' > ${x}_wav.ids - paste -d" " ${x}_wav.ids $x.flist | sort -k 1 > ${x}_wav.scp + cat $x.flist | awk -F'[/]' '{print $NF}'| sed -e 's/\.wav/_REAL/' > ${x}_wav.id.temp + cat ${x}_wav.id.temp | awk -F'_' '{print $3}' | awk -F'.' '{print $2}' > $x.ch + cat ${x}_wav.id.temp | awk -F'_' '{print $1}' > $x.part1 + cat ${x}_wav.id.temp | sed -e 's/^..._//' > $x.part2 + paste -d"_" $x.part1 $x.ch $x.part2 > ${x}_wav.ids + paste -d" " ${x}_wav.ids $x.flist | sort -t_ -k1,1 -k3 > ${x}_wav.scp.temp done #make a transcription from dot @@ -98,13 +102,17 @@ fi # data-preparation stage independent of the specific lexicon used. noiseword=""; for x in $list_set;do + cat ${x}_wav.scp.temp | awk '{print $1}' > $x.txt.part1 + cat $x.trans1 | awk '{$1=""; print $0}' | sed 's/^[ \t]*//g' > $x.txt.part2 + paste -d" " $x.txt.part1 $x.txt.part2 > $x.trans1 cat $x.trans1 | $local/normalize_transcript.pl $noiseword \ | sort > $x.txt || exit 1; done # Make the utt2spk and spk2utt files. 
for x in $list_set; do - cat ${x}_wav.scp | awk -F'_' '{print $1}' > $x.spk + sort ${x}_wav.scp.temp > ${x}_wav.scp + cat ${x}_wav.scp | awk -F'_' '{print $1"_"$2}' > $x.spk cat ${x}_wav.scp | awk '{print $1}' > $x.utt paste -d" " $x.utt $x.spk > $x.utt2spk cat $x.utt2spk | $utils/utt2spk_to_spk2utt.pl > $x.spk2utt || exit 1; @@ -119,4 +127,8 @@ for x in $list_set; do cp ${x}.utt2spk ../../$x/utt2spk || exit 1; done +# clean up temp files +rm *.temp +rm *.part{1,2} + echo "Data preparation succeeded" diff --git a/egs/chime4/s5_1ch/local/rnnlm/run_lstm.sh b/egs/chime4/s5_1ch/local/rnnlm/run_lstm.sh new file mode 120000 index 00000000000..c53740399ce --- /dev/null +++ b/egs/chime4/s5_1ch/local/rnnlm/run_lstm.sh @@ -0,0 +1 @@ +tuning/run_lstm_1a.sh \ No newline at end of file diff --git a/egs/chime4/s5_1ch/local/rnnlm/run_lstm_back.sh b/egs/chime4/s5_1ch/local/rnnlm/run_lstm_back.sh new file mode 100755 index 00000000000..76e2b563e6b --- /dev/null +++ b/egs/chime4/s5_1ch/local/rnnlm/run_lstm_back.sh @@ -0,0 +1,93 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (author: Daniel Povey) +# 2015 Guoguo Chen +# 2017 Hainan Xu +# 2017 Szu-Jui Chen + +# This script trains LMs on the reversed Chime4 data, which we +# call it backward model. + +# Begin configuration section. +affix=1a +dir=exp/rnnlm_lstm_${affix}_back +embedding_dim=2048 +lstm_rpd=512 +lstm_nrpd=512 +stage=-10 +train_stage=-10 + +# variables for lattice rescoring +ngram_order=4 # approximate the lattice-rescoring by limiting the max-ngram-order + # if it's set, it merges histories in the lattice if they share + # the same ngram history and this prevents the lattice from + # exploding exponentially + +. cmd.sh +. utils/parse_options.sh + +srcdir=data/local/local_lm +lexicon=data/local/dict/lexiconp.txt +text_dir=data/rnnlm/text_nosp_${affix}_back +mkdir -p $dir/config +set -e + +for f in $lexicon; do + [ ! -f $f ] && \ + echo "$0: expected file $f to exist; search for local/wsj_extend_dict.sh in run.sh" && exit 1 +done + +#prepare training and dev data +if [ $stage -le 0 ]; then + mkdir -p $text_dir + cat $srcdir/train.rnn | awk '{for(i=NF;i>0;i--) printf("%s ",$i); print""}'> $text_dir/chime4.txt.tmp + sed -e "s///g" $text_dir/chime4.txt.tmp > $text_dir/chime4.txt + rm $text_dir/chime4.txt.tmp + cat $srcdir/valid.rnn | awk '{for(i=NF;i>0;i--) printf("%s ",$i); print""}'> $text_dir/dev.txt +fi + +if [ $stage -le 1 ]; then + cp data/lang_chain/words.txt $dir/config/words.txt + n=`cat $dir/config/words.txt | wc -l` + echo " $n" >> $dir/config/words.txt + # words that are not present in words.txt but are in the training or dev data, will be + # mapped to during training. + echo "" >$dir/config/oov.txt + + cat > $dir/config/data_weights.txt <$dir/config/unigram_probs.txt + + # choose features + rnnlm/choose_features.py --unigram-probs=$dir/config/unigram_probs.txt \ + --use-constant-feature=true \ + --special-words=',,,' \ + $dir/config/words.txt > $dir/config/features.txt + + cat >$dir/config/xconfig <//g" $text_dir/chime4.txt.tmp > $text_dir/chime4.txt + cp $srcdir/valid.rnn $text_dir/dev.txt +fi + +if [ $stage -le 1 ]; then + cp data/lang_chain/words.txt $dir/config/words.txt + n=`cat $dir/config/words.txt | wc -l` + echo " $n" >> $dir/config/words.txt + # words that are not present in words.txt but are in the training or dev data, will be + # mapped to during training. 
+ echo "" >$dir/config/oov.txt + + cat > $dir/config/data_weights.txt <$dir/config/unigram_probs.txt + + # choose features + rnnlm/choose_features.py --unigram-probs=$dir/config/unigram_probs.txt \ + --use-constant-feature=true \ + --special-words=',,,' \ + $dir/config/words.txt > $dir/config/features.txt + + cat >$dir/config/xconfig < $tgtdir/best_wer_${enhan}_${decode_dir_suffix}.result + head -n 15 $tgtdir/best_wer_${enhan}_${decode_dir_suffix}.result +fi + +nbest=100 +rnnweight=0.8 +if [ $stage -le 6 ] && $run_nbest_rescore; then + echo "$0: Perform nbest-rescoring on $ac_model_dir" + for decode_set in dt05_real dt05_simu et05_real et05_simu; do + decode_dir=$tgtdir/decode_tgpr_5k_${decode_set}_${enhan}_${LM} + ( + # Lattice rescoring + rnnlm/lmrescore_nbest.sh \ + --cmd "$train_cmd --mem 2G" --N $nbest \ + $rnnweight data/lang_test_$LM $dir \ + data/${decode_set}_${enhan}_chunked ${decode_dir} \ + $tgtdir/decode_tgpr_5k_${decode_set}_${enhan}_${decode_dir_suffix}_w${rnnweight}_n${nbest} + + if $use_backward_model; then + rnnlm/lmrescore_nbest_back.sh \ + --cmd "$train_cmd --mem 2G" --N $nbest \ + $rnnweight data/lang_test_$LM ${dir}_back \ + data/${decode_set}_${enhan}_chunked \ + $tgtdir/decode_tgpr_5k_${decode_set}_${enhan}_${decode_dir_suffix}_w${rnnweight}_n${nbest} \ + $tgtdir/decode_tgpr_5k_${decode_set}_${enhan}_${decode_dir_suffix}_w${rnnweight}_n${nbest}_bi + fi + ) & + done + wait + # calc wers for nbest-rescoring results + if $use_backward_model; then + local/chime4_calc_wers.sh $tgtdir ${enhan}_${decode_dir_suffix}_w${rnnweight}_n${nbest}_bi \ + $tgtdir/graph_tgpr_5k \ + > $tgtdir/best_wer_${enhan}_${decode_dir_suffix}_w${rnnweight}_n${nbest}_bi.result + head -n 15 $tgtdir/best_wer_${enhan}_${decode_dir_suffix}_w${rnnweight}_n${nbest}_bi.result + else + local/chime4_calc_wers.sh $tgtdir ${enhan}_${decode_dir_suffix}_w${rnnweight}_n${nbest} \ + $tgtdir/graph_tgpr_5k \ + > $tgtdir/best_wer_${enhan}_${decode_dir_suffix}_w${rnnweight}_n${nbest}.result + head -n 15 $tgtdir/best_wer_${enhan}_${decode_dir_suffix}_w${rnnweight}_n${nbest}.result + fi +fi + +exit 0 diff --git a/egs/chime4/s5_1ch/local/run_blstm_gev.sh b/egs/chime4/s5_1ch/local/run_blstm_gev.sh new file mode 100755 index 00000000000..2ee92b70fbd --- /dev/null +++ b/egs/chime4/s5_1ch/local/run_blstm_gev.sh @@ -0,0 +1,81 @@ +#!/bin/bash +# Copyright 2017 Johns Hopkins University (Author: Aswin Shanmugam Subramanian) +# Apache 2.0 + +. ./cmd.sh +. ./path.sh + +# Config: +nj=10 +cmd=run.pl +track=6 +. utils/parse_options.sh || exit 1; + +if [ $# != 4 ]; then + echo "Wrong #arguments ($#, expected 4)" + echo "Usage: local/run_blstm_gev.sh [options] " + echo "main options (for others, see top of script file)" + echo " --nj # number of parallel jobs" + echo " --cmd # Command to run in parallel with" + echo " --track # Chime data to use (1, 2 or 6)" + exit 1; +fi + +sdir=$1 +chime3_dir=$2 +odir=$3 +enhancement_type=$4 + +# Set bash to 'debug' mode, it will exit on : +# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', +set -e +set -u +set -o pipefail + +miniconda_dir=$HOME/miniconda3/ +if [ ! -d $miniconda_dir ]; then + echo "$miniconda_dir does not exist. 
Please run '../../../tools/extras/install_miniconda.sh' and '../../../tools/extras/install_chainer.sh';" +fi + +# check if chainer is installed +result=`$HOME/miniconda3/bin/python -c "\ +try: + import chainer + print('1') +except ImportError: + print('0')"` + +if [ "$result" == "1" ]; then + echo "Chainer is installed" +else + echo "Chainer is not installed. Please run ../../../tools/extras/install_chainer.sh" +fi + +if [ ! -d local/nn-gev ]; then + cd local/ + git clone https://github.com/fgnt/nn-gev.git + cd nn-gev/ + git checkout 3a039a4b707419fab05deb9679b41360ea92d779 . + git apply ../fix_read_sim_from_different_directory.patch + cd ../../ +else + cd local/nn-gev/ + git checkout 3a039a4b707419fab05deb9679b41360ea92d779 . + git apply ../fix_read_sim_from_different_directory.patch + cd ../../ +fi + +mkdir -p $odir +set +e +n_isolated_dirs=`ls local/nn-gev/data/audio/16kHz/isolated/ 2>/dev/null | wc -l` +n_isolated_ext_dirs=`ls local/nn-gev/data/audio/16kHz/isolated_ext/ 2>/dev/null | wc -l` +set -e +if [[ "$n_isolated_dirs" -ne 12 || "$n_isolated_ext_dirs" -ne 12 ]];then + echo "generating simulation data and storing in local/nn-gev/data" + $cmd $odir/simulation.log matlab -nodisplay -nosplash -r "addpath('local'); CHiME3_simulate_data_patched_parallel(1,$nj,'$sdir','$chime3_dir');exit" +else + echo "Didn't run Matlab simulation. Using existing data in local/nn-gev/data/audio/" +fi + +echo "Training a BLSTM-based mask network and enhancing signals with mask-based GEV beamformer" +$cuda_cmd $odir/beamform.log local/run_nn-gev.sh $sdir $odir $enhancement_type $track diff --git a/egs/chime4/s5_1ch/local/run_dnn.sh b/egs/chime4/s5_1ch/local/run_dnn.sh deleted file mode 100755 index 2207574e71c..00000000000 --- a/egs/chime4/s5_1ch/local/run_dnn.sh +++ /dev/null @@ -1,237 +0,0 @@ -#!/bin/bash - -# Copyright 2016 University of Sheffield (Jon Barker, Ricard Marxer) -# Inria (Emmanuel Vincent) -# Mitsubishi Electric Research Labs (Shinji Watanabe) -# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - -# This script is made from the kaldi recipe of the 2nd CHiME Challenge Track 2 -# made by Chao Weng - -. ./path.sh -. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. - ## This relates to the queue. - -# Config: -nj=30 -stage=0 # resume training with --stage N -train=noisy -eval_flag=true # make it true when the evaluation data are released - -. utils/parse_options.sh || exit 1; - -# This is a shell script, but it's recommended that you run the commands one by -# one by copying and pasting into the shell. - -if [ $# -ne 1 ]; then - printf "\nUSAGE: %s \n\n" `basename $0` - echo "First argument specifies a unique name for different enhancement method" - exit 1; -fi - -# set enhanced data -enhan=$1 - -# Set bash to 'debug' mode, it will exit on : -# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', -set -e -set -u -set -o pipefail - -# check whether run_init is executed -if [ ! -d data/lang ]; then - echo "error, execute local/run_init.sh, first" - exit 1; -fi - -# check whether run_init is executed -if [ ! 
-d exp/tri3b_tr05_multi_${train} ]; then - echo "error, execute local/run_init.sh, first" - exit 1; -fi - -# get alignments -if [ $stage -le 0 ]; then - steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ - data/tr05_multi_${train} data/lang exp/tri3b_tr05_multi_${train} exp/tri3b_tr05_multi_${train}_ali - steps/align_fmllr.sh --nj 4 --cmd "$train_cmd" \ - data/dt05_multi_$enhan data/lang exp/tri3b_tr05_multi_${train} exp/tri3b_tr05_multi_${train}_ali_dt05 -fi - -# make fmllr feature for training multi = simu + real -gmmdir=exp/tri3b_tr05_multi_${train}_ali -data_fmllr=data-fmllr-tri3b -mkdir -p $data_fmllr -fmllrdir=fmllr-tri3b/${train} -if [ $stage -le 1 ]; then - for x in tr05_real_${train} tr05_simu_${train}; do - steps/nnet/make_fmllr_feats.sh --nj 4 --cmd "$train_cmd" \ - --transform-dir $gmmdir \ - $data_fmllr/$x data/$x $gmmdir exp/make_fmllr_tri3b/$x $fmllrdir - done -fi - -# make fmllr feature for dev and eval -gmmdir=exp/tri3b_tr05_multi_${train} -fmllrdir=fmllr-tri3b/$enhan -if [ $stage -le 2 ]; then - if $eval_flag; then - tasks="dt05_real_$enhan dt05_simu_$enhan et05_real_$enhan et05_simu_$enhan" - else - tasks="dt05_real_$enhan dt05_simu_$enhan" - fi - for x in $tasks; do - steps/nnet/make_fmllr_feats.sh --nj 4 --cmd "$train_cmd" \ - --transform-dir $gmmdir/decode_tgpr_5k_$x \ - $data_fmllr/$x data/$x $gmmdir exp/make_fmllr_tri3b/$x $fmllrdir - done -fi - -# make mixed training set from real and simulation enhanced data -# multi = simu + real -if [ $stage -le 3 ]; then - for data_dir in $data_fmllr/tr05_real_${train} $data_fmllr/tr05_simu_${train} $data_fmllr/dt05_real_$enhan $data_fmllr/dt05_simu_$enhan; do - utils/data/get_utt2dur.sh $data_dir - done - - utils/combine_data.sh $data_fmllr/tr05_multi_${train} $data_fmllr/tr05_simu_${train} $data_fmllr/tr05_real_${train} - utils/combine_data.sh $data_fmllr/dt05_multi_$enhan $data_fmllr/dt05_simu_$enhan $data_fmllr/dt05_real_$enhan - if $eval_flag; then - for data_dir in $data_fmllr/et05_real_$enhan $data_fmllr/et05_simu_$enhan; do - utils/data/get_utt2dur.sh $data_dir - done - utils/combine_data.sh $data_fmllr/et05_multi_$enhan $data_fmllr/et05_simu_$enhan $data_fmllr/et05_real_$enhan - fi -fi - -# pre-train dnn -dir=exp/tri4a_dnn_pretrain_tr05_multi_${train} -if [ $stage -le 4 ]; then - $cuda_cmd $dir/_pretrain_dbn.log \ - steps/nnet/pretrain_dbn.sh --nn-depth 7 --rbm-iter 3 $data_fmllr/tr05_multi_${train} $dir -fi - -# train dnn -dir=exp/tri4a_dnn_tr05_multi_${train} -ali=exp/tri3b_tr05_multi_${train}_ali -ali_dev=exp/tri3b_tr05_multi_${train}_ali_dt05 -feature_transform=exp/tri4a_dnn_pretrain_tr05_multi_${train}/final.feature_transform -dbn=exp/tri4a_dnn_pretrain_tr05_multi_${train}/7.dbn -if [ $stage -le 5 ]; then - $cuda_cmd $dir/_train_nnet.log \ - steps/nnet/train.sh --feature-transform $feature_transform --dbn $dbn --hid-layers 0 --learn-rate 0.008 \ - $data_fmllr/tr05_multi_${train} $data_fmllr/dt05_multi_$enhan data/lang $ali $ali_dev $dir -fi - -# decode enhanced speech -if [ $stage -le 6 ]; then - utils/mkgraph.sh data/lang_test_tgpr_5k $dir $dir/graph_tgpr_5k - steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --acwt 0.10 --config conf/decode_dnn.config \ - $dir/graph_tgpr_5k $data_fmllr/dt05_real_$enhan $dir/decode_tgpr_5k_dt05_real_$enhan & - steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --acwt 0.10 --config conf/decode_dnn.config \ - $dir/graph_tgpr_5k $data_fmllr/dt05_simu_$enhan $dir/decode_tgpr_5k_dt05_simu_$enhan & - if $eval_flag; then - steps/nnet/decode.sh --nj 4 
--num-threads 3 --cmd "$decode_cmd" --acwt 0.10 --config conf/decode_dnn.config \ - $dir/graph_tgpr_5k $data_fmllr/et05_real_$enhan $dir/decode_tgpr_5k_et05_real_$enhan & - steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --acwt 0.10 --config conf/decode_dnn.config \ - $dir/graph_tgpr_5k $data_fmllr/et05_simu_$enhan $dir/decode_tgpr_5k_et05_simu_$enhan & - fi - wait; -fi - -# Sequence training using sMBR criterion, we do Stochastic-GD -# with per-utterance updates. We use usually good acwt 0.1 -# Lattices are re-generated after 1st epoch, to get faster convergence. -dir=exp/tri4a_dnn_tr05_multi_${train}_smbr -srcdir=exp/tri4a_dnn_tr05_multi_${train} -acwt=0.1 - -# First we generate lattices and alignments: -# awk -v FS="/" '{ NF_nosuffix=$NF; sub(".gz","",NF_nosuffix); print NF_nosuffix gunzip -c "$0" |"; }' in -# steps/nnet/make_denlats.sh -if [ $stage -le 7 ]; then - steps/nnet/align.sh --nj $nj --cmd "$train_cmd" \ - $data_fmllr/tr05_multi_${train} data/lang $srcdir ${srcdir}_ali - steps/nnet/make_denlats.sh --nj $nj --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt $acwt \ - $data_fmllr/tr05_multi_${train} data/lang $srcdir ${srcdir}_denlats -fi - -# Re-train the DNN by 1 iteration of sMBR -if [ $stage -le 8 ]; then - steps/nnet/train_mpe.sh --cmd "$cuda_cmd" --num-iters 1 --acwt $acwt --do-smbr true \ - $data_fmllr/tr05_multi_${train} data/lang $srcdir ${srcdir}_ali ${srcdir}_denlats $dir -fi - -# Decode (reuse HCLG graph) -if [ $stage -le 9 ]; then - for ITER in 1; do - steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --config conf/decode_dnn.config \ - --nnet $dir/${ITER}.nnet --acwt $acwt \ - exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k $data_fmllr/dt05_real_${enhan} $dir/decode_tgpr_5k_dt05_real_${enhan}_it${ITER} & - steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --config conf/decode_dnn.config \ - --nnet $dir/${ITER}.nnet --acwt $acwt \ - exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k $data_fmllr/dt05_simu_${enhan} $dir/decode_tgpr_5k_dt05_simu_${enhan}_it${ITER} & - if $eval_flag; then - steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --config conf/decode_dnn.config \ - --nnet $dir/${ITER}.nnet --acwt $acwt \ - exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k $data_fmllr/et05_real_${enhan} $dir/decode_tgpr_5k_et05_real_${enhan}_it${ITER} & - steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --config conf/decode_dnn.config \ - --nnet $dir/${ITER}.nnet --acwt $acwt \ - exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k $data_fmllr/et05_simu_${enhan} $dir/decode_tgpr_5k_et05_simu_${enhan}_it${ITER} & - fi - done -fi - -# Re-generate lattices, run 4 more sMBR iterations -dir=exp/tri4a_dnn_tr05_multi_${train}_smbr_i1lats -srcdir=exp/tri4a_dnn_tr05_multi_${train}_smbr -acwt=0.1 - -# Generate lattices and alignments: -if [ $stage -le 10 ]; then - steps/nnet/align.sh --nj $nj --cmd "$train_cmd" \ - $data_fmllr/tr05_multi_${train} data/lang $srcdir ${srcdir}_ali - steps/nnet/make_denlats.sh --nj $nj --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt $acwt \ - $data_fmllr/tr05_multi_${train} data/lang $srcdir ${srcdir}_denlats -fi - -# Re-train the DNN by 4 iterations of sMBR -if [ $stage -le 11 ]; then - steps/nnet/train_mpe.sh --cmd "$cuda_cmd" --num-iters 4 --acwt $acwt --do-smbr true \ - $data_fmllr/tr05_multi_${train} data/lang $srcdir ${srcdir}_ali ${srcdir}_denlats $dir || exit 1 -fi - -# Decode (reuse HCLG graph) -if [ $stage -le 12 ]; then - for ITER in 1 2 3 4; do - 
steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --config conf/decode_dnn.config \ - --nnet $dir/${ITER}.nnet --acwt $acwt \ - exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k $data_fmllr/dt05_real_${enhan} $dir/decode_tgpr_5k_dt05_real_${enhan}_it${ITER} & - steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --config conf/decode_dnn.config \ - --nnet $dir/${ITER}.nnet --acwt $acwt \ - exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k $data_fmllr/dt05_simu_${enhan} $dir/decode_tgpr_5k_dt05_simu_${enhan}_it${ITER} & - if $eval_flag; then - steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --config conf/decode_dnn.config \ - --nnet $dir/${ITER}.nnet --acwt $acwt \ - exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k $data_fmllr/et05_real_${enhan} $dir/decode_tgpr_5k_et05_real_${enhan}_it${ITER} & - steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --config conf/decode_dnn.config \ - --nnet $dir/${ITER}.nnet --acwt $acwt \ - exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k $data_fmllr/et05_simu_${enhan} $dir/decode_tgpr_5k_et05_simu_${enhan}_it${ITER} & - fi - done - wait -fi - -# scoring -if [ $stage -le 13 ]; then - # decoded results of enhanced speech using DNN AMs trained with enhanced data - local/chime4_calc_wers.sh exp/tri4a_dnn_tr05_multi_${train} $enhan exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k \ - > exp/tri4a_dnn_tr05_multi_${train}/best_wer_$enhan.result - head -n 15 exp/tri4a_dnn_tr05_multi_${train}/best_wer_$enhan.result - # decoded results of enhanced speech using sequence-training DNN - ./local/chime4_calc_wers_smbr.sh exp/tri4a_dnn_tr05_multi_${train}_smbr_i1lats ${enhan} exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k \ - > exp/tri4a_dnn_tr05_multi_${train}_smbr_i1lats/best_wer_${enhan}.result - head -n 15 exp/tri4a_dnn_tr05_multi_${train}_smbr_i1lats/best_wer_${enhan}.result -fi - -echo "`basename $0` Done." diff --git a/egs/chime4/s5_1ch/local/run_dnn_recog.sh b/egs/chime4/s5_1ch/local/run_dnn_recog.sh deleted file mode 100755 index 5e6ade02387..00000000000 --- a/egs/chime4/s5_1ch/local/run_dnn_recog.sh +++ /dev/null @@ -1,143 +0,0 @@ -#!/bin/bash - -# Copyright 2016 University of Sheffield (Jon Barker, Ricard Marxer) -# Inria (Emmanuel Vincent) -# Mitsubishi Electric Research Labs (Shinji Watanabe) -# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - -# This script is made from the kaldi recipe of the 2nd CHiME Challenge Track 2 -# made by Chao Weng - -. ./path.sh -. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. - ## This relates to the queue. - -# Config: -nj=30 -stage=0 # resume training with --stage=N -train=noisy -eval_flag=true # make it true when the evaluation data are released - -. utils/parse_options.sh || exit 1; - -# This is a shell script, but it's recommended that you run the commands one by -# one by copying and pasting into the shell. - -if [ $# -ne 2 ]; then - printf "\nUSAGE: %s \n\n" `basename $0` - echo "First argument specifies a unique name for different enhancement method" - echo "Second argument specifies acoustic and language model directory" - exit 1; -fi - -# set enhanced data -enhan=$1 -# set model directory -mdir=$2 - -# Set bash to 'debug' mode, it will exit on : -# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', -set -e -set -u -set -o pipefail - -# check data/loca/data -if [ ! -d $mdir/data/local/data ]; then - echo "error, set $mdir correctly" - exit 1; -elif [ ! 
-d data/local/data ]; then - echo "copy $mdir/data/local/data" - mkdir -p data/local - cp -r $mdir/data/local/data data/local/ -fi - -# check gmm model -if [ ! -d $mdir/exp/tri3b_tr05_multi_${train} ]; then - echo "error, set $mdir correctly" - exit 1; -elif [ ! -d exp/tri3b_tr05_multi_${train} ]; then - echo "copy $mdir/exp/tri3b_tr05_multi_${train}" - mkdir -p exp - cp -r $mdir/exp/tri3b_tr05_multi_${train} exp/ -fi - -# check dnn graph -if [ ! -d $mdir/exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k ]; then - echo "error, set $mdir correctly" - exit 1; -elif [ ! -d exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k ]; then - echo "copy $mdir/exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k" - mkdir -p exp/tri4a_dnn_tr05_multi_${train} - cp -r $mdir/exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k exp/tri4a_dnn_tr05_multi_${train}/ -fi - -# check dnn smbr model -if [ ! -d $mdir/exp/tri4a_dnn_tr05_multi_${train}_smbr_i1lats ]; then - echo "error, set $mdir correctly" - exit 1; -elif [ ! -d exp/tri4a_dnn_tr05_multi_${train}_smbr_i1lats ]; then - echo "copy $mdir/exp/tri4a_dnn_tr05_multi_${train}_smbr_i1lats" - mkdir -p exp - cp -r $mdir/exp/tri4a_dnn_tr05_multi_${train}_smbr_i1lats exp/ -fi - -# make fmllr feature for dev and eval -gmmdir=exp/tri3b_tr05_multi_${train} -data_fmllr=data-fmllr-tri3b -mkdir -p $data_fmllr -fmllrdir=fmllr-tri3b/$enhan -if [ $stage -le 4 ]; then - if $eval_flag; then - tasks="dt05_real_$enhan dt05_simu_$enhan et05_real_$enhan et05_simu_$enhan" - else - tasks="dt05_real_$enhan dt05_simu_$enhan" - fi - for x in $tasks; do - steps/nnet/make_fmllr_feats.sh --nj 4 --cmd "$train_cmd" \ - --transform-dir $gmmdir/decode_tgpr_5k_$x \ - $data_fmllr/$x data/$x $gmmdir exp/make_fmllr_tri3b/$x $fmllrdir - done -fi - -# make mixed training set from real and simulation enhanced data -# multi = simu + real -if [ $stage -le 5 ]; then - utils/combine_data.sh $data_fmllr/dt05_multi_$enhan $data_fmllr/dt05_simu_$enhan $data_fmllr/dt05_real_$enhan - if $eval_flag; then - utils/combine_data.sh $data_fmllr/et05_multi_$enhan $data_fmllr/et05_simu_$enhan $data_fmllr/et05_real_$enhan - fi -fi - -# Re-generate lattices, run 4 more sMBR iterations -dir=exp/tri4a_dnn_tr05_multi_${train}_smbr_i1lats -acwt=0.1 - -# Decode (reuse HCLG graph) -if [ $stage -le 6 ]; then - for ITER in 1 2 3 4; do - steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --config conf/decode_dnn.config \ - --nnet $dir/${ITER}.nnet --acwt $acwt \ - exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k $data_fmllr/dt05_real_${enhan} $dir/decode_tgpr_5k_dt05_real_${enhan}_it${ITER} & - steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --config conf/decode_dnn.config \ - --nnet $dir/${ITER}.nnet --acwt $acwt \ - exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k $data_fmllr/dt05_simu_${enhan} $dir/decode_tgpr_5k_dt05_simu_${enhan}_it${ITER} & - if $eval_flag; then - steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --config conf/decode_dnn.config \ - --nnet $dir/${ITER}.nnet --acwt $acwt \ - exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k $data_fmllr/et05_real_${enhan} $dir/decode_tgpr_5k_et05_real_${enhan}_it${ITER} & - steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --config conf/decode_dnn.config \ - --nnet $dir/${ITER}.nnet --acwt $acwt \ - exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k $data_fmllr/et05_simu_${enhan} $dir/decode_tgpr_5k_et05_simu_${enhan}_it${ITER} & - fi - wait - done -fi - -# scoring -if [ $stage -le 7 ]; then - # decoded results of enhanced speech using 
sequence-training DNN - ./local/chime4_calc_wers_smbr.sh $dir ${enhan} exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k > $dir/best_wer_${enhan}.result - head -n 15 $dir/best_wer_${enhan}.result -fi - -echo "`basename $0` Done." diff --git a/egs/chime4/s5_1ch/local/run_gmm.sh b/egs/chime4/s5_1ch/local/run_gmm.sh index 2a3c8680f23..5178433dfc2 100755 --- a/egs/chime4/s5_1ch/local/run_gmm.sh +++ b/egs/chime4/s5_1ch/local/run_gmm.sh @@ -17,6 +17,8 @@ nj=30 stage=0 # resume training with --stage=N train=noisy # noisy data multi-condition training eval_flag=true # make it true when the evaluation data are released +add_enhanced_data=true # make it true when you want to add enhanced data into training set +decode_only=false # if true, it wouldn't train a model again and will only do decoding . utils/parse_options.sh || exit 1; @@ -49,6 +51,33 @@ if [ ! -d data/lang ]; then exit 1; fi +if $decode_only; then + # check data/loca/data + mdir=`pwd` + if [ ! -d $mdir/data/local/data ]; then + echo "error, set $mdir correctly" + exit 1; + elif [ ! -d data/local/data ]; then + echo "copy $mdir/data/local/data" + mkdir -p data/local + cp -r $mdir/data/local/data data/local/ + fi + # check gmm model + if [ ! -d $mdir/exp/tri3b_tr05_multi_${train} ]; then + echo "error, set $mdir correctly" + exit 1; + elif [ ! -d exp/tri3b_tr05_multi_${train} ]; then + echo "copy $mdir/exp/tri3b_tr05_multi_${train}" + mkdir -p exp + cp -r $mdir/exp/tri3b_tr05_multi_${train} exp/ + fi + # process for enhanced data + if [ ! -d data/dt05_real_$enhan ] || [ ! -d data/et05_real_$enhan ]; then + local/real_enhan_chime4_data_prep.sh $enhan $enhan_data + local/simu_enhan_chime4_data_prep.sh $enhan $enhan_data + fi + stage=6 +fi ####################### #### training ######### if [ $stage -le 1 ]; then @@ -63,27 +92,51 @@ if [ $stage -le 1 ]; then local/simu_enhan_chime4_data_prep.sh $enhan $enhan_data fi fi +# Copy enhanced data for 1ch and 2ch experiments +if [ $stage -le 2 ] && [[ "$PWD" != *s5_6ch* ]]; then + beamformed=0 + # First remove empty files generated from previous stage + for d in tr05_{real,simu}_$enhan; do + [ -d data/$d ] && rm -rf data/$d && \ + echo "remove empty directory $d" + done + if [[ "$enhan" == *beamformit_2mics* ]] && [ -d ../s5_6ch/data/tr05_real_beamformit_5mics ]; then + echo "copy tr05_{real,simu}_beamformit_5mics from ../s5_6ch/data/" + cp -r ../s5_6ch/data/tr05_real_beamformit_5mics data/tr05_real_beamformit_2mics + cp -r ../s5_6ch/data/tr05_simu_beamformit_5mics data/tr05_simu_beamformit_2mics + beamformed=1 + elif [ -d ../s5_6ch/data/tr05_real_$enhan ]; then + echo "copy enhanced training data ${d} from ../s5_6ch/data/" + cp -r ../s5_6ch/data/tr05_real_$enhan data/ + cp -r ../s5_6ch/data/tr05_simu_$enhan data/ + beamformed=1 + elif [[ "$enhan" == *isolated_1ch_track* ]]; then + beamformed=1 + fi + if [ $beamformed == 0 ]; then + echo "no such directory tr05_{real,simu}_{beamformit_5mics,blstm_gev,single_BLSTMmask}" + echo "They are generated by run_beamform_6ch_track.sh in ../s5_6ch/run.sh, please execute it first" && \ + exit 1; + fi +fi # Now make MFCC features for clean, close, and noisy data # mfccdir should be some place with a largish disk where you # want to store MFCC features. 
mfccdir=mfcc -if [ $stage -le 2 ]; then - if $eval_flag; then - tasks="tr05_real_${train} dt05_real_${train} tr05_simu_${train} dt05_simu_${train} et05_real_${train} et05_simu_${train} tr05_real_$enhan tr05_simu_$enhan" +if [ $stage -le 3 ]; then + if $add_enhanced_data; then + if $eval_flag; then + tasks="tr05_real_${train} dt05_real_${train} tr05_simu_${train} dt05_simu_${train} et05_real_${train} et05_simu_${train} tr05_real_$enhan tr05_simu_$enhan" + else + tasks="tr05_real_${train} dt05_real_${train} tr05_simu_${train} dt05_simu_${train} tr05_real_$enhan tr05_simu_$enhan" + fi else - tasks="tr05_real_${train} dt05_real_${train} tr05_simu_${train} dt05_simu_${train} tr05_real_$enhan tr05_simu_$enhan" - fi - if [ "$enhan" == "beamformit_2mics" ]; then - for d in ../s5_6ch/data/tr05_{real,simu}_beamformit_5mics; do - [ ! -d $d ] && echo "no such directory $d" && \ - echo "It is generated by run_beamform_6ch_track.sh within ../s5_6ch/run.sh, execute it first" && \ - exit 1; - done - echo "copy enhanced training data from ../s5_6ch/data/" - rm -rf data/tr05_{real,simu}_beamformit_2mics - cp -r ../s5_6ch/data/tr05_real_beamformit_5mics data/tr05_real_beamformit_2mics - cp -r ../s5_6ch/data/tr05_simu_beamformit_5mics data/tr05_simu_beamformit_2mics + if $eval_flag; then + tasks="tr05_real_${train} dt05_real_${train} tr05_simu_${train} dt05_simu_${train} et05_real_${train} et05_simu_${train}" + else + tasks="tr05_real_${train} dt05_real_${train} tr05_simu_${train} dt05_simu_${train}" + fi fi for x in $tasks; do steps/make_mfcc.sh --nj 8 --cmd "$train_cmd" \ @@ -95,17 +148,20 @@ fi # make mixed training set from real and simulation training data # multi = simu + real # Note that we are combining enhanced training data with noisy training data -if [ $stage -le 3 ]; then - utils/combine_data.sh data/tr05_multi_${train} data/tr05_simu_${train} data/tr05_real_${train} data/tr05_simu_$enhan data/tr05_real_$enhan - #utils/combine_data.sh data/tr05_multi_${train} data/tr05_simu_${train} data/tr05_real_${train} +if [ $stage -le 4 ]; then + if $add_enhanced_data; then + utils/combine_data.sh data/tr05_multi_${train} data/tr05_simu_${train} data/tr05_real_${train} data/tr05_simu_$enhan data/tr05_real_$enhan + else + utils/combine_data.sh data/tr05_multi_${train} data/tr05_simu_${train} data/tr05_real_${train} + fi utils/combine_data.sh data/dt05_multi_${train} data/dt05_simu_${train} data/dt05_real_${train} if $eval_flag; then - utils/combine_data.sh data/et05_multi_${train} data/et05_simu_${train} data/et05_real_${train} + utils/combine_data.sh data/et05_multi_${train} data/et05_simu_${train} data/et05_real_${train} fi fi # training models for noisy data -if [ $stage -le 4 ]; then +if [ $stage -le 5 ]; then nspk=`wc -l data/tr05_multi_${train}/spk2utt | awk '{print $1}'` if [ $nj -gt $nspk ]; then nj2=$nspk diff --git a/egs/chime4/s5_1ch/local/run_gmm_recog.sh b/egs/chime4/s5_1ch/local/run_gmm_recog.sh deleted file mode 100755 index 5f7f47b39d7..00000000000 --- a/egs/chime4/s5_1ch/local/run_gmm_recog.sh +++ /dev/null @@ -1,127 +0,0 @@ -#!/bin/bash - -# Copyright 2016 University of Sheffield (Jon Barker, Ricard Marxer) -# Inria (Emmanuel Vincent) -# Mitsubishi Electric Research Labs (Shinji Watanabe) -# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - -# This script is made from the kaldi recipe of the 2nd CHiME Challenge Track 2 -# made by Chao Weng - -. ./path.sh -. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. - ## This relates to the queue. 
- -# Config: -nj=30 -stage=0 # resume training with --stage=N -train=noisy -eval_flag=true # make it true when the evaluation data are released - -. utils/parse_options.sh || exit 1; - -# This is a shell script, but it's recommended that you run the commands one by -# one by copying and pasting into the shell. - -if [ $# -ne 3 ]; then - printf "\nUSAGE: %s \n\n" `basename $0` - echo "First argument specifies a unique name for different enhancement method" - echo "Second argument specifies the directory of enhanced wav files" - echo "Third argument specifies acoustic and language model directory" - exit 1; -fi - -# set enhanced data -enhan=$1 -enhan_data=$2 -# set model directory -mdir=$3 - -# Set bash to 'debug' mode, it will exit on : -# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', -set -e -set -u -set -o pipefail - -# check data/loca/data -if [ ! -d $mdir/data/local/data ]; then - echo "error, set $mdir correctly" - exit 1; -elif [ ! -d data/local/data ]; then - echo "copy $mdir/data/local/data" - mkdir -p data/local - cp -r $mdir/data/local/data data/local/ -fi - -# check gmm model -if [ ! -d $mdir/exp/tri3b_tr05_multi_${train} ]; then - echo "error, set $mdir correctly" - exit 1; -elif [ ! -d exp/tri3b_tr05_multi_${train} ]; then - echo "copy $mdir/exp/tri3b_tr05_multi_${train}" - mkdir -p exp - cp -r $mdir/exp/tri3b_tr05_multi_${train} exp/ -fi - -# process for enhanced data -if [ $stage -le 0 ]; then - if [ ! -d data/dt05_real_$enhan ] || [ ! -d data/et05_real_$enhan ]; then - local/real_enhan_chime4_data_prep.sh $enhan $enhan_data - local/simu_enhan_chime4_data_prep.sh $enhan $enhan_data - fi -fi - -# Now make MFCC features for enhanced data -# mfccdir should be some place with a largish disk where you -# want to store MFCC features. -mfccdir=mfcc/$enhan -if [ $stage -le 1 ]; then - if $eval_flag; then - tasks="dt05_real_$enhan dt05_simu_$enhan et05_real_$enhan et05_simu_$enhan" - else - tasks="dt05_real_$enhan dt05_simu_$enhan" - fi - for x in $tasks; do - if [ ! -e data/$x/feats.scp ]; then - steps/make_mfcc.sh --nj 8 --cmd "$train_cmd" \ - data/$x exp/make_mfcc/$x $mfccdir - steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir - fi - done -fi - -# make mixed training set from real and simulation enhanced data -# multi = simu + real -if [ $stage -le 2 ]; then - if [ ! -d data/dt05_multi_$enhan ] || [ ! 
-d data/et05_multi_$enhan ]; then - utils/combine_data.sh data/dt05_multi_$enhan data/dt05_simu_$enhan data/dt05_real_$enhan - if $eval_flag; then - utils/combine_data.sh data/et05_multi_$enhan data/et05_simu_$enhan data/et05_real_$enhan - fi - fi -fi - -# decode enhanced speech using AMs trained with enhanced data -if [ $stage -le 3 ]; then - steps/decode_fmllr.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" \ - exp/tri3b_tr05_multi_${train}/graph_tgpr_5k data/dt05_real_$enhan exp/tri3b_tr05_multi_${train}/decode_tgpr_5k_dt05_real_$enhan & - steps/decode_fmllr.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" \ - exp/tri3b_tr05_multi_${train}/graph_tgpr_5k data/dt05_simu_$enhan exp/tri3b_tr05_multi_${train}/decode_tgpr_5k_dt05_simu_$enhan & - if $eval_flag; then - steps/decode_fmllr.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" \ - exp/tri3b_tr05_multi_${train}/graph_tgpr_5k data/et05_real_$enhan exp/tri3b_tr05_multi_${train}/decode_tgpr_5k_et05_real_$enhan & - steps/decode_fmllr.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" \ - exp/tri3b_tr05_multi_${train}/graph_tgpr_5k data/et05_simu_$enhan exp/tri3b_tr05_multi_${train}/decode_tgpr_5k_et05_simu_$enhan & - fi - wait; -fi - -# scoring -if [ $stage -le 4 ]; then - # decoded results of enhanced speech using AMs trained with enhanced data - local/chime4_calc_wers.sh exp/tri3b_tr05_multi_${train} $enhan exp/tri3b_tr05_multi_${train}/graph_tgpr_5k \ - > exp/tri3b_tr05_multi_${train}/best_wer_$enhan.result - head -n 15 exp/tri3b_tr05_multi_${train}/best_wer_$enhan.result -fi - -echo "`basename $0` Done." diff --git a/egs/chime4/s5_1ch/local/run_lmrescore_recog.sh b/egs/chime4/s5_1ch/local/run_lmrescore_recog.sh deleted file mode 100755 index 8b57585fda0..00000000000 --- a/egs/chime4/s5_1ch/local/run_lmrescore_recog.sh +++ /dev/null @@ -1,121 +0,0 @@ -#!/bin/bash - -# Copyright 2015 University of Sheffield (Jon Barker, Ricard Marxer) -# Inria (Emmanuel Vincent) -# Mitsubishi Electric Research Labs (Shinji Watanabe) -# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - -# Copyright 2015, Mitsubishi Electric Research Laboratories, MERL (Author: Takaaki Hori) - -nj=12 -stage=1 -order=5 -hidden=300 -rnnweight=0.5 -nbest=100 -train=noisy -eval_flag=true # make it true when the evaluation data are released - -. utils/parse_options.sh || exit 1; - -. ./path.sh -. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. - ## This relates to the queue. - -# This is a shell script, but it's recommended that you run the commands one by -# one by copying and pasting into the shell. - -if [ $# -ne 2 ]; then - printf "\nUSAGE: %s \n\n" `basename $0` - echo "First argument specifies a unique name for different enhancement method" - echo "Second argument specifies acoustic and language model directory" - exit 1; -fi - -# set language models -lm_suffix=${order}gkn_5k -rnnlm_suffix=rnnlm_5k_h${hidden} - -# enhan data -enhan=$1 -# set model directory -mdir=$2 -srcdir=exp/tri4a_dnn_tr05_multi_${train}_smbr_i1lats - -# check language models -if [ ! -d $mdir/data/lang ]; then - echo "error, set $mdir correctly" - exit 1; -fi - -# preparation -dir=exp/tri4a_dnn_tr05_multi_${train}_smbr_lmrescore -mkdir -p $dir -# make a symbolic link to graph info -if [ ! -e $dir/graph_tgpr_5k ]; then - if [ ! -e exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k ]; then - echo "graph is missing, execute local/run_dnn.sh, correctly" - exit 1; - fi - pushd . ; cd $dir - ln -s ../tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k . 
- popd -fi - -# rescore lattices by a high-order N-gram -if [ $stage -le 3 ]; then - # check the best iteration - if [ ! -f $srcdir/log/best_wer_$enhan ]; then - echo "$0: error $srcdir/log/best_wer_$enhan not found. execute local/run_dnn.sh, first" - exit 1; - fi - it=`cut -f 1 -d" " $srcdir/log/best_wer_$enhan | awk -F'[_]' '{print $1}'` - # rescore lattices - if $eval_flag; then - tasks="dt05_simu dt05_real et05_simu et05_real" - else - tasks="dt05_simu dt05_real" - fi - for t in $tasks; do - steps/lmrescore.sh --mode 3 \ - $mdir/data/lang_test_tgpr_5k \ - $mdir/data/lang_test_${lm_suffix} \ - data-fmllr-tri3b/${t}_$enhan \ - $srcdir/decode_tgpr_5k_${t}_${enhan}_it$it \ - $dir/decode_tgpr_5k_${t}_${enhan}_${lm_suffix} - done - # rescored results by high-order n-gram LM - mkdir -p $dir/log - local/chime4_calc_wers.sh $dir ${enhan}_${lm_suffix} $dir/graph_tgpr_5k \ - > $dir/best_wer_${enhan}_${lm_suffix}.result - head -n 15 $dir/best_wer_${enhan}_${lm_suffix}.result -fi - -# N-best rescoring using a RNNLM -if [ $stage -le 4 ]; then - # check the best lmw - if [ ! -f $dir/log/best_wer_${enhan}_${lm_suffix} ]; then - echo "error, rescoring with a high-order n-gram seems to be failed" - exit 1; - fi - lmw=`cut -f 1 -d" " $dir/log/best_wer_${enhan}_${lm_suffix} | awk -F'[_]' '{print $NF}'` - # rescore n-best list for all sets - if $eval_flag; then - tasks="dt05_simu dt05_real et05_simu et05_real" - else - tasks="dt05_simu dt05_real" - fi - for t in $tasks; do - steps/rnnlmrescore.sh --inv-acwt $lmw --N $nbest --use-phi true \ - $rnnweight \ - $mdir/data/lang_test_${lm_suffix} \ - $mdir/data/lang_test_${rnnlm_suffix} \ - data-fmllr-tri3b/${t}_$enhan \ - $dir/decode_tgpr_5k_${t}_${enhan}_${lm_suffix} \ - $dir/decode_tgpr_5k_${t}_${enhan}_${rnnlm_suffix}_w${rnnweight}_n${nbest} - done - # calc wers for RNNLM results - local/chime4_calc_wers.sh $dir ${enhan}_${rnnlm_suffix}_w${rnnweight}_n${nbest} $dir/graph_tgpr_5k \ - > $dir/best_wer_${enhan}_${rnnlm_suffix}_w${rnnweight}_n${nbest}.result - head -n 15 $dir/best_wer_${enhan}_${rnnlm_suffix}_w${rnnweight}_n${nbest}.result -fi diff --git a/egs/chime4/s5_1ch/local/run_lmrescore_tdnn.sh b/egs/chime4/s5_1ch/local/run_lmrescore_tdnn.sh index 67572f0dd4c..58af793615e 100755 --- a/egs/chime4/s5_1ch/local/run_lmrescore_tdnn.sh +++ b/egs/chime4/s5_1ch/local/run_lmrescore_tdnn.sh @@ -98,7 +98,7 @@ if [ $stage -le 3 ]; then steps/lmrescore.sh --mode 3 \ data/lang_test_tgpr_5k \ data/lang_test_${lm_suffix} \ - data/${t}_${enhan}_hires \ + data/${t}_${enhan}_chunked \ $srcdir/decode_tgpr_5k_${t}_${enhan} \ $dir/decode_tgpr_5k_${t}_${enhan}_${lm_suffix} done @@ -128,7 +128,7 @@ if [ $stage -le 4 ]; then $rnnweight \ data/lang_test_${lm_suffix} \ data/lang_test_${rnnlm_suffix} \ - data/${t}_${enhan}_hires \ + data/${t}_${enhan}_chunked \ $dir/decode_tgpr_5k_${t}_${enhan}_${lm_suffix} \ $dir/decode_tgpr_5k_${t}_${enhan}_${rnnlm_suffix}_w${rnnweight}_n${nbest} done diff --git a/egs/chime4/s5_1ch/local/run_lmrescore_tdnn_lstm.sh b/egs/chime4/s5_1ch/local/run_lmrescore_tdnn_lstm.sh index 7173dcea78b..0bea4dd7102 100755 --- a/egs/chime4/s5_1ch/local/run_lmrescore_tdnn_lstm.sh +++ b/egs/chime4/s5_1ch/local/run_lmrescore_tdnn_lstm.sh @@ -165,4 +165,4 @@ if [ $stage -le 4 ]; then local/chime4_calc_wers_looped.sh $dir ${enhan}_${rnnlm_suffix}_w${rnnweight}_n${nbest} $dir/graph_tgpr_5k \ > $dir/best_wer_looped_${enhan}_${rnnlm_suffix}_w${rnnweight}_n${nbest}.result head -n 15 $dir/best_wer_looped_${enhan}_${rnnlm_suffix}_w${rnnweight}_n${nbest}.result -fi \ No newline at 
end of file +fi diff --git a/egs/chime4/s5_1ch/local/run_lmrescore_tdnn_lstm_recog.sh b/egs/chime4/s5_1ch/local/run_lmrescore_tdnn_lstm_recog.sh deleted file mode 100755 index c4b4e238011..00000000000 --- a/egs/chime4/s5_1ch/local/run_lmrescore_tdnn_lstm_recog.sh +++ /dev/null @@ -1,153 +0,0 @@ -#!/bin/bash - -# Copyright 2015 University of Sheffield (Jon Barker, Ricard Marxer) -# Inria (Emmanuel Vincent) -# Mitsubishi Electric Research Labs (Shinji Watanabe) -# 2017 JHU CLSP (Szu-Jui Chen) -# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - -# Copyright 2015, Mitsubishi Electric Research Laboratories, MERL (Author: Takaaki Hori) - -nj=12 -stage=1 -order=5 -hidden=300 -rnnweight=0.5 -nbest=100 -train=noisy -eval_flag=true # make it true when the evaluation data are released - -. utils/parse_options.sh || exit 1; - -. ./path.sh -. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. - ## This relates to the queue. - -# This is a shell script, but it's recommended that you run the commands one by -# one by copying and pasting into the shell. - -if [ $# -ne 2 ]; then - printf "\nUSAGE: %s \n\n" `basename $0` - echo "First argument specifies a unique name for different enhancement method" - echo "Second argument specifies acoustic and language model directory" - exit 1; -fi - -# set language models -# You might need to change affix to the affix of your best tdnn model. -affix=1a -lm_suffix=${order}gkn_5k -rnnlm_suffix=rnnlm_5k_h${hidden} - -# enhan data -enhan=$1 -# set model directory -mdir=$2 -srcdir=exp/chain/tdnn_lstm${affix}_sp - -# check language models -if [ ! -d $mdir/data/lang ]; then - echo "error, set $mdir correctly" - exit 1; -fi - -# preparation -dir=exp/chain/tdnn_lstm${affix}_sp_smbr_lmrescore -mkdir -p $dir -# make a symbolic link to graph info -if [ ! -e $dir/graph_tgpr_5k ]; then - if [ ! -e exp/chain/tree_a_sp/graph_tgpr_5k ]; then - echo "graph is missing, execute local/run_tdnn.sh, correctly" - exit 1; - fi - pushd . ; cd $dir - ln -s ../tree_a_sp/graph_tgpr_5k . - popd -fi - -# rescore lattices by a high-order N-gram -if [ $stage -le 3 ]; then - # check the best iteration - if [ ! -f $srcdir/log/best_wer_$enhan ]; then - echo "$0: error $srcdir/log/best_wer_$enhan not found. 
execute local/run_tdnn_lstm.sh, first" - exit 1; - fi - it=`cut -f 1 -d" " $srcdir/log/best_wer_$enhan | awk -F'[_]' '{print $1}'` - # rescore lattices - if $eval_flag; then - tasks="dt05_simu dt05_real et05_simu et05_real" - else - tasks="dt05_simu dt05_real" - fi - for t in $tasks; do - steps/lmrescore.sh --mode 3 \ - $mdir/data/lang_test_tgpr_5k \ - $mdir/data/lang_test_${lm_suffix} \ - data/${t}_${enhan}_hires \ - $srcdir/decode_tgpr_5k_${t}_${enhan} \ - $dir/decode_tgpr_5k_${t}_${enhan}_${lm_suffix} - done - # rescored results by high-order n-gram LM - mkdir -p $dir/log - local/chime4_calc_wers.sh $dir ${enhan}_${lm_suffix} $dir/graph_tgpr_5k \ - > $dir/best_wer_${enhan}_${lm_suffix}.result - head -n 15 $dir/best_wer_${enhan}_${lm_suffix}.result - - # now rescore lattices after looped decoding - for t in $tasks; do - steps/lmrescore.sh --mode 3 \ - data/lang_test_tgpr_5k \ - data/lang_test_${lm_suffix} \ - data/${t}_${enhan}_hires \ - $srcdir/decode_looped_tgpr_5k_${t}_${enhan} \ - $dir/decode_looped_tgpr_5k_${t}_${enhan}_${lm_suffix} - done - # rescored results by high-order n-gram LM - local/chime4_calc_wers_looped.sh $dir ${enhan}_${lm_suffix} $dir/graph_tgpr_5k \ - > $dir/best_wer_looped_${enhan}_${lm_suffix}.result - head -n 15 $dir/best_wer_looped_${enhan}_${lm_suffix}.result -fi - -# N-best rescoring using a RNNLM -if [ $stage -le 4 ]; then - # check the best lmw - if [ ! -f $dir/log/best_wer_${enhan}_${lm_suffix} ]; then - echo "error, rescoring with a high-order n-gram seems to be failed" - exit 1; - fi - lmw=`cut -f 1 -d" " $dir/log/best_wer_${enhan}_${lm_suffix} | awk -F'[_]' '{print $NF}'` - # rescore n-best list for all sets - if $eval_flag; then - tasks="dt05_simu dt05_real et05_simu et05_real" - else - tasks="dt05_simu dt05_real" - fi - for t in $tasks; do - steps/rnnlmrescore.sh --inv-acwt $lmw --N $nbest --use-phi true \ - $rnnweight \ - $mdir/data/lang_test_${lm_suffix} \ - $mdir/data/lang_test_${rnnlm_suffix} \ - data/${t}_${enhan}_hires \ - $dir/decode_tgpr_5k_${t}_${enhan}_${lm_suffix} \ - $dir/decode_tgpr_5k_${t}_${enhan}_${rnnlm_suffix}_w${rnnweight}_n${nbest} - done - # calc wers for RNNLM results - local/chime4_calc_wers.sh $dir ${enhan}_${rnnlm_suffix}_w${rnnweight}_n${nbest} $dir/graph_tgpr_5k \ - > $dir/best_wer_${enhan}_${rnnlm_suffix}_w${rnnweight}_n${nbest}.result - head -n 15 $dir/best_wer_${enhan}_${rnnlm_suffix}_w${rnnweight}_n${nbest}.result - - # now rescore lattices after looped decoding - for t in $tasks; do - steps/rnnlmrescore.sh --inv-acwt $lmw --N $nbest --use-phi true \ - $rnnweight \ - data/lang_test_${lm_suffix} \ - data/lang_test_${rnnlm_suffix} \ - data/${t}_${enhan}_hires \ - $dir/decode_looped_tgpr_5k_${t}_${enhan}_${lm_suffix} \ - $dir/decode_looped_tgpr_5k_${t}_${enhan}_${rnnlm_suffix}_w${rnnweight}_n${nbest} - done - # calc wers for RNNLM results - local/chime4_calc_wers_looped.sh $dir ${enhan}_${rnnlm_suffix}_w${rnnweight}_n${nbest} $dir/graph_tgpr_5k \ - > $dir/best_wer_looped_${enhan}_${rnnlm_suffix}_w${rnnweight}_n${nbest}.result - head -n 15 $dir/best_wer_looped_${enhan}_${rnnlm_suffix}_w${rnnweight}_n${nbest}.result -fi diff --git a/egs/chime4/s5_1ch/local/run_lmrescore_tdnn_recog.sh b/egs/chime4/s5_1ch/local/run_lmrescore_tdnn_recog.sh deleted file mode 100755 index 4508ddeb9f4..00000000000 --- a/egs/chime4/s5_1ch/local/run_lmrescore_tdnn_recog.sh +++ /dev/null @@ -1,124 +0,0 @@ -#!/bin/bash - -# Copyright 2015 University of Sheffield (Jon Barker, Ricard Marxer) -# Inria (Emmanuel Vincent) -# Mitsubishi Electric Research Labs 
(Shinji Watanabe) -# 2017 JHU CLSP (Szu-Jui Chen) -# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - -# Copyright 2015, Mitsubishi Electric Research Laboratories, MERL (Author: Takaaki Hori) - -nj=12 -stage=1 -order=5 -hidden=300 -rnnweight=0.5 -nbest=100 -train=noisy -eval_flag=true # make it true when the evaluation data are released - -. utils/parse_options.sh || exit 1; - -. ./path.sh -. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. - ## This relates to the queue. - -# This is a shell script, but it's recommended that you run the commands one by -# one by copying and pasting into the shell. - -if [ $# -ne 2 ]; then - printf "\nUSAGE: %s \n\n" `basename $0` - echo "First argument specifies a unique name for different enhancement method" - echo "Second argument specifies acoustic and language model directory" - exit 1; -fi - -# set language models -# You might need to change affix to the affix of your best tdnn model. -affix=1a -lm_suffix=${order}gkn_5k -rnnlm_suffix=rnnlm_5k_h${hidden} - -# enhan data -enhan=$1 -# set model directory -mdir=$2 -srcdir=exp/chain/tdnn${affix}_sp - -# check language models -if [ ! -d $mdir/data/lang ]; then - echo "error, set $mdir correctly" - exit 1; -fi - -# preparation -dir=exp/chain/tdnn${affix}_sp_smbr_lmrescore -mkdir -p $dir -# make a symbolic link to graph info -if [ ! -e $dir/graph_tgpr_5k ]; then - if [ ! -e exp/chain/tree_a_sp/graph_tgpr_5k ]; then - echo "graph is missing, execute local/run_tdnn.sh, correctly" - exit 1; - fi - pushd . ; cd $dir - ln -s ../tree_a_sp/graph_tgpr_5k . - popd -fi - -# rescore lattices by a high-order N-gram -if [ $stage -le 3 ]; then - # check the best iteration - if [ ! -f $srcdir/log/best_wer_$enhan ]; then - echo "$0: error $srcdir/log/best_wer_$enhan not found. execute local/run_tdnn.sh, first" - exit 1; - fi - it=`cut -f 1 -d" " $srcdir/log/best_wer_$enhan | awk -F'[_]' '{print $1}'` - # rescore lattices - if $eval_flag; then - tasks="dt05_simu dt05_real et05_simu et05_real" - else - tasks="dt05_simu dt05_real" - fi - for t in $tasks; do - steps/lmrescore.sh --mode 3 \ - $mdir/data/lang_test_tgpr_5k \ - $mdir/data/lang_test_${lm_suffix} \ - data/${t}_${enhan}_hires \ - $srcdir/decode_tgpr_5k_${t}_${enhan} \ - $dir/decode_tgpr_5k_${t}_${enhan}_${lm_suffix} - done - # rescored results by high-order n-gram LM - mkdir -p $dir/log - local/chime4_calc_wers.sh $dir ${enhan}_${lm_suffix} $dir/graph_tgpr_5k \ - > $dir/best_wer_${enhan}_${lm_suffix}.result - head -n 15 $dir/best_wer_${enhan}_${lm_suffix}.result -fi - -# N-best rescoring using a RNNLM -if [ $stage -le 4 ]; then - # check the best lmw - if [ ! 
-f $dir/log/best_wer_${enhan}_${lm_suffix} ]; then - echo "error, rescoring with a high-order n-gram seems to be failed" - exit 1; - fi - lmw=`cut -f 1 -d" " $dir/log/best_wer_${enhan}_${lm_suffix} | awk -F'[_]' '{print $NF}'` - # rescore n-best list for all sets - if $eval_flag; then - tasks="dt05_simu dt05_real et05_simu et05_real" - else - tasks="dt05_simu dt05_real" - fi - for t in $tasks; do - steps/rnnlmrescore.sh --inv-acwt $lmw --N $nbest --use-phi true \ - $rnnweight \ - $mdir/data/lang_test_${lm_suffix} \ - $mdir/data/lang_test_${rnnlm_suffix} \ - data/${t}_${enhan}_hires \ - $dir/decode_tgpr_5k_${t}_${enhan}_${lm_suffix} \ - $dir/decode_tgpr_5k_${t}_${enhan}_${rnnlm_suffix}_w${rnnweight}_n${nbest} - done - # calc wers for RNNLM results - local/chime4_calc_wers.sh $dir ${enhan}_${rnnlm_suffix}_w${rnnweight}_n${nbest} $dir/graph_tgpr_5k \ - > $dir/best_wer_${enhan}_${rnnlm_suffix}_w${rnnweight}_n${nbest}.result - head -n 15 $dir/best_wer_${enhan}_${rnnlm_suffix}_w${rnnweight}_n${nbest}.result -fi diff --git a/egs/chime4/s5_1ch/local/run_nn-gev.sh b/egs/chime4/s5_1ch/local/run_nn-gev.sh new file mode 100755 index 00000000000..a17dd3d3f15 --- /dev/null +++ b/egs/chime4/s5_1ch/local/run_nn-gev.sh @@ -0,0 +1,31 @@ +#!/bin/bash +# Copyright 2017 Johns Hopkins University (Author: Aswin Shanmugam Subramanian) +# Apache 2.0 + +. ./cmd.sh +. ./path.sh + +if [ $# != 4 ]; then + echo "Wrong #arguments ($#, expected 4)" + echo "Usage: local/run_nn-gev.sh " + exit 1; +fi + +sdir=$1 +odir=$2 +enhancement_type=$3 +track=$4 + +gpu_id=1 +case $(hostname -f) in + *.clsp.jhu.edu) gpu_id=`free-gpu` ;; # JHU, +esac + +if [ ! -f local/nn-gev/data/BLSTM_model/mlp.tr ]; then + echo "training a BLSTM mask network" + $HOME/miniconda3/bin/python local/nn-gev/train.py --chime_dir=$sdir/data --gpu $gpu_id local/nn-gev/data BLSTM +else + echo "Not training a BLSTM mask network. Using existing model in local/nn-gev/data/BLSTM_model/" +fi +echo "enhancing signals with mask-based GEV beamformer" +local/nn-gev/beamform.sh $sdir/data local/nn-gev/data $odir local/nn-gev/data/BLSTM_model/best.nnet BLSTM --gpu $gpu_id --single $enhancement_type --track $track diff --git a/egs/chime4/s5_1ch/local/simu_noisy_chime4_data_prep.sh b/egs/chime4/s5_1ch/local/simu_noisy_chime4_data_prep.sh index 03e355a82ec..124cde82b8a 100755 --- a/egs/chime4/s5_1ch/local/simu_noisy_chime4_data_prep.sh +++ b/egs/chime4/s5_1ch/local/simu_noisy_chime4_data_prep.sh @@ -69,8 +69,12 @@ fi # make a scp file from file list for x in $list_set; do - cat $x.flist | awk -F'[/]' '{print $NF}'| sed -e 's/\.wav/_SIMU/' > ${x}_wav.ids - paste -d" " ${x}_wav.ids $x.flist | sort -k 1 > ${x}_wav.scp + cat $x.flist | awk -F'[/]' '{print $NF}'| sed -e 's/\.wav/_SIMU/' > ${x}_wav.id.temp + cat ${x}_wav.id.temp | awk -F'_' '{print $3}' | awk -F'.' '{print $2}' > $x.ch + cat ${x}_wav.id.temp | awk -F'_' '{print $1}' > $x.part1 + cat ${x}_wav.id.temp | sed -e 's/^..._//' > $x.part2 + paste -d"_" $x.part1 $x.ch $x.part2 > ${x}_wav.ids + paste -d" " ${x}_wav.ids $x.flist | sort -t_ -k1,1 -k3 > ${x}_wav.scp.temp done # make a transcription from dot @@ -80,10 +84,10 @@ if [ !
-e dot_files.flist ]; then echo "Could not find $dir/dot_files.flist files, first run local/clean_wsj0_data_prep.sh"; exit 1; fi -cat tr05_simu_noisy_wav.scp | awk -F'[_]' '{print $2}' | tr '[A-Z]' '[a-z]' \ +cat tr05_simu_noisy_wav.scp.temp | awk -F'[_]' '{print $3}' | tr '[A-Z]' '[a-z]' \ | $local/find_noisy_transcripts.pl dot_files.flist | cut -f 2- -d" " > tr05_simu_noisy.txt -cat tr05_simu_noisy_wav.scp | cut -f 1 -d" " > tr05_simu_noisy.ids -paste -d" " tr05_simu_noisy.ids tr05_simu_noisy.txt | sort -k 1 > tr05_simu_noisy.trans1 +cat tr05_simu_noisy_wav.scp.temp | cut -f 1 -d" " > tr05_simu_noisy.ids +paste -d" " tr05_simu_noisy.ids tr05_simu_noisy.txt | sort -t_ -k1,1 -k3 > tr05_simu_noisy.trans1 # dt05 and et05 simulation data are generated from the CHiME4 booth recording # and we use CHiME4 dot files cat dt05_simu.dot | sed -e 's/(\(.*\))/\1/' | awk '{print $NF ".CH1_SIMU"}'> dt05_simu_noisy.ids @@ -104,13 +108,17 @@ fi # data-preparation stage independent of the specific lexicon used. noiseword=""; for x in $list_set;do + cat ${x}_wav.scp.temp | awk '{print $1}' > $x.txt.part1 + cat $x.trans1 | awk '{$1=""; print $0}' | sed 's/^[ \t]*//g' > $x.txt.part2 + paste -d" " $x.txt.part1 $x.txt.part2 > $x.trans1 cat $x.trans1 | $local/normalize_transcript.pl $noiseword \ | sort > $x.txt || exit 1; done # Make the utt2spk and spk2utt files. for x in $list_set; do - cat ${x}_wav.scp | awk -F'_' '{print $1}' > $x.spk + sort ${x}_wav.scp.temp > ${x}_wav.scp + cat ${x}_wav.scp | awk -F'_' '{print $1"_"$2}' > $x.spk cat ${x}_wav.scp | awk '{print $1}' > $x.utt paste -d" " $x.utt $x.spk > $x.utt2spk cat $x.utt2spk | $utils/utt2spk_to_spk2utt.pl > $x.spk2utt || exit 1; @@ -125,4 +133,8 @@ for x in $list_set; do cp ${x}.utt2spk ../../$x/utt2spk || exit 1; done +# clean up temp files +rm *.temp +rm *.part{1,2} + echo "Data preparation succeeded" diff --git a/egs/chime4/s5_1ch/local/stoi_estoi_sdr.m b/egs/chime4/s5_1ch/local/stoi_estoi_sdr.m new file mode 100644 index 00000000000..45047fe1884 --- /dev/null +++ b/egs/chime4/s5_1ch/local/stoi_estoi_sdr.m @@ -0,0 +1,62 @@ +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% Copyright 2017 Johns Hopkins University (Author: Aswin Shanmugam Subramanian) +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +function stoi_estoi_sdr(nj,enhancement_method,destination_directory,set) + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% "stoi_estoi_sdr" : this function computes the average STOI, eSTOI and SDR +% scores by calling downloaded third party matlab functions +% +% Input: +% nj: number of jobs +% enhancement_method: the name of the enhacement method +% destination_directory: the directory where the results have to be stored, +% the list of the enhaced and reference files are +% stored here before calling this function +% set: name of the set to be evaluated ('et05' or 'dt05') +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +original_file_list=strcat(destination_directory,'/original_list'); +enhanced_file_list=strcat(destination_directory,'/enhanced_list'); +files1=textread(original_file_list,'%s'); +files2=textread(enhanced_file_list,'%s'); +d_stoi=zeros(1,length(files2)); +d_estoi=zeros(1,length(files2)); +SDR=zeros(1,length(files2)); +p = parpool('local', nj); +parfor i=1:length(files2) + [x, fs] = audioread(files1{i}); + [y, fs] = audioread(files2{i}); + m=length(x); + n=length(y); + d=abs(m-n); + if m>n + y=[y; 
zeros(d,1)]; + end + if n>m + x=[x; zeros(d,1)]; + end + + d_stoi(i)=stoi(x,y,fs); + d_estoi(i)=estoi(x,y,fs); + [SDR(i),SIR,SAR,perm]=bss_eval_sources(y',x'); +end +SDR_avg=mean(SDR); +STOI_avg=mean(d_stoi); +ESTOI_avg=mean(d_estoi); +SDRFile=strcat(destination_directory,'/',enhancement_method,'_',set,'_SDR'); +stoiFile=strcat(destination_directory,'/',enhancement_method,'_',set,'_STOI'); +estoiFile=strcat(destination_directory,'/',enhancement_method,'_',set,'_eSTOI'); +fileID = fopen(SDRFile,'w'); +fprintf(fileID,'%f\n',SDR_avg); +fclose(fileID); +fileID = fopen(stoiFile,'w'); +fprintf(fileID,'%f\n',STOI_avg); +fclose(fileID); +fileID = fopen(estoiFile,'w'); +fprintf(fileID,'%f\n',ESTOI_avg); +fclose(fileID); +ResultMATFile=strcat(destination_directory,'/',enhancement_method,'_',set,'_stoi_estoi_sdr.mat'); +save(ResultMATFile,'SDR','d_stoi','d_estoi'); +end diff --git a/egs/chime4/s5_1ch/local/write_se_results.sh b/egs/chime4/s5_1ch/local/write_se_results.sh new file mode 100755 index 00000000000..7ada63f8ccc --- /dev/null +++ b/egs/chime4/s5_1ch/local/write_se_results.sh @@ -0,0 +1,22 @@ +#!/bin/bash +# Copyright 2017 Johns Hopkins University (Author: Aswin Shanmugam Subramanian) +# Apache 2.0 + +. ./cmd.sh +. ./path.sh + +# Config: + +if [ $# != 1 ]; then + echo "Wrong #arguments ($#, expected 1)" + echo "Usage: local/write_se_results.sh " + exit 1; +fi + +enhancement=$1 + +echo -e "PESQ ($enhancement) \t dt05_simu=$(cat exp/compute_pesq_$enhancement/pesq_dt05) \t et05_simu=$(cat exp/compute_pesq_$enhancement/pesq_et05)" +echo -e "STOI ($enhancement) \t dt05_simu=$(cat exp/compute_stoi_estoi_sdr_$enhancement/${enhancement}_dt05_STOI) \t et05_simu=$(cat exp/compute_stoi_estoi_sdr_$enhancement/${enhancement}_et05_STOI)" +echo -e "eSTOI ($enhancement) \t dt05_simu=$(cat exp/compute_stoi_estoi_sdr_$enhancement/${enhancement}_dt05_eSTOI) \t et05_simu=$(cat exp/compute_stoi_estoi_sdr_$enhancement/${enhancement}_et05_eSTOI)" +echo -e "SDR ($enhancement) \t dt05_simu=$(cat exp/compute_stoi_estoi_sdr_$enhancement/${enhancement}_dt05_SDR) \t et05_simu=$(cat exp/compute_stoi_estoi_sdr_$enhancement/${enhancement}_et05_SDR)" +echo "" diff --git a/egs/chime4/s5_1ch/rnnlm b/egs/chime4/s5_1ch/rnnlm new file mode 120000 index 00000000000..e136939ba72 --- /dev/null +++ b/egs/chime4/s5_1ch/rnnlm @@ -0,0 +1 @@ +../../../scripts/rnnlm/ \ No newline at end of file diff --git a/egs/chime4/s5_1ch/run.sh b/egs/chime4/s5_1ch/run.sh index beb8c80207f..5b980dec827 100755 --- a/egs/chime4/s5_1ch/run.sh +++ b/egs/chime4/s5_1ch/run.sh @@ -6,26 +6,29 @@ # Inria (Emmanuel Vincent) # Mitsubishi Electric Research Labs (Shinji Watanabe) # 2017 JHU CLSP (Szu-Jui Chen) +# 2017 JHU CLSP (Aswin Shanmugam Subramanian) # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) . ./path.sh . ./cmd.sh + #####Baseline settings##### # Usage: -# 1. For using original baseline, execute './run.sh --baseline chime4_official'. -# We don't provide the function to train original baseline models anymore. Instead, we provided the -# trained original baseline models in tools/ASR_models for directly using. +# Execute './run.sh' to get the models. +# We provide BLSTM masking based enhancement --enhancement single_blstmmask # -# 2. For using advanced baseline, first execute './run.sh --baseline advanced --flatstart true' to -# get the models. If you want to use DNN instead of TDNN, add option "--tdnn false". -# Then execute './run.sh --baseline advanced' for your experiments. +# We stopped to support the old CHiME-3/4 baseline. 
If you want to reproduce the old results +# Please use the old version of Kaldi, e.g., git checkout 9e8ff73648917836d0870c8f6fdd2ff4bdde384f # Config: stage=0 # resume training with --stage N - -baseline=advanced -flatstart=false -tdnn=true +enhancement=single_blstmmask #### or your method +# if the following options are true, they wouldn't train a model again and will only do decoding +gmm_decode_only=false +tdnn_decode_only=false +# make it true when you want to add enhanced data into training set. But please note that when changing enhancement method, +# you may need to retrain from run_gmm.sh and avoid using decode-only options above +add_enhanced_data=true . utils/parse_options.sh || exit 1; @@ -40,107 +43,82 @@ set -o pipefail # If you use scripts distributed in the CHiME4 package, chime4_data=`pwd`/../.. # Otherwise, please specify it, e.g., -chime4_data=/db/laputa1/data/processed/public/CHiME4 - +# chime4_data=/db/laputa1/data/processed/public/CHiME4 +# chime3_data=/data2/archive/speech-db/original/public/CHiME3 case $(hostname -f) in - *.clsp.jhu.edu) chime4_data=/export/corpora4/CHiME4/CHiME3 ;; # JHU, + *.clsp.jhu.edu) + chime4_data=/export/corpora4/CHiME4/CHiME3 # JHU, + chime3_data=/export/corpora5/CHiME3 + ;; esac if [ ! -d $chime4_data ]; then - echo "$chime4_data does not exist. Please specify chime4 data root correctly" && exit 1 + echo "$chime4_data does not exist. Please specify chime4 data root correctly" && exit 1; fi -# Set a model directory for the CHiME4 data. -case $baseline in - chime4_official) - if $flatstart; then - echo "We don't support this anymore for 'chime4_official' baseline" - echo " ... Automatically set it to false" - fi - modeldir=$chime4_data/tools/ASR_models - flatstart=false - ;; - advanced) - modeldir=`pwd` - ;; - *) - echo "Usage: './run.sh --baseline chime4_official' or './run.sh --baseline advanced'" - echo " ... If you haven't run flatstart for advanced baseline, please execute" - echo " ... './run.sh --baseline advanced --flatstart true' first"; - exit 1; -esac - -if [ "$flatstart" = false ]; then - for d in $modeldir $modeldir/data/{lang,lang_test_tgpr_5k,lang_test_5gkn_5k,lang_test_rnnlm_5k_h300,local} \ - $modeldir/exp/{tri3b_tr05_multi_noisy,tri4a_dnn_tr05_multi_noisy,tri4a_dnn_tr05_multi_noisy_smbr_i1lats}; do - [ ! -d $d ] && echo "$0: no such directory $d. specify models correctly" && \ - echo " or execute './run.sh --baseline advanced --flatstart true' first" && exit 1; - done +if [ ! -d $chime3_data ]; then + echo "$chime3_data does not exist. Please specify chime4 data root correctly" && exit 1; fi -#####check data and model paths finished####### - #####main program start################ # You can execute run_init.sh only "once" # This creates 3-gram LM, FSTs, and basic task files -if [ $stage -le 0 ] && $flatstart; then +if [ $stage -le 0 ]; then local/run_init.sh $chime4_data fi -# In this script, we use non-enhanced 6th microphone signals. -enhancement_method=isolated_1ch_track -enhancement_data=$chime4_data/data/audio/16kHz/$enhancement_method -#if [ $stage -le 1 ]; then -# put your single channel enhancement -#fi +if [[ "$enhancement" == *isolated_1ch_track* ]]; then + enhancement_data=$chime4_data/data/audio/16kHz/isolated_1ch_track +else + enhancement_data=`pwd`/enhan/$enhancement +fi -# GMM based ASR experiment without "retraining" -# Please set a directory of your speech enhancement method. -# run_gmm_recog.sh can be done every time when you change a speech enhancement technique. 
-# The directory structure and audio files must follow the attached baseline enhancement directory +if [ $stage -le 1 ]; then + local/run_blstm_gev.sh --cmd "$train_cmd" --nj 20 --track 1 $chime4_data $chime3_data $enhancement_data 0 +fi + +# Compute PESQ, STOI, eSTOI, and SDR scores if [ $stage -le 2 ]; then - if $flatstart; then - local/run_gmm.sh $enhancement_method $enhancement_data $chime4_data - else - local/run_gmm_recog.sh $enhancement_method $enhancement_data $modeldir + if [ ! -f local/bss_eval_sources.m ] || [ ! -f local/stoi.m ] || [ ! -f local/estoi.m ] || [ ! -f local/PESQ ]; then + # download and install speech enhancement evaluation tools + local/download_se_eval_tool.sh + fi + chime4_rir_data=local/nn-gev/data/audio/16kHz/isolated_ext + if [ ! -d $chime4_rir_data ]; then + echo "$chime4_rir_dir does not exist. Please run 'blstm_gev' enhancement method first;" && exit 1; fi + local/compute_pesq.sh $enhancement $enhancement_data $chime4_rir_data $PWD + local/compute_stoi_estoi_sdr.sh $enhancement $enhancement_data $chime4_rir_data + local/compute_pesq.sh NOISY_1ch $chime4_data/data/audio/16kHz/isolated_1ch_track/ $chime4_rir_data $PWD + local/compute_stoi_estoi_sdr.sh NOISY_1ch $chime4_data/data/audio/16kHz/isolated_1ch_track/ $chime4_rir_data + local/write_se_results.sh $enhancement + local/write_se_results.sh NOISY_1ch fi -# DNN based ASR experiment -# Since it takes time to evaluate DNN, we make the GMM and DNN scripts separately. -# You may execute it after you would have promising results using GMM-based ASR experiments +# GMM based ASR experiment +# Please set a directory of your speech enhancement method. +# The directory structure and audio files must follow the attached baseline enhancement directory if [ $stage -le 3 ]; then - if $tdnn; then - if $flatstart; then - local/chain/run_tdnn.sh $enhancement_method - else - local/chain/run_tdnn_recog.sh $enhancement_method $modeldir - fi - else - if $flatstart; then - local/run_dnn.sh $enhancement_method - else - local/run_dnn_recog.sh $enhancement_method $modeldir - fi - fi + local/run_gmm.sh --add-enhanced-data $add_enhanced_data \ + --decode-only $gmm_decode_only $enhancement $enhancement_data $chime4_data +fi + +# TDNN based ASR experiment +# Since it takes time to evaluate TDNN, we make the GMM and TDNN scripts separately. +# You may execute it after you would have promising results using GMM-based ASR experiments +if [ $stage -le 4 ]; then + local/chain/run_tdnn.sh --decode-only $tdnn_decode_only $enhancement fi # LM-rescoring experiment with 5-gram and RNN LMs # It takes a few days to train a RNNLM. -if [ $stage -le 4 ]; then - if $flatstart; then - if $tdnn; then - local/run_lmrescore_tdnn.sh $chime4_data $enhancement_method - else - local/run_lmrescore.sh $chime4_data $enhancement_method - fi - else - if $tdnn; then - local/run_lmrescore_tdnn_recog.sh $enhancement_method $modeldir - else - local/run_lmrescore_recog.sh $enhancement_method $modeldir - fi - fi +if [ $stage -le 5 ]; then + local/run_lmrescore_tdnn.sh $chime4_data $enhancement +fi + +# LM-rescoring experiment with LSTM LMs +if [ $stage -le 6 ]; then + local/rnnlm/run_lstm.sh $enhancement fi echo "Done." 
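Note on running the rewritten s5_1ch recipe: the whole pipeline above is gated on the stage variable, and every variable set before utils/parse_options.sh (stage, enhancement, gmm_decode_only, tdnn_decode_only, add_enhanced_data) can be overridden from the command line, with --some-option mapping onto some_option. A minimal usage sketch, not part of the recipe itself; which stage to restart from depends on which artifacts already exist, and, as the comments above warn, the decode-only flags only make sense when the existing models were trained with the same enhancement:

# Resume after enhancement and SE scoring, reusing existing acoustic models
# and only decoding the chosen enhanced data.
./run.sh --stage 3 \
  --enhancement single_blstmmask \
  --gmm-decode-only true \
  --tdnn-decode-only true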
diff --git a/egs/chime4/s5_2ch/RESULTS b/egs/chime4/s5_2ch/RESULTS index f506b54c5db..156b94ebfa9 100644 --- a/egs/chime4/s5_2ch/RESULTS +++ b/egs/chime4/s5_2ch/RESULTS @@ -19,7 +19,8 @@ et05_simu WER: 27.57% (Average), 20.17% (BUS), 31.81% (CAFE), 29.96% (PEDESTRIAN et05_real WER: 29.03% (Average), 39.37% (BUS), 28.43% (CAFE), 27.56% (PEDESTRIAN), 20.77% (STREET) ------------------- -Advanced baseline: +GMM noisy multi-condition with beamformit using 6 channel data +exp/tri3b_tr05_multi_noisy/best_wer_beamformit_2mics.result ------------------- best overall dt05 WER 17.26% (language model weight = 10) ------------------- @@ -32,6 +33,19 @@ et05_simu WER: 26.85% (Average), 20.08% (BUS), 30.84% (CAFE), 29.03% (PEDESTRIAN et05_real WER: 27.91% (Average), 37.05% (BUS), 29.25% (CAFE), 25.37% (PEDESTRIAN), 19.97% (STREET) ------------------- +GMM noisy multi-condition with BLSTM masking using 6 channel data plus enhanced data +exp/tri3b_tr05_multi_noisy/best_wer_blstm_gev.result +------------------- +best overall dt05 WER 14.57% (language model weight = 10) +------------------- +dt05_simu WER: 15.62% (Average), 12.89% (BUS), 20.49% (CAFE), 14.22% (PEDESTRIAN), 14.90% (STREET) +------------------- +dt05_real WER: 13.52% (Average), 15.52% (BUS), 14.34% (CAFE), 11.57% (PEDESTRIAN), 12.67% (STREET) +------------------- +et05_simu WER: 19.05% (Average), 14.51% (BUS), 21.87% (CAFE), 20.41% (PEDESTRIAN), 19.39% (STREET) +------------------- +et05_real WER: 20.94% (Average), 26.66% (BUS), 21.52% (CAFE), 19.15% (PEDESTRIAN), 16.45% (STREET) +------------------- DNN sMBR exp/tri4a_dnn_tr05_multi_noisy_smbr_i1lats/best_wer_beamformit_2mics.result @@ -48,7 +62,7 @@ et05_simu WER: 19.04% (Average), 14.76% (BUS), 21.72% (CAFE), 19.22% (PEDESTRIAN et05_real WER: 20.44% (Average), 30.02% (BUS), 19.95% (CAFE), 17.79% (PEDESTRIAN), 14.01% (STREET) ------------------- -Advanced baseline: +DNN sMBR using all 6 channel data ------------------- best overall dt05 WER 10.13% (language model weight = 12) (Number of iterations = 3) @@ -77,7 +91,7 @@ et05_simu WER: 16.88% (Average), 12.08% (BUS), 19.70% (CAFE), 16.77% (PEDESTRIAN et05_real WER: 18.07% (Average), 26.77% (BUS), 17.93% (CAFE), 14.76% (PEDESTRIAN), 12.83% (STREET) ------------------- -Advanced baseline: +5-gram rescoring using all 6 channel data ------------------- best overall dt05 WER 8.53% (language model weight = 13) ------------------- @@ -105,7 +119,7 @@ et05_simu WER: 15.33% (Average), 10.66% (BUS), 18.21% (CAFE), 15.61% (PEDESTRIAN et05_real WER: 16.58% (Average), 25.37% (BUS), 15.97% (CAFE), 13.53% (PEDESTRIAN), 11.45% (STREET) ------------------- -Advanced baseline: +RNNLM using all 6 channel data ------------------- best overall dt05 WER 7.46% (language model weight = 14) ------------------- @@ -118,7 +132,7 @@ et05_simu WER: 12.57% (Average), 8.85% (BUS), 14.85% (CAFE), 12.44% (PEDESTRIAN) et05_real WER: 13.33% (Average), 18.94% (BUS), 13.04% (CAFE), 11.85% (PEDESTRIAN), 9.49% (STREET) ------------------- -TDNN +TDNN using all 6 channel data exp/chain/tdnn1d_sp/best_wer_beamformit_5mics.result ------------------- best overall dt05 WER 7.89% (language model weight = 10) @@ -132,8 +146,8 @@ et05_simu WER: 13.15% (Average), 9.77% (BUS), 14.16% (CAFE), 13.43% (PEDESTRIAN) et05_real WER: 13.39% (Average), 19.63% (BUS), 11.64% (CAFE), 11.49% (PEDESTRIAN), 10.80% (STREET) ------------------- -TDNN+RNNLM -exp/chain/tdnn1d_sp_smbr_lmrescore/best_wer_beamformit_5mics_rnnlm_5k_h300_w0.5_n100.result +TDNN+RNNLM using all 6 channel data 
+exp/chain/tdnn1d_sp_smbr_lmrescore/best_wer_beamformit_2mics_rnnlm_5k_h300_w0.5_n100.result ------------------- best overall dt05 WER 5.82% (language model weight = 11) ------------------- @@ -145,3 +159,73 @@ et05_simu WER: 9.90% (Average), 7.00% (BUS), 11.15% (CAFE), 10.05% (PEDESTRIAN), ------------------- et05_real WER: 10.53% (Average), 16.90% (BUS), 8.65% (CAFE), 8.52% (PEDESTRIAN), 8.05% (STREET) ------------------- + +TDNN using 6 channel data plus enhanced data +exp/chain/tdnn1a_sp/best_wer_beamformit_5mics.result +------------------- +best overall dt05 WER 7.57% (language model weight = 10) +------------------- +dt05_simu WER: 8.18% (Average), 7.12% (BUS), 10.16% (CAFE), 6.33% (PEDESTRIAN), 9.12% (STREET) +------------------- +dt05_real WER: 6.96% (Average), 9.38% (BUS), 6.46% (CAFE), 4.91% (PEDESTRIAN), 7.09% (STREET) +------------------- +et05_simu WER: 13.14% (Average), 9.92% (BUS), 14.55% (CAFE), 13.26% (PEDESTRIAN), 14.83% (STREET) +------------------- +et05_real WER: 12.81% (Average), 19.27% (BUS), 10.66% (CAFE), 11.29% (PEDESTRIAN), 10.03% (STREET) +------------------- + +TDNN+RNNLM using 6 channel data plus enhanced data +exp/chain/tdnn1a_sp_smbr_lmrescore/best_wer_beamformit_2mics_rnnlm_5k_h300_w0.5_n100.result +------------------- +best overall dt05 WER 5.52% (language model weight = 10) +------------------- +dt05_simu WER: 6.02% (Average), 5.28% (BUS), 7.37% (CAFE), 4.60% (PEDESTRIAN), 6.81% (STREET) +------------------- +dt05_real WER: 5.03% (Average), 7.23% (BUS), 4.26% (CAFE), 3.26% (PEDESTRIAN), 5.35% (STREET) +------------------- +et05_simu WER: 10.35% (Average), 7.84% (BUS), 11.04% (CAFE), 10.55% (PEDESTRIAN), 11.95% (STREET) +------------------- +et05_real WER: 10.20% (Average), 16.21% (BUS), 8.18% (CAFE), 8.43% (PEDESTRIAN), 7.98% (STREET) +------------------- + +TDNN with BLSTM masking using 6 channel data plus enhanced data +exp/chain/tdnn1a_sp/best_wer_blstm_gev.result +------------------- +best overall dt05 WER 6.35% (language model weight = 9) +------------------- +dt05_simu WER: 7.03% (Average), 5.72% (BUS), 9.32% (CAFE), 6.28% (PEDESTRIAN), 6.78% (STREET) +------------------- +dt05_real WER: 5.66% (Average), 6.89% (BUS), 5.99% (CAFE), 4.44% (PEDESTRIAN), 5.34% (STREET) +------------------- +et05_simu WER: 8.80% (Average), 6.80% (BUS), 10.20% (CAFE), 8.37% (PEDESTRIAN), 9.84% (STREET) +------------------- +et05_real WER: 9.46% (Average), 13.42% (BUS), 8.31% (CAFE), 8.76% (PEDESTRIAN), 7.34% (STREET) +------------------- + +TDNN+RNNLM with BLSTM masking using 6 channel data plus enhanced data +exp/chain/tdnn1a_sp_smbr_lmrescore/best_wer_blstm_gev_rnnlm_5k_h300_w0.5_n100.result +------------------- +best overall dt05 WER 4.41% (language model weight = 11) +------------------- +dt05_simu WER: 5.03% (Average), 4.13% (BUS), 6.83% (CAFE), 4.45% (PEDESTRIAN), 4.72% (STREET) +------------------- +dt05_real WER: 3.79% (Average), 4.68% (BUS), 3.94% (CAFE), 2.95% (PEDESTRIAN), 3.61% (STREET) +------------------- +et05_simu WER: 6.07% (Average), 4.52% (BUS), 6.93% (CAFE), 6.05% (PEDESTRIAN), 6.78% (STREET) +------------------- +et05_real WER: 6.93% (Average), 10.23% (BUS), 6.13% (CAFE), 6.41% (PEDESTRIAN), 4.97% (STREET) +------------------- + +TDNN+RNNLM with BLSTM masking using 6 channel data plus enhanced data +exp/chain/tdnn1a_sp_smbr_lmrescore/best_wer_blstm_gev_rnnlm_lstm_1a_w0.5_n100.result +------------------- +best overall dt05 WER 3.39% (language model weight = 10) +------------------- +dt05_simu WER: 3.94% (Average), 2.99% (BUS), 5.65% (CAFE), 3.44% 
(PEDESTRIAN), 3.67% (STREET) +------------------- +dt05_real WER: 2.85% (Average), 3.58% (BUS), 2.89% (CAFE), 2.07% (PEDESTRIAN), 2.85% (STREET) +------------------- +et05_simu WER: 5.03% (Average), 3.66% (BUS), 5.57% (CAFE), 4.87% (PEDESTRIAN), 6.03% (STREET) +------------------- +et05_real WER: 5.40% (Average), 7.81% (BUS), 4.71% (CAFE), 4.73% (PEDESTRIAN), 4.37% (STREET) +------------------- diff --git a/egs/chime4/s5_2ch/rnnlm b/egs/chime4/s5_2ch/rnnlm new file mode 120000 index 00000000000..e136939ba72 --- /dev/null +++ b/egs/chime4/s5_2ch/rnnlm @@ -0,0 +1 @@ +../../../scripts/rnnlm/ \ No newline at end of file diff --git a/egs/chime4/s5_2ch/run.sh b/egs/chime4/s5_2ch/run.sh index e1a3fecbce5..7ae5048c6fa 100755 --- a/egs/chime4/s5_2ch/run.sh +++ b/egs/chime4/s5_2ch/run.sh @@ -6,26 +6,30 @@ # Inria (Emmanuel Vincent) # Mitsubishi Electric Research Labs (Shinji Watanabe) # 2017 JHU CLSP (Szu-Jui Chen) +# 2017 JHU CLSP (Aswin Shanmugam Subramanian) # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) . ./path.sh . ./cmd.sh + #####Baseline settings##### # Usage: -# 1. For using original baseline, execute './run.sh --baseline chime4_official'. -# We don't provide the function to train original baseline models anymore. Instead, we provided the -# trained original baseline models in tools/ASR_models for directly using. +# Execute './run.sh' to get the models. +# We provide three kinds of beamform methods. Add option --enhancement blstm_gev, or --enhancement beamformit_2mics +# to use them. i.g. './run.sh --enhancement blstm_gev' # -# 2. For using advanced baseline, first execute './run.sh --baseline advanced --flatstart true' to -# get the models. If you want to use DNN instead of TDNN, add option "--tdnn false". -# Then execute './run.sh --baseline advanced' for your experiments. +# We stopped to support the old CHiME-3/4 baseline. If you want to reproduce the old results +# Please use the old version of Kaldi, e.g., git checkout 9e8ff73648917836d0870c8f6fdd2ff4bdde384f # Config: stage=0 # resume training with --stage N - -baseline=advanced -flatstart=false -tdnn=true +enhancement=blstm_gev #### or your method +# if the following options are true, they wouldn't train a model again and will only do decoding +gmm_decode_only=false +tdnn_decode_only=false +# make it true when you want to add enhanced data into training set. But please note that when changing enhancement method, +# you may need to retrain from run_gmm.sh and avoid using decode-only options above +add_enhanced_data=true . utils/parse_options.sh || exit 1; @@ -40,109 +44,89 @@ set -o pipefail # If you use scripts distributed in the CHiME4 package, chime4_data=`pwd`/../.. # Otherwise, please specify it, e.g., -chime4_data=/db/laputa1/data/processed/public/CHiME4 +# chime4_data=/db/laputa1/data/processed/public/CHiME4 +# chime3_data=/data2/archive/speech-db/original/public/CHiME3 case $(hostname -f) in - *.clsp.jhu.edu) chime4_data=/export/corpora4/CHiME4/CHiME3 ;; # JHU, + *.clsp.jhu.edu) + chime4_data=/export/corpora4/CHiME4/CHiME3 # JHU, + chime3_data=/export/corpora5/CHiME3 + ;; esac if [ ! -d $chime4_data ]; then - echo "$chime4_data does not exist. Please specify chime4 data root correctly" && exit 1 + echo "$chime4_data does not exist. Please specify chime4 data root correctly" && exit 1; fi -# Set a model directory for the CHiME4 data. -case $baseline in - chime4_official) - if $flatstart; then - echo "We don't support this anymore for 'chime4_official' baseline" - echo " ... 
Automatically set it to false" - fi - modeldir=$chime4_data/tools/ASR_models - flatstart=false - ;; - advanced) - modeldir=`pwd` - ;; - *) - echo "Usage: './run.sh --baseline chime4_official' or './run.sh --baseline advanced'" - echo " ... If you haven't run flatstart to train the model of advanced baseline," - echo " ... please execute './run.sh --baseline advanced --flatstart true' first"; - exit 1; -esac - -if [ "$flatstart" = false ]; then - for d in $modeldir $modeldir/data/{lang,lang_test_tgpr_5k,lang_test_5gkn_5k,lang_test_rnnlm_5k_h300,local} \ - $modeldir/exp/{tri3b_tr05_multi_noisy,tri4a_dnn_tr05_multi_noisy,tri4a_dnn_tr05_multi_noisy_smbr_i1lats}; do - [ ! -d $d ] && echo "$0: no such directory $d. specify models correctly" && \ - echo " or execute './run.sh --baseline advanced --flatstart true' first" && exit 1; - done +if [ ! -d $chime3_data ]; then + echo "$chime3_data does not exist. Please specify chime4 data root correctly" && exit 1; fi -#####check data and model paths finished####### - #####main program start################ # You can execute run_init.sh only "once" # This creates 3-gram LM, FSTs, and basic task files -if [ $stage -le 0 ] && $flatstart; then +if [ $stage -le 0 ]; then local/run_init.sh $chime4_data fi -# Using Beamformit -# See Hori et al, "The MERL/SRI system for the 3rd CHiME challenge using beamforming, -# robust feature extraction, and advanced speech recognition," in Proc. ASRU'15 -# note that beamformed wav files are generated in the following directory -enhancement_method=beamformit_2mics -enhancement_data=`pwd`/enhan/$enhancement_method +# Using Beamformit or mask-based beamformer +# note that beamformed WAV files are generated in the following directory +enhancement_data=`pwd`/enhan/$enhancement if [ $stage -le 1 ]; then - local/run_beamform_2ch_track.sh --cmd "$train_cmd" --nj 20 $chime4_data/data/audio/16kHz/isolated_2ch_track $enhancement_data + case $enhancement in + beamformit_2mics) + local/run_beamform_2ch_track.sh --cmd "$train_cmd" --nj 20 $chime4_data/data/audio/16kHz/isolated_2ch_track $enhancement_data + ;; + blstm_gev) + local/run_blstm_gev.sh --cmd "$train_cmd" --nj 20 --track 2 $chime4_data $chime3_data $enhancement_data 0 + ;; + *) + echo "Usage: --enhancement blstm_gev, or --enhancement beamformit_2mics" + exit 1; + esac fi -# GMM based ASR experiment without "retraining" -# Please set a directory of your speech enhancement method. -# run_gmm_recog.sh can be done every time when you change a speech enhancement technique. -# The directory structure and audio files must follow the attached baseline enhancement directory +# Compute PESQ, STOI, eSTOI, and SDR scores if [ $stage -le 2 ]; then - if $flatstart; then - local/run_gmm.sh $enhancement_method $enhancement_data $chime4_data - else - local/run_gmm_recog.sh $enhancement_method $enhancement_data $modeldir + if [ ! -f local/bss_eval_sources.m ] || [ ! -f local/stoi.m ] || [ ! -f local/estoi.m ] || [ ! -f local/PESQ ]; then + # download and install speech enhancement evaluation tools + local/download_se_eval_tool.sh + fi + chime4_rir_data=local/nn-gev/data/audio/16kHz/isolated_ext + if [ ! -d $chime4_rir_data ]; then + echo "$chime4_rir_dir does not exist. 
Please run 'blstm_gev' enhancement method first;" && exit 1; fi + local/compute_pesq.sh $enhancement $enhancement_data $chime4_rir_data $PWD + local/compute_stoi_estoi_sdr.sh $enhancement $enhancement_data $chime4_rir_data + local/compute_pesq.sh NOISY_1ch $chime4_data/data/audio/16kHz/isolated_1ch_track/ $chime4_rir_data $PWD + local/compute_stoi_estoi_sdr.sh NOISY_1ch $chime4_data/data/audio/16kHz/isolated_1ch_track/ $chime4_rir_data + local/write_se_results.sh $enhancement + local/write_se_results.sh NOISY_1ch fi -# DNN based ASR experiment -# Since it takes time to evaluate DNN, we make the GMM and DNN scripts separately. -# You may execute it after you would have promising results using GMM-based ASR experiments +# GMM based ASR experiment +# Please set a directory of your speech enhancement method. +# The directory structure and audio files must follow the attached baseline enhancement directory if [ $stage -le 3 ]; then - if $tdnn; then - if $flatstart; then - local/chain/run_tdnn.sh $enhancement_method - else - local/chain/run_tdnn_recog.sh $enhancement_method $modeldir - fi - else - if $flatstart; then - local/run_dnn.sh $enhancement_method - else - local/run_dnn_recog.sh $enhancement_method $modeldir - fi - fi + local/run_gmm.sh --add-enhanced-data $add_enhanced_data \ + --decode-only $gmm_decode_only $enhancement $enhancement_data $chime4_data +fi + +# TDNN based ASR experiment +# Since it takes time to evaluate TDNN, we make the GMM and TDNN scripts separately. +# You may execute it after you would have promising results using GMM-based ASR experiments +if [ $stage -le 4 ]; then + local/chain/run_tdnn.sh --decode-only $tdnn_decode_only $enhancement fi # LM-rescoring experiment with 5-gram and RNN LMs # It takes a few days to train a RNNLM. -if [ $stage -le 4 ]; then - if $flatstart; then - if $tdnn; then - local/run_lmrescore_tdnn.sh $chime4_data $enhancement_method - else - local/run_lmrescore.sh $chime4_data $enhancement_method - fi - else - if $tdnn; then - local/run_lmrescore_tdnn_recog.sh $enhancement_method $modeldir - else - local/run_lmrescore_recog.sh $enhancement_method $modeldir - fi - fi +if [ $stage -le 5 ]; then + local/run_lmrescore_tdnn.sh $chime4_data $enhancement +fi + +# LM-rescoring experiment with LSTM LMs +if [ $stage -le 6 ]; then + local/rnnlm/run_lstm.sh $enhancement fi echo "Done." 
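As in the 1ch recipe, the stage-1 case statement above only covers the bundled methods (beamformit_2mics and blstm_gev), so plugging in your own 2ch enhancement means supplying the enhanced waveforms yourself. A hedged sketch of that workflow: the name myenh is made up, the files under enhan/myenh must follow the same layout as the baseline enhancement output, and the SE-scoring stage still expects the reverberated references produced by an earlier blstm_gev run.

# Hypothetical custom enhancement "myenh": provide the wav files, then skip
# the built-in enhancement stage and start from SE scoring.
mkdir -p enhan/myenh
# ... write enhanced wav files into enhan/myenh here ...
./run.sh --stage 2 --enhancement myenh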
diff --git a/egs/chime4/s5_6ch/RESULTS b/egs/chime4/s5_6ch/RESULTS index 7d602d49247..266216adc16 100644 --- a/egs/chime4/s5_6ch/RESULTS +++ b/egs/chime4/s5_6ch/RESULTS @@ -19,20 +19,21 @@ et05_simu WER: 21.30% (Average), 15.73% (BUS), 22.94% (CAFE), 22.51% (PEDESTRIAN et05_real WER: 21.83% (Average), 30.17% (BUS), 20.66% (CAFE), 19.82% (PEDESTRIAN), 16.68% (STREET) ------------------- -Advanced baseline: +GMM noisy multi-condition with blstm_gev +exp/tri3b_tr05_multi_noisy/best_wer_blstm_gev.result ------------------- -best overall dt05 WER 13.60% (language model weight = 12) +best overall dt05 WER 11.17% (language model weight = 12) ------------------- -dt05_simu WER: 14.23% (Average), 12.24% (BUS), 17.20% (CAFE), 12.05% (PEDESTRIAN), 15.44% (STREET) +dt05_simu WER: 11.44% (Average), 9.78% (BUS), 14.37% (CAFE), 10.10% (PEDESTRIAN), 11.50% (STREET) ------------------- -dt05_real WER: 12.96% (Average), 15.42% (BUS), 12.94% (CAFE), 10.18% (PEDESTRIAN), 13.30% (STREET) +dt05_real WER: 10.91% (Average), 11.21% (BUS), 11.24% (CAFE), 10.34% (PEDESTRIAN), 10.84% (STREET) ------------------- -et05_simu WER: 20.46% (Average), 14.77% (BUS), 21.78% (CAFE), 22.49% (PEDESTRIAN), 22.81% (STREET) +et05_simu WER: 13.54% (Average), 11.65% (BUS), 14.90% (CAFE), 13.73% (PEDESTRIAN), 13.86% (STREET) ------------------- -et05_real WER: 21.14% (Average), 28.40% (BUS), 21.29% (CAFE), 18.68% (PEDESTRIAN), 16.19% (STREET) +et05_real WER: 14.62% (Average), 16.43% (BUS), 15.43% (CAFE), 12.99% (PEDESTRIAN), 13.63% (STREET) ------------------- -DNN sMBR +DNN sMBR with beamformit exp/tri4a_dnn_tr05_multi_noisy_smbr_i1lats/best_wer_beamformit_5mics.result ------------------- best overall dt05 WER 8.60% (language model weight = 11) @@ -47,98 +48,120 @@ et05_simu WER: 14.23% (Average), 10.72% (BUS), 15.52% (CAFE), 13.90% (PEDESTRIAN et05_real WER: 15.00% (Average), 21.74% (BUS), 13.58% (CAFE), 12.84% (PEDESTRIAN), 11.86% (STREET) ------------------- -Advanced baseline: +DNN sMBR with blstm_gev +exp/tri4a_dnn_tr05_multi_noisy_smbr_i1lats/best_wer_blstm_gev.result ------------------- -best overall dt05 WER 7.72% (language model weight = 12) - (Number of iterations = 3) +best overall dt05 WER 7.38% (language model weight = 11) + (Number of iterations = 4) ------------------- -dt05_simu WER: 7.98% (Average), 6.96% (BUS), 9.75% (CAFE), 6.56% (PEDESTRIAN), 8.66% (STREET) +dt05_simu WER: 7.49% (Average), 5.93% (BUS), 9.69% (CAFE), 6.73% (PEDESTRIAN), 7.61% (STREET) ------------------- -dt05_real WER: 7.45% (Average), 9.15% (BUS), 8.10% (CAFE), 5.40% (PEDESTRIAN), 7.17% (STREET) +dt05_real WER: 7.28% (Average), 7.83% (BUS), 7.80% (CAFE), 6.37% (PEDESTRIAN), 7.11% (STREET) ------------------- -et05_simu WER: 12.30% (Average), 9.45% (BUS), 13.26% (CAFE), 11.77% (PEDESTRIAN), 14.74% (STREET) +et05_simu WER: 9.54% (Average), 8.18% (BUS), 10.87% (CAFE), 9.81% (PEDESTRIAN), 9.32% (STREET) ------------------- -et05_real WER: 12.64% (Average), 16.34% (BUS), 12.36% (CAFE), 10.93% (PEDESTRIAN), 10.93% (STREET) +et05_real WER: 9.77% (Average), 11.42% (BUS), 10.22% (CAFE), 9.23% (PEDESTRIAN), 8.22% (STREET) ------------------- -5-gram rescoring -exp/tri4a_dnn_tr05_multi_noisy_smbr_lmrescore/best_wer_beamformit_5mics_5gkn_5k.result +RNNLM with beamformit +exp/tri4a_dnn_tr05_multi_noisy_smbr_lmrescore/best_wer_beamformit_5mics_rnnlm_5k_h300_w0.5_n100.result ------------------- -best overall dt05 WER 7.30% (language model weight = 11) +best overall dt05 WER 6.27% (language model weight = 12) ------------------- -dt05_simu WER: 7.75% (Average), 
7.14% (BUS), 9.13% (CAFE), 6.33% (PEDESTRIAN), 8.41% (STREET) +dt05_simu WER: 6.77% (Average), 6.02% (BUS), 8.10% (CAFE), 5.49% (PEDESTRIAN), 7.48% (STREET) ------------------- -dt05_real WER: 6.85% (Average), 8.53% (BUS), 6.90% (CAFE), 4.72% (PEDESTRIAN), 7.24% (STREET) +dt05_real WER: 5.76% (Average), 7.39% (BUS), 5.77% (CAFE), 3.72% (PEDESTRIAN), 6.18% (STREET) ------------------- -et05_simu WER: 12.31% (Average), 8.82% (BUS), 13.04% (CAFE), 11.84% (PEDESTRIAN), 15.54% (STREET) +et05_simu WER: 10.90% (Average), 7.68% (BUS), 11.54% (CAFE), 10.31% (PEDESTRIAN), 14.06% (STREET) ------------------- -et05_real WER: 13.23% (Average), 19.07% (BUS), 11.80% (CAFE), 11.51% (PEDESTRIAN), 10.53% (STREET) +et05_real WER: 11.51% (Average), 16.86% (BUS), 10.18% (CAFE), 9.83% (PEDESTRIAN), 9.19% (STREET) ------------------- -Advanced baseline: +######## Advanced baseline +######## All 6 channel training, enhanced data training, Lattice-free MMI TDNN, BLSTM-mask-based GEV beamformer + +TDNN with beamformit +exp/chain/tdnn1d_sp/best_wer_beamformit_5mics.result ------------------- -best overall dt05 WER 6.25% (language model weight = 13) +best overall dt05 WER 6.04% (language model weight = 9) ------------------- -dt05_simu WER: 6.58% (Average), 5.86% (BUS), 7.89% (CAFE), 5.19% (PEDESTRIAN), 7.39% (STREET) +dt05_simu WER: 6.25% (Average), 5.71% (BUS), 6.92% (CAFE), 5.37% (PEDESTRIAN), 7.02% (STREET) ------------------- -dt05_real WER: 5.92% (Average), 7.46% (BUS), 6.19% (CAFE), 4.25% (PEDESTRIAN), 5.77% (STREET) +dt05_real WER: 5.83% (Average), 7.48% (BUS), 5.28% (CAFE), 4.43% (PEDESTRIAN), 6.13% (STREET) ------------------- -et05_simu WER: 10.50% (Average), 7.81% (BUS), 11.06% (CAFE), 10.44% (PEDESTRIAN), 12.70% (STREET) +et05_simu WER: 10.30% (Average), 7.34% (BUS), 10.37% (CAFE), 10.05% (PEDESTRIAN), 13.43% (STREET) ------------------- -et05_real WER: 10.68% (Average), 13.97% (BUS), 10.48% (CAFE), 9.08% (PEDESTRIAN), 9.19% (STREET) +et05_real WER: 9.67% (Average), 12.71% (BUS), 8.33% (CAFE), 8.20% (PEDESTRIAN), 9.45% (STREET) ------------------- -RNNLM -exp/tri4a_dnn_tr05_multi_noisy_smbr_lmrescore/best_wer_beamformit_5mics_rnnlm_5k_h300_w0.5_n100.result +TDNN+RNNLM with beamformit +exp/chain/tdnn1d_sp_smbr_lmrescore/best_wer_beamformit_5mics_rnnlm_5k_h300_w0.5_n100.result ------------------- -best overall dt05 WER 6.27% (language model weight = 12) +best overall dt05 WER 4.15% (language model weight = 9) ------------------- -dt05_simu WER: 6.77% (Average), 6.02% (BUS), 8.10% (CAFE), 5.49% (PEDESTRIAN), 7.48% (STREET) +dt05_simu WER: 4.33% (Average), 3.95% (BUS), 4.87% (CAFE), 3.53% (PEDESTRIAN), 4.97% (STREET) ------------------- -dt05_real WER: 5.76% (Average), 7.39% (BUS), 5.77% (CAFE), 3.72% (PEDESTRIAN), 6.18% (STREET) +dt05_real WER: 3.97% (Average), 5.38% (BUS), 3.19% (CAFE), 2.94% (PEDESTRIAN), 4.37% (STREET) ------------------- -et05_simu WER: 10.90% (Average), 7.68% (BUS), 11.54% (CAFE), 10.31% (PEDESTRIAN), 14.06% (STREET) +et05_simu WER: 7.39% (Average), 4.87% (BUS), 7.58% (CAFE), 7.15% (PEDESTRIAN), 9.96% (STREET) ------------------- -et05_real WER: 11.51% (Average), 16.86% (BUS), 10.18% (CAFE), 9.83% (PEDESTRIAN), 9.19% (STREET) +et05_real WER: 7.04% (Average), 9.89% (BUS), 5.49% (CAFE), 5.70% (PEDESTRIAN), 7.10% (STREET) ------------------- -Advanced baseline: +TDNN using 6 channel data plus enhanced data with beamformit +exp/chain/tdnn7a_sp/best_wer_beamformit_5mics.result ------------------- -best overall dt05 WER 5.44% (language model weight = 13) +best overall dt05 WER 5.80% 
(language model weight = 10) ------------------- -dt05_simu WER: 5.82% (Average), 4.90% (BUS), 6.96% (CAFE), 4.62% (PEDESTRIAN), 6.81% (STREET) +dt05_simu WER: 6.19% (Average), 5.96% (BUS), 6.78% (CAFE), 5.10% (PEDESTRIAN), 6.92% (STREET) ------------------- -dt05_real WER: 5.05% (Average), 6.43% (BUS), 5.03% (CAFE), 3.42% (PEDESTRIAN), 5.31% (STREET) +dt05_real WER: 5.41% (Average), 6.86% (BUS), 4.87% (CAFE), 4.00% (PEDESTRIAN), 5.91% (STREET) ------------------- -et05_simu WER: 9.24% (Average), 6.65% (BUS), 9.81% (CAFE), 9.23% (PEDESTRIAN), 11.28% (STREET) +et05_simu WER: 10.26% (Average), 7.68% (BUS), 10.40% (CAFE), 10.16% (PEDESTRIAN), 12.79% (STREET) ------------------- -et05_real WER: 9.50% (Average), 12.64% (BUS), 8.76% (CAFE), 7.96% (PEDESTRIAN), 8.63% (STREET) +et05_real WER: 9.63% (Average), 13.46% (BUS), 7.98% (CAFE), 8.13% (PEDESTRIAN), 8.97% (STREET) ------------------- -TDNN -exp/chain/tdnn1d_sp/best_wer_beamformit_5mics.result +TDNN+RNNLM using 6 channel data plus enhanced data with beamformit +exp/chain/tdnn7a_sp_smbr_lmrescore/best_wer_beamformit_5mics_rnnlm_5k_h300_w0.5_n100.result +compute dt05 WER for each location ------------------- -best overall dt05 WER 6.04% (language model weight = 9) +best overall dt05 WER 4.02% (language model weight = 11) ------------------- -dt05_simu WER: 6.25% (Average), 5.71% (BUS), 6.92% (CAFE), 5.37% (PEDESTRIAN), 7.02% (STREET) +dt05_simu WER: 4.31% (Average), 4.04% (BUS), 4.88% (CAFE), 3.38% (PEDESTRIAN), 4.94% (STREET) ------------------- -dt05_real WER: 5.83% (Average), 7.48% (BUS), 5.28% (CAFE), 4.43% (PEDESTRIAN), 6.13% (STREET) +dt05_real WER: 3.74% (Average), 4.62% (BUS), 3.17% (CAFE), 3.02% (PEDESTRIAN), 4.14% (STREET) ------------------- -et05_simu WER: 10.30% (Average), 7.34% (BUS), 10.37% (CAFE), 10.05% (PEDESTRIAN), 13.43% (STREET) +et05_simu WER: 7.49% (Average), 5.16% (BUS), 7.21% (CAFE), 7.45% (PEDESTRIAN), 10.14% (STREET) ------------------- -et05_real WER: 9.67% (Average), 12.71% (BUS), 8.33% (CAFE), 8.20% (PEDESTRIAN), 9.45% (STREET) +et05_real WER: 6.84% (Average), 9.74% (BUS), 5.38% (CAFE), 5.25% (PEDESTRIAN), 7.00% (STREET) ------------------- -TDNN+RNNLM -exp/chain/tdnn1d_sp_smbr_lmrescore/best_wer_beamformit_5mics_rnnlm_5k_h300_w0.5_n100.result +TDNN+RNNLM using 6 channel data plus enhanced data with blstm_gev +exp/chain/tdnn1a_sp_smbr_lmrescore/best_wer_blstm_gev_rnnlm_5k_h300_w0.5_n100.result ------------------- -best overall dt05 WER 4.15% (language model weight = 9) +best overall dt05 WER 3.01% (language model weight = 10) ------------------- -dt05_simu WER: 4.33% (Average), 3.95% (BUS), 4.87% (CAFE), 3.53% (PEDESTRIAN), 4.97% (STREET) +dt05_simu WER: 3.10% (Average), 2.60% (BUS), 4.07% (CAFE), 2.80% (PEDESTRIAN), 2.92% (STREET) ------------------- -dt05_real WER: 3.97% (Average), 5.38% (BUS), 3.19% (CAFE), 2.94% (PEDESTRIAN), 4.37% (STREET) +dt05_real WER: 2.93% (Average), 3.32% (BUS), 2.83% (CAFE), 2.63% (PEDESTRIAN), 2.93% (STREET) ------------------- -et05_simu WER: 7.39% (Average), 4.87% (BUS), 7.58% (CAFE), 7.15% (PEDESTRIAN), 9.96% (STREET) +et05_simu WER: 3.95% (Average), 3.29% (BUS), 4.71% (CAFE), 4.30% (PEDESTRIAN), 3.53% (STREET) ------------------- -et05_real WER: 7.04% (Average), 9.89% (BUS), 5.49% (CAFE), 5.70% (PEDESTRIAN), 7.10% (STREET) -------------------- \ No newline at end of file +et05_real WER: 4.04% (Average), 4.94% (BUS), 3.66% (CAFE), 3.66% (PEDESTRIAN), 3.90% (STREET) +------------------- + +TDNN+LSTMLM using 6 channel data plus enhanced data with blstm_gev 
+exp/chain/tdnn1a_sp_smbr_lmrescore/best_wer_blstm_gev_rnnlm_lstm_1a_w0.5_n100.result +------------------- +best overall dt05 WER 2.00% (language model weight = 11) +------------------- +dt05_simu WER: 2.10% (Average), 2.06% (BUS), 2.58% (CAFE), 1.73% (PEDESTRIAN), 2.02% (STREET) +------------------- +dt05_real WER: 1.90% (Average), 2.05% (BUS), 1.78% (CAFE), 1.68% (PEDESTRIAN), 2.09% (STREET) +------------------- +et05_simu WER: 2.66% (Average), 2.33% (BUS), 2.73% (CAFE), 2.93% (PEDESTRIAN), 2.63% (STREET) +------------------- +et05_real WER: 2.74% (Average), 3.05% (BUS), 2.45% (CAFE), 2.65% (PEDESTRIAN), 2.82% (STREET) +------------------- + diff --git a/egs/chime4/s5_6ch/rnnlm b/egs/chime4/s5_6ch/rnnlm new file mode 120000 index 00000000000..e136939ba72 --- /dev/null +++ b/egs/chime4/s5_6ch/rnnlm @@ -0,0 +1 @@ +../../../scripts/rnnlm/ \ No newline at end of file diff --git a/egs/chime4/s5_6ch/run.sh b/egs/chime4/s5_6ch/run.sh index 090808c026b..1979a040bd8 100755 --- a/egs/chime4/s5_6ch/run.sh +++ b/egs/chime4/s5_6ch/run.sh @@ -1,33 +1,33 @@ -#!/bin/bash - # Kaldi ASR baseline for the CHiME-4 Challenge (6ch track: 6 channel track) # # Copyright 2016 University of Sheffield (Jon Barker, Ricard Marxer) # Inria (Emmanuel Vincent) # Mitsubishi Electric Research Labs (Shinji Watanabe) # 2017 JHU CLSP (Szu-Jui Chen) +# 2017 JHU CLSP (Aswin Shanmugam Subramanian) # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) . ./path.sh . ./cmd.sh + #####Baseline settings##### # Usage: -# 1. For using original baseline, execute './run.sh --baseline chime4_official'. -# We don't provide the function to train original baseline models anymore. Instead, we provided the -# trained original baseline models in tools/ASR_models for directly using. +# Execute './run.sh' to get the models. +# We provide three kinds of beamform methods. Add option --enhancement blstm_gev, or --enhancement beamformit_5mics +# or --enhancement single_blstmmask to use them. i.g. './run.sh --enhancement blstm_gev' # -# 2. For using advanced baseline, first execute './run.sh --baseline advanced --flatstart true' to -# get the models. If you want to use TDNN instead of DNN, add option "--tdnn true". If you want to -# use TDNN-LSTM instead of DNN, add option "--tdnn-lstm true". -# Then execute './run.sh --baseline advanced' for your experiments. +# We stopped to support the old CHiME-3/4 baseline. If you want to reproduce the old results +# Please use the old version of Kaldi, e.g., git checkout 9e8ff73648917836d0870c8f6fdd2ff4bdde384f # Config: stage=0 # resume training with --stage N - -baseline=advanced -flatstart=false -tdnn=true -tdnn_lstm=false +enhancement=blstm_gev #### or your method +# if the following options are true, they wouldn't train a model again and will only do decoding +gmm_decode_only=false +tdnn_decode_only=false +# make it true when you want to add enhanced data into training set. But please note that when changing enhancement method, +# you may need to retrain from run_gmm.sh and avoid using decode-only options above +add_enhanced_data=true . utils/parse_options.sh || exit 1; @@ -42,119 +42,92 @@ set -o pipefail # If you use scripts distributed in the CHiME4 package, chime4_data=`pwd`/../.. 
# Otherwise, please specify it, e.g., -chime4_data=/db/laputa1/data/processed/public/CHiME4 +# chime4_data=/db/laputa1/data/processed/public/CHiME4 +# chime3_data=/data2/archive/speech-db/original/public/CHiME3 case $(hostname -f) in - *.clsp.jhu.edu) chime4_data=/export/corpora4/CHiME4/CHiME3 ;; # JHU, + *.clsp.jhu.edu) + chime4_data=/export/corpora4/CHiME4/CHiME3 # JHU, + chime3_data=/export/corpora5/CHiME3 + ;; esac if [ ! -d $chime4_data ]; then echo "$chime4_data does not exist. Please specify chime4 data root correctly" && exit 1; fi -# Set a model directory for the CHiME4 data. -case $baseline in - chime4_official) - if $flatstart; then - echo "We don't support this anymore for 'chime4_official' baseline" - echo " ... Automatically set it to false" - fi - modeldir=$chime4_data/tools/ASR_models - flatstart=false - ;; - advanced) - modeldir=`pwd` - ;; - *) - echo "Usage: './run.sh --baseline chime4_official' or './run.sh --baseline advanced'" - echo " ... If you haven't run flatstart for advanced baseline, please execute" - echo " ... './run.sh --baseline advanced --flatstart true' first"; - exit 1; -esac - -if [ "$flatstart" = false ]; then - for d in $modeldir $modeldir/data/{lang,lang_test_tgpr_5k,lang_test_5gkn_5k,lang_test_rnnlm_5k_h300,local} \ - $modeldir/exp/{tri3b_tr05_multi_noisy,tri4a_dnn_tr05_multi_noisy,tri4a_dnn_tr05_multi_noisy_smbr_i1lats}; do - [ ! -d $d ] && echo "$0: no such directory $d. specify models correctly" && \ - echo " or execute './run.sh --baseline advanced --flatstart true' first" && exit 1; - done +if [ ! -d $chime3_data ]; then + echo "$chime3_data does not exist. Please specify chime4 data root correctly" && exit 1; fi -#####check data and model paths finished####### - #####main program start################ # You can execute run_init.sh only "once" # This creates 3-gram LM, FSTs, and basic task files -if [ $stage -le 0 ] && $flatstart; then +if [ $stage -le 0 ]; then local/run_init.sh $chime4_data fi -# Using Beamformit -# See Hori et al, "The MERL/SRI system for the 3rd CHiME challenge using beamforming, -# robust feature extraction, and advanced speech recognition," in Proc. ASRU'15 -# note that beamformed wav files are generated in the following directory -enhancement_method=beamformit_5mics -enhancement_data=`pwd`/enhan/$enhancement_method +# Using Beamformit or mask-based beamformer +# note that beamformed WAV files are generated in the following directory +enhancement_data=`pwd`/enhan/$enhancement if [ $stage -le 1 ]; then - local/run_beamform_6ch_track.sh --cmd "$train_cmd" --nj 20 $chime4_data/data/audio/16kHz/isolated_6ch_track $enhancement_data + case $enhancement in + beamformit_5mics) + local/run_beamform_6ch_track.sh --cmd "$train_cmd" --nj 20 $chime4_data/data/audio/16kHz/isolated_6ch_track $enhancement_data + ;; + blstm_gev) + local/run_blstm_gev.sh --cmd "$train_cmd" --nj 20 $chime4_data $chime3_data $enhancement_data 0 + ;; + single_blstmmask) + local/run_blstm_gev.sh --cmd "$train_cmd" --nj 20 $chime4_data $chime3_data $enhancement_data 5 + ;; + *) + echo "Usage: --enhancement blstm_gev, or --enhancement beamformit_5mics , or --enhancement single_blstmmask" + exit 1; + esac fi -# GMM based ASR experiment without "retraining" -# Please set a directory of your speech enhancement method. -# run_gmm_recog.sh can be done every time when you change a speech enhancement technique. 
-# The directory structure and audio files must follow the attached baseline enhancement directory +# Compute PESQ, STOI, eSTOI, and SDR scores if [ $stage -le 2 ]; then - if $flatstart; then - local/run_gmm.sh $enhancement_method $enhancement_data $chime4_data - else - local/run_gmm_recog.sh $enhancement_method $enhancement_data $modeldir + if [ ! -f local/bss_eval_sources.m ] || [ ! -f local/stoi.m ] || [ ! -f local/estoi.m ] || [ ! -f local/PESQ ]; then + # download and install speech enhancement evaluation tools + local/download_se_eval_tool.sh fi + chime4_rir_data=local/nn-gev/data/audio/16kHz/isolated_ext + if [ ! -d $chime4_rir_data ]; then + echo "$chime4_rir_data does not exist. Please run 'blstm_gev' enhancement method first;" && exit 1; + fi + local/compute_pesq.sh $enhancement $enhancement_data $chime4_rir_data $PWD + local/compute_stoi_estoi_sdr.sh $enhancement $enhancement_data $chime4_rir_data + local/compute_pesq.sh NOISY_1ch $chime4_data/data/audio/16kHz/isolated_1ch_track/ $chime4_rir_data $PWD + local/compute_stoi_estoi_sdr.sh NOISY_1ch $chime4_data/data/audio/16kHz/isolated_1ch_track/ $chime4_rir_data + local/write_se_results.sh $enhancement + local/write_se_results.sh NOISY_1ch fi -# DNN based ASR experiment -# Since it takes time to evaluate DNN, we make the GMM and DNN scripts separately. -# You may execute it after you would have promising results using GMM-based ASR experiments +# GMM based ASR experiment +# Please set a directory of your speech enhancement method. +# The directory structure and audio files must follow the attached baseline enhancement directory if [ $stage -le 3 ]; then - if $tdnn; then - if $flatstart; then - local/chain/run_tdnn.sh $enhancement_method - else - local/chain/run_tdnn_recog.sh $enhancement_method $modeldir - fi - elif $tdnn_lstm; then - if $flatstart; then - local/chain/run_tdnn_lstm.sh $enhancement_method - else - local/chain/run_tdnn_lstm_recog.sh $enhancement_method $modeldir - fi - else - if $flatstart; then - local/run_dnn.sh $enhancement_method - else - local/run_dnn_recog.sh $enhancement_method $modeldir - fi - fi + local/run_gmm.sh --add-enhanced-data $add_enhanced_data \ + --decode-only $gmm_decode_only $enhancement $enhancement_data $chime4_data +fi + +# TDNN based ASR experiment +# Since it takes time to evaluate TDNN, we make the GMM and TDNN scripts separately. +# You may execute it after you would have promising results using GMM-based ASR experiments +if [ $stage -le 4 ]; then + local/chain/run_tdnn.sh --decode-only $tdnn_decode_only $enhancement fi -flatstart=false + # LM-rescoring experiment with 5-gram and RNN LMs # It takes a few days to train a RNNLM. -if [ $stage -le 4 ]; then - if $flatstart; then - if $tdnn; then - local/run_lmrescore_tdnn.sh $chime4_data $enhancement_method - elif $tdnn_lstm; then - local/run_lmrescore_tdnn_lstm.sh $chime4_data $enhancement_method - else - local/run_lmrescore.sh $chime4_data $enhancement_method - fi - else - if $tdnn; then - local/run_lmrescore_tdnn_recog.sh $enhancement_method $modeldir - elif $tdnn_lstm; then - local/run_lmrescore_tdnn_lstm_recog.sh $enhancement_method $modeldir - else - local/run_lmrescore_recog.sh $enhancement_method $modeldir - fi - fi +if [ $stage -le 5 ]; then + local/run_lmrescore_tdnn.sh $chime4_data $enhancement +fi + +# LM-rescoring experiment with LSTM LMs +if [ $stage -le 6 ]; then + local/rnnlm/run_lstm.sh $enhancement fi echo "Done." 
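Each decoding or rescoring step in these recipes writes a best_wer_*.result file under exp/, and the RESULTS files above are essentially collected excerpts of those. An illustrative command (not part of the recipe) for pulling the headline lines together, assuming the relevant stages have finished and that each .result file contains the usual "best overall dt05 WER" summary line:

# Collect the summary line from every result file that exists.
grep -H "best overall dt05 WER" exp/*/best_wer_*.result exp/chain/*/best_wer_*.result 2>/dev/null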
diff --git a/egs/chime5/s5/cmd.sh b/egs/chime5/s5/cmd.sh index a697a22cda3..9702501f1a7 100644 --- a/egs/chime5/s5/cmd.sh +++ b/egs/chime5/s5/cmd.sh @@ -10,6 +10,6 @@ # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. -export train_cmd="queue.pl --mem 2G" +export train_cmd="retry.pl queue.pl --mem 2G" export decode_cmd="queue.pl --mem 4G" diff --git a/egs/chime5/s5/local/chain/run_tdnn.sh b/egs/chime5/s5/local/chain/run_tdnn.sh index 34499362831..61f8f499182 120000 --- a/egs/chime5/s5/local/chain/run_tdnn.sh +++ b/egs/chime5/s5/local/chain/run_tdnn.sh @@ -1 +1 @@ -tuning/run_tdnn_1a.sh \ No newline at end of file +tuning/run_tdnn_1b.sh \ No newline at end of file diff --git a/egs/chime5/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/chime5/s5/local/chain/tuning/run_tdnn_1a.sh index 45a7fd84bd6..f0f469e46c8 100755 --- a/egs/chime5/s5/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/chime5/s5/local/chain/tuning/run_tdnn_1a.sh @@ -24,21 +24,16 @@ decode_iter= # training options # training chunk-options chunk_width=140,100,160 -# we don't need extra left/right context for TDNN systems. -chunk_left_context=0 -chunk_right_context=0 common_egs_dir= xent_regularize=0.1 # training options srand=0 remove_egs=true -reporting_email= #decode options test_online_decoding=false # if true, it will run the last decoding stage. - # End configuration section. echo "$0 $@" # Print the command line for logging @@ -59,7 +54,7 @@ fi # run those things. local/nnet3/run_ivector_common.sh --stage $stage \ --train-set $train_set \ - --test-sets "$test_sets" \ + --test-sets "$test_sets" \ --gmm $gmm \ --nnet3-affix "$nnet3_affix" || exit 1; @@ -133,7 +128,7 @@ if [ $stage -le 13 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) opts="l2-regularize=0.05" output_opts="l2-regularize=0.01 bottleneck-dim=320" @@ -176,7 +171,6 @@ EOF steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ fi - if [ $stage -le 14 ]; then if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then utils/create_split_dir.pl \ @@ -204,15 +198,10 @@ if [ $stage -le 14 ]; then --trainer.num-chunk-per-minibatch=256,128,64 \ --trainer.optimization.momentum=0.0 \ --egs.chunk-width=$chunk_width \ - --egs.chunk-left-context=$chunk_left_context \ - --egs.chunk-right-context=$chunk_right_context \ - --egs.chunk-left-context-initial=0 \ - --egs.chunk-right-context-final=0 \ --egs.dir="$common_egs_dir" \ --egs.opts="--frames-overlap-per-eg 0" \ --cleanup.remove-egs=$remove_egs \ --use-gpu=true \ - --reporting.email="$reporting_email" \ --feat-dir=$train_data_dir \ --tree-dir=$tree_dir \ --lat-dir=$lat_dir \ @@ -235,10 +224,6 @@ if [ $stage -le 16 ]; then ( steps/nnet3/decode.sh \ --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context $chunk_left_context \ - --extra-right-context $chunk_right_context \ - --extra-left-context-initial 0 \ - --extra-right-context-final 0 \ --frames-per-chunk $frames_per_chunk \ --nj 8 --cmd "$decode_cmd" --num-threads 4 \ --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${data}_hires \ diff --git a/egs/chime5/s5/local/chain/tuning/run_tdnn_1b.sh b/egs/chime5/s5/local/chain/tuning/run_tdnn_1b.sh new file mode 100755 index 00000000000..920f2543132 --- /dev/null +++ b/egs/chime5/s5/local/chain/tuning/run_tdnn_1b.sh @@ -0,0 +1,277 @@ +#!/bin/bash + +# This factorized TDNN (TDNN-F) script is ported from s5b recipe +# It uses resnet-style skip connections. +# For details, refer to the paper: +# "Semi-Orthogonal Low-Rank Matrix Factorization for Deep Neural Networks", Daniel Povey, Gaofeng Cheng, Yiming Wang, Ke Li, Hainan Xu, Mahsa Yarmohamadi, Sanjeev Khudanpur, Interspeech 2018 + +# %WER 73.03 [ 43001 / 58881, 4433 ins, 22250 del, 16318 sub ] exp/chain_train_worn_u100k_cleaned/tdnn1b_sp/decode_dev_beamformit_ref/wer_10_0.0 +# %WER 38.88 [ 22895 / 58881, 1882 ins, 8235 del, 12778 sub ] exp/chain_train_worn_u100k_cleaned/tdnn1b_sp/decode_dev_worn/wer_10_0.0 + +# steps/info/chain_dir_info.pl exp/chain_train_worn_u100k_cleaned/tdnn1b_sp +# exp/chain_train_worn_u100k_cleaned/tdnn1b_sp: num-iters=96 nj=3..16 num-params=17.1M dim=40+100->2928 combine=-0.125->-0.125 (over 2) xent:train/valid[63,95,final]=(-2.12,-1.81,-1.82/-2.20,-1.96,-1.96) logprob:train/valid[63,95,final]=(-0.190,-0.126,-0.125/-0.218,-0.183,-0.183) + +set -e + +# configs for 'chain' +stage=0 +nj=96 +train_set=train_worn_u100k +test_sets="dev_worn dev_beamformit_ref" +gmm=tri3 +nnet3_affix=_train_worn_u100k +lm_suffix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=1b # affix for the TDNN directory name +tree_affix= +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +num_epochs=4 +# training options +# training chunk-options +chunk_width=140,100,160 +common_egs_dir= +xent_regularize=0.1 +dropout_schedule='0,0@0.20,0.5@0.50,0' + +# training options +srand=0 +remove_egs=true + +#decode options +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). 
+ # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj ${nj} --cmd "$train_cmd" --generate-ali-from-lats true \ + ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $lat_dir $tree_dir +fi + + +if [ $stage -le 13 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + affine_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true" + tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.01" + output_opts="l2-regularize=0.002" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $affine_opts dim=1536 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf14 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf15 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + linear-component name=prefinal-l dim=256 $linear_opts + + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + 
steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/chime5-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$train_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule="$dropout_schedule" \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.frames-per-iter=1500000 \ + --trainer.optimization.num-jobs-initial=3 \ + --trainer.optimization.num-jobs-final=16 \ + --trainer.optimization.initial-effective-lrate=0.00025 \ + --trainer.optimization.final-effective-lrate=0.000025 \ + --trainer.num-chunk-per-minibatch=64 \ + --egs.stage $get_egs_stage \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang${lm_suffix}/ \ + $tree_dir $tree_dir/graph${lm_suffix} || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + steps/nnet3/decode.sh \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk $frames_per_chunk \ + --nj 8 --cmd "$decode_cmd" --num-threads 4 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${data}_hires \ + $tree_dir/graph${lm_suffix} data/${data}_hires ${dir}/decode${lm_suffix}_${data} || exit 1 + ) || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 +fi + +# Not testing the 'looped' decoding separately, because for +# TDNN systems it would give exactly the same results as the +# normal decoding. + +if $test_online_decoding && [ $stage -le 17 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3${nnet3_affix}/extractor ${dir} ${dir}_online + + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l " + echo "main options (for others, see top of script file)" + echo " --cmd # Command to run in parallel with" + echo " --nj 50 # number of jobs for parallel processing" + exit 1; +fi + +sdir=$1 +odir=$2 +array=$3 +task=`basename $sdir` +expdir=exp/wpe/${task}_${array} +# Set bash to 'debug' mode, it will exit on : +# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', +set -e +set -u +set -o pipefail + +miniconda_dir=$HOME/miniconda3/ +if [ ! 
-d $miniconda_dir ]; then + echo "$miniconda_dir does not exist. Please run '../../../tools/extras/install_miniconda.sh' and '../../../tools/extras/install_wpe.sh';" +fi + +# check if WPE is installed +result=`$HOME/miniconda3/bin/python -c "\ +try: + import nara_wpe + print('1') +except ImportError: + print('0')"` + +if [ "$result" == "1" ]; then + echo "WPE is installed" +else + echo "WPE is not installed. Please run ../../../tools/extras/install_wpe.sh" + exit 1 +fi + +mkdir -p $odir +mkdir -p $expdir/log + +# wavfiles.list can be used as the name of the output files +output_wavfiles=$expdir/wavfiles.list +find -L ${sdir} | grep -i ${array} > $expdir/channels_input +cat $expdir/channels_input | awk -F '/' '{print $NF}' | sed "s@S@$odir\/S@g" > $expdir/channels_output +paste -d" " $expdir/channels_input $expdir/channels_output > $output_wavfiles + +# split the list for parallel processing +split_wavfiles="" +for n in `seq $nj`; do + split_wavfiles="$split_wavfiles $output_wavfiles.$n" +done +utils/split_scp.pl $output_wavfiles $split_wavfiles || exit 1; + +echo -e "Dereverberation - $task - $array\n" +# making a shell script for each job +for n in `seq $nj`; do +cat <<-EOF > $expdir/log/wpe.$n.sh +while read line; do + $HOME/miniconda3/bin/python local/run_wpe.py \ + --file \$line +done < $output_wavfiles.$n +EOF +done + +chmod a+x $expdir/log/wpe.*.sh +$cmd JOB=1:$nj $expdir/log/wpe.JOB.log \ + $expdir/log/wpe.JOB.sh + +echo "`basename $0` Done." diff --git a/egs/chime5/s5/local/score_for_submit.sh b/egs/chime5/s5/local/score_for_submit.sh index 5502c5994e5..23121d68b93 100755 --- a/egs/chime5/s5/local/score_for_submit.sh +++ b/egs/chime5/s5/local/score_for_submit.sh @@ -43,7 +43,7 @@ for session in S02 S09; do # get nerror nerr=`grep "\#csid" $score_result | grep $room | grep $session | awk '{sum+=$4+$5+$6} END {print sum}'` # get nwords from references (NF-2 means to exclude utterance id and " ref ") - nwrd=`grep " ref " $score_result | grep $room | grep $session | sed -e "s/\*//g" | awk '{sum+=NF-2} END {print sum}'` + nwrd=`grep "\#csid" $score_result | grep $room | grep $session | awk '{sum+=$3+$4+$6} END {print sum}'` # compute wer with scale=2 wer=`echo "scale=2; 100 * $nerr / $nwrd" | bc` @@ -59,7 +59,7 @@ echo -n "overall: " # get nerror nerr=`grep "\#csid" $score_result | awk '{sum+=$4+$5+$6} END {print sum}'` # get nwords from references (NF-2 means to exclude utterance id and " ref ") -nwrd=`grep " ref " $score_result | sed -e "s/\*//g" | awk '{sum+=NF-2} END {print sum}'` +nwrd=`grep "\#csid" $score_result | awk '{sum+=$3+$4+$6} END {print sum}'` # compute wer with scale=2 wer=`echo "scale=2; 100 * $nerr / $nwrd" | bc` echo -n "#words $nwrd, " @@ -81,7 +81,7 @@ for session in S01 S21; do # get nerror nerr=`grep "\#csid" $score_result | grep $room | grep $session | awk '{sum+=$4+$5+$6} END {print sum}'` # get nwords from references (NF-2 means to exclude utterance id and " ref ") - nwrd=`grep " ref " $score_result | grep $room | grep $session | sed -e "s/\*//g" | awk '{sum+=NF-2} END {print sum}'` + nwrd=`grep "\#csid" $score_result | grep $room | grep $session | awk '{sum+=$3+$4+$6} END {print sum}'` # compute wer with scale=2 wer=`echo "scale=2; 100 * $nerr / $nwrd" | bc` @@ -98,7 +98,7 @@ if $do_eval; then # get nerror nerr=`grep "\#csid" $score_result | awk '{sum+=$4+$5+$6} END {print sum}'` # get nwords from references (NF-2 means to exclude utterance id and " ref ") - nwrd=`grep " ref " $score_result | sed -e "s/\*//g" | awk '{sum+=NF-2} END {print sum}'` + nwrd=`grep 
"\#csid" $score_result | awk '{sum+=$3+$4+$6} END {print sum}'` # compute wer with scale=2 wer=`echo "scale=2; 100 * $nerr / $nwrd" | bc` echo -n "overall: " diff --git a/egs/chime5/s5b/RESULTS b/egs/chime5/s5b/RESULTS new file mode 100644 index 00000000000..0dcea1f0031 --- /dev/null +++ b/egs/chime5/s5b/RESULTS @@ -0,0 +1,33 @@ + +# tri2 +%WER 76.40 [ 44985 / 58881, 3496 ins, 17652 del, 23837 sub ] exp/tri2/decode_dev_worn/wer_13_1.0 +%WER 93.56 [ 55091 / 58881, 2132 ins, 35555 del, 17404 sub ] exp/tri2/decode_dev_beamformit_ref/wer_17_1.0 + +# tri3 +%WER 72.81 [ 42869 / 58881, 3629 ins, 15998 del, 23242 sub ] exp/tri3/decode_dev_worn/wer_15_1.0 +%WER 91.73 [ 54013 / 58881, 3519 ins, 27098 del, 23396 sub ] exp/tri3/decode_dev_beamformit_ref/wer_17_1.0 + +# nnet3 tdnn+chain +%WER 47.91 [ 28212 / 58881, 2843 ins, 8957 del, 16412 sub ] exp/chain_train_worn_u100k_cleaned/tdnn1a_sp/decode_dev_worn/wer_9_0.0 +%WER 81.28 [ 47859 / 58881, 4210 ins, 27511 del, 16138 sub ] exp/chain_train_worn_u100k_cleaned/tdnn1a_sp/decode_dev_beamformit_ref/wer_9_0.5 + +# result with the challenge submission format (July 9, 2018) +# before the fix of speaker ID across arrays +session S02 room DINING: #words 8288, #errors 6593, wer 79.54 % +session S02 room KITCHEN: #words 12696, #errors 11096, wer 87.39 % +session S02 room LIVING: #words 15460, #errors 12219, wer 79.03 % +session S09 room DINING: #words 5766, #errors 4651, wer 80.66 % +session S09 room KITCHEN: #words 8911, #errors 7277, wer 81.66 % +session S09 room LIVING: #words 7760, #errors 6023, wer 77.61 % +overall: #words 58881, #errors 47859, wer 81.28 % + +# result with the challenge submission format (July 9, 2018) +# after the fix of speaker ID across arrays +==== development set ==== +session S02 room DINING: #words 8288, #errors 6556, wer 79.10 % +session S02 room KITCHEN: #words 12696, #errors 11096, wer 87.39 % +session S02 room LIVING: #words 15460, #errors 12182, wer 78.79 % +session S09 room DINING: #words 5766, #errors 4648, wer 80.61 % +session S09 room KITCHEN: #words 8911, #errors 7277, wer 81.66 % +session S09 room LIVING: #words 7760, #errors 6022, wer 77.60 % +overall: #words 58881, #errors 47781, wer 81.14 % diff --git a/egs/chime5/s5b/cmd.sh b/egs/chime5/s5b/cmd.sh new file mode 100644 index 00000000000..9702501f1a7 --- /dev/null +++ b/egs/chime5/s5b/cmd.sh @@ -0,0 +1,15 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
+ +export train_cmd="retry.pl queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" + diff --git a/egs/chime5/s5b/conf/beamformit.cfg b/egs/chime5/s5b/conf/beamformit.cfg new file mode 100755 index 00000000000..70fdd858651 --- /dev/null +++ b/egs/chime5/s5b/conf/beamformit.cfg @@ -0,0 +1,50 @@ +#BeamformIt sample configuration file for AMI data (http://groups.inf.ed.ac.uk/ami/download/) + +# scrolling size to compute the delays +scroll_size = 250 + +# cross correlation computation window size +window_size = 500 + +#amount of maximum points for the xcorrelation taken into account +nbest_amount = 4 + +#flag wether to apply an automatic noise thresholding +do_noise_threshold = 1 + +#Percentage of frames with lower xcorr taken as noisy +noise_percent = 10 + +######## acoustic modelling parameters + +#transition probabilities weight for multichannel decoding +trans_weight_multi = 25 +trans_weight_nbest = 25 + +### + +#flag wether to print the feaures after setting them, or not +print_features = 1 + +#flag wether to use the bad frames in the sum process +do_avoid_bad_frames = 1 + +#flag to use the best channel (SNR) as a reference +#defined from command line +do_compute_reference = 1 + +#flag wether to use a uem file or not(process all the file) +do_use_uem_file = 0 + +#flag wether to use an adaptative weights scheme or fixed weights +do_adapt_weights = 1 + +#flag wether to output the sph files or just run the system to create the auxiliary files +do_write_sph_files = 1 + +####directories where to store/retrieve info#### +#channels_file = ./cfg-files/channels + +#show needs to be passed as argument normally, here a default one is given just in case +#show_id = Ttmp + diff --git a/egs/chime5/s5b/conf/mfcc.conf b/egs/chime5/s5b/conf/mfcc.conf new file mode 100644 index 00000000000..32988403b00 --- /dev/null +++ b/egs/chime5/s5b/conf/mfcc.conf @@ -0,0 +1,2 @@ +--use-energy=false +--sample-frequency=16000 diff --git a/egs/chime5/s5b/conf/mfcc_hires.conf b/egs/chime5/s5b/conf/mfcc_hires.conf new file mode 100644 index 00000000000..fd64b62eb16 --- /dev/null +++ b/egs/chime5/s5b/conf/mfcc_hires.conf @@ -0,0 +1,10 @@ +# config for high-resolution MFCC features, intended for neural network training. +# Note: we keep all cepstra, so it has the same info as filterbank features, +# but MFCC is more easily compressible (because less correlated) which is why +# we prefer this method. +--use-energy=false # use average of log energy, not energy. 
+--sample-frequency=16000 +--num-mel-bins=40 +--num-ceps=40 +--low-freq=40 +--high-freq=-400 diff --git a/egs/chime5/s5b/conf/online_cmvn.conf b/egs/chime5/s5b/conf/online_cmvn.conf new file mode 100644 index 00000000000..7748a4a4dd3 --- /dev/null +++ b/egs/chime5/s5b/conf/online_cmvn.conf @@ -0,0 +1 @@ +# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh diff --git a/egs/chime5/s5b/local/chain/run_tdnn.sh b/egs/chime5/s5b/local/chain/run_tdnn.sh new file mode 120000 index 00000000000..34499362831 --- /dev/null +++ b/egs/chime5/s5b/local/chain/run_tdnn.sh @@ -0,0 +1 @@ +tuning/run_tdnn_1a.sh \ No newline at end of file diff --git a/egs/chime5/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1a.sh b/egs/chime5/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1a.sh new file mode 100755 index 00000000000..95e9d934bd3 --- /dev/null +++ b/egs/chime5/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1a.sh @@ -0,0 +1,304 @@ +#!/bin/bash + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=96 +train_set=train_worn_u400k_cleaned +test_sets="dev_beamformit_ref" +gmm=tri3_cleaned +nnet3_affix=_train_worn_u400k_cleaned +lm_suffix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=_1a # affix for the TDNN directory name +tree_affix= +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +common_egs_dir= + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=2 # 2 works better than 4 +chunk_width=140,100,160 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +# training options +srand=0 +remove_egs=true + +#decode options +test_online_decoding=false # if true, it will run the last decoding stage. + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj ${nj} --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + +if [ $stage -le 13 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + idct-layer name=idct input=input dim=40 cepstral-lifter=22 affine-transform-file=$dir/configs/idct.mat + + conv-relu-batchnorm-layer name=cnn1 input=idct height-in=40 height-out=20 height-subsample-out=2 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=256 learning-rate-factor=0.333 max-change=0.25 + conv-relu-batchnorm-layer name=cnn2 input=cnn1 height-in=20 height-out=20 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=128 + + relu-batchnorm-layer name=affine1 input=lda dim=512 + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 input=cnn2 dim=1024 + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,affine1) dim=1024 + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + fast-lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 + fast-lstmp-layer name=lstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=1024 + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=1024 + fast-lstmp-layer name=lstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
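  # (Editor's note, not part of this patch.) With the xent_regularize=0.025 set
  # at the top of this script, the factor described above works out to
  # learning_rate_factor = 0.5 / 0.025 = 20.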
+ output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/chime5-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$train_cmd --mem 4G" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.srand=$srand \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial=3 \ + --trainer.optimization.num-jobs-final=16 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.momentum=0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts="--frames-overlap-per-eg 0" \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --cleanup.remove-egs=$remove_egs \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang${lm_suffix}/ \ + $tree_dir $tree_dir/graph${lm_suffix} || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + steps/nnet3/decode.sh \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj 8 --cmd "$decode_cmd" --num-threads 4 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${data}_hires \ + $tree_dir/graph${lm_suffix} data/${data}_hires ${dir}/decode${lm_suffix}_${data} || exit 1 + ) || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 +fi + +# Not testing the 'looped' decoding separately, because for +# TDNN systems it would give exactly the same results as the +# normal decoding. + +if $test_online_decoding && [ $stage -le 17 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. 
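  # (Editor's note, a rough sketch of what this stage does; details may vary by
  # Kaldi version and are not part of this patch.) prepare_online_decoding.sh
  # bundles the trained model with the i-vector extractor from
  # exp/nnet3${nnet3_affix}/extractor into ${dir}_online and writes an online
  # decoding configuration there, so the decoding run in this stage can start
  # from raw wav input instead of precomputed features and i-vectors.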
+ steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3${nnet3_affix}/extractor ${dir} ${dir}_online + + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l $lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj ${nj} --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 13 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.05" + output_opts="l2-regularize=0.01 bottleneck-dim=320" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 $opts dim=512 + relu-batchnorm-layer name=tdnn2 $opts dim=512 input=Append(-1,0,1) + relu-batchnorm-layer name=tdnn3 $opts dim=512 + relu-batchnorm-layer name=tdnn4 $opts dim=512 input=Append(-1,0,1) + relu-batchnorm-layer name=tdnn5 $opts dim=512 + relu-batchnorm-layer name=tdnn6 $opts dim=512 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn7 $opts dim=512 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn8 $opts dim=512 input=Append(-6,-3,0) + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain $opts dim=512 target-rms=0.5 + output-layer name=output include-log-softmax=false $output_opts dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
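  # (Editor's note, not part of this patch.) This second output is used only as
  # a training-time regularizer; decoding uses the 'output' layer of the chain
  # branch above, so the xent branch is not evaluated at test time.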
+ relu-batchnorm-layer name=prefinal-xent input=tdnn8 $opts dim=512 target-rms=0.5 + output-layer name=output-xent $output_opts dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/chime5-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=10 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=4 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=256,128,64 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang${lm_suffix}/ \ + $tree_dir $tree_dir/graph${lm_suffix} || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + steps/nnet3/decode.sh \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk $frames_per_chunk \ + --nj 8 --cmd "$decode_cmd" --num-threads 4 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${data}_hires \ + $tree_dir/graph${lm_suffix} data/${data}_hires ${dir}/decode${lm_suffix}_${data} || exit 1 + ) || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 +fi + +# Not testing the 'looped' decoding separately, because for +# TDNN systems it would give exactly the same results as the +# normal decoding. + +if $test_online_decoding && [ $stage -le 17 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. 
+ steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3${nnet3_affix}/extractor ${dir} ${dir}_online + + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l 2792 combine=-0.149->-0.149 (over 2) xent:train/valid[210,316,final]=(-2.50,-1.99,-2.00/-2.36,-1.95,-1.95) logprob:train/valid[210,316,final]=(-0.228,-0.136,-0.136/-0.223,-0.156,-0.155) + +set -e + +# configs for 'chain' +stage=0 +nj=96 +train_set=train_worn_u400k +test_sets="dev_worn dev_beamformit_ref" +gmm=tri3 +nnet3_affix=_train_worn_u400k +lm_suffix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=1b # affix for the TDNN directory name +tree_affix= +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +num_epochs=4 +common_egs_dir= +# training options +# training chunk-options +chunk_width=140,100,160 +xent_regularize=0.1 +dropout_schedule='0,0@0.20,0.5@0.50,0' + +# training options +srand=0 +remove_egs=true + +#decode options +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj ${nj} --cmd "$train_cmd" --generate-ali-from-lats true \ + ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $lat_dir $tree_dir +fi + +if [ $stage -le 13 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + affine_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true" + tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.01" + output_opts="l2-regularize=0.002" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $affine_opts dim=1536 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf14 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf15 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + linear-component name=prefinal-l dim=256 $linear_opts + + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/chime5-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$train_cmd --mem 4G" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule "$dropout_schedule" \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.00025 \ + --trainer.optimization.final-effective-lrate 0.000025 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang${lm_suffix}/ \ + $tree_dir $tree_dir/graph${lm_suffix} || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + steps/nnet3/decode.sh \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk $frames_per_chunk \ + --nj 8 --cmd "$decode_cmd" --num-threads 4 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${data}_hires \ + $tree_dir/graph${lm_suffix} data/${data}_hires ${dir}/decode${lm_suffix}_${data} || exit 1 + ) || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 +fi + +exit 0; diff --git a/egs/chime5/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/chime5/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh new file mode 100755 index 00000000000..e3d8e6ac4dc --- /dev/null +++ b/egs/chime5/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh @@ -0,0 +1,297 @@ +#!/bin/bash + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=96 +train_set=train_worn_u400k_cleaned +test_sets="dev_worn dev_beamformit_ref" +gmm=tri3_cleaned +nnet3_affix=_train_worn_u400k_cleaned +lm_suffix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. 
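# (Editor's note, not part of this patch.) Because this model contains
# recurrent (fast-lstmp) layers, the chunk_left_context / extra_left_context
# settings below do matter here: they are passed to the egs dumping in stage 14
# and to decode.sh in stage 16, unlike the plain TDNN(-F) recipes in this
# patch, where those flags are removed.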
+affix=_1a # affix for the TDNN directory name +tree_affix= +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +common_egs_dir= + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=2 # 2 works better than 4 +chunk_width=140,100,160 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +# training options +srand=0 +remove_egs=true + +#decode options +test_online_decoding=false # if true, it will run the last decoding stage. + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj ${nj} --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + +if [ $stage -le 13 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + 
relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm4 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/chime5-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$train_cmd --mem 4G" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.srand=$srand \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial=3 \ + --trainer.optimization.num-jobs-final=16 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.momentum=0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts="--frames-overlap-per-eg 0" \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --cleanup.remove-egs=$remove_egs \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). 
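  # (Editor's note, not part of this patch.) mkgraph.sh composes the decoding
  # graph (HCLG.fst) under $tree_dir/graph${lm_suffix}; the --self-loop-scale 1.0
  # used here goes together with the --acwt 1.0 --post-decode-acwt 10.0 options
  # of the chain-model decoding in the next stage.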
+ utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang${lm_suffix}/ \ + $tree_dir $tree_dir/graph${lm_suffix} || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + steps/nnet3/decode.sh \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj 8 --cmd "$decode_cmd" --num-threads 4 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${data}_hires \ + $tree_dir/graph${lm_suffix} data/${data}_hires ${dir}/decode${lm_suffix}_${data} || exit 1 + ) || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 +fi + +# Not testing the 'looped' decoding separately, because for +# TDNN systems it would give exactly the same results as the +# normal decoding. + +if $test_online_decoding && [ $stage -le 17 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3${nnet3_affix}/extractor ${dir} ${dir}_online + + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l ) + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + +[ -f ./path.sh ] && . ./path.sh + +command -v uconv &>/dev/null \ + || { echo >&2 "uconv not found on PATH. You will have to install ICU4C"; exit 1; } + +command -v ngram &>/dev/null \ + || { echo >&2 "srilm not found on PATH. Please use the script $KALDI_ROOT/tools/extras/install_srilm.sh to install it"; exit 1; } + +if [ -z ${LIBLBFGS} ]; then + echo >&2 "SRILM is not compiled with the support of MaxEnt models." + echo >&2 "You should use the script in \$KALDI_ROOT/tools/install_srilm.sh" + echo >&2 "which will take care of compiling the SRILM with MaxEnt support" + exit 1; +fi + +sox=`command -v sox 2>/dev/null` \ + || { echo >&2 "sox not found on PATH. Please install it manually (you will need version 14.4.0 and higher)."; exit 1; } + +# If sox is found on path, check if the version is correct +if [ ! -z "$sox" ]; then + sox_version=`$sox --version 2>&1| head -1 | sed -e 's?.*: ??' -e 's?.* ??'` + if [[ ! $sox_version =~ v14.4.* ]]; then + echo "Unsupported sox version $sox_version found on path. You will need version v14.4.0 and higher." + exit 1 + fi +fi + +command -v phonetisaurus-align &>/dev/null \ + || { echo >&2 "Phonetisaurus not found on PATH. Please use the script $KALDI_ROOT/tools/extras/install_phonetisaurus.sh to install it"; exit 1; } + +command -v BeamformIt &>/dev/null \ + || { echo >&2 "BeamformIt not found on PATH. 
Please use the script $KALDI_ROOT/tools/extras/install_beamformit.sh to install it"; exit 1; } + +miniconda_dir=$HOME/miniconda3/ +if [ ! -d $miniconda_dir ]; then + echo "$miniconda_dir does not exist. Please run '../../../tools/extras/install_miniconda.sh'" +fi + +# check if WPE is installed +result=`$miniconda_dir/bin/python -c "\ +try: + import nara_wpe + print('1') +except ImportError: + print('0')"` + +if [ "$result" == "1" ]; then + echo "WPE is installed" +else + echo "WPE is not installed. Please run ../../../tools/extras/install_wpe.sh" + exit 1 +fi + +exit 0 diff --git a/egs/chime5/s5b/local/copy_lat_dir_parallel.sh b/egs/chime5/s5b/local/copy_lat_dir_parallel.sh new file mode 100755 index 00000000000..82839604c9e --- /dev/null +++ b/egs/chime5/s5b/local/copy_lat_dir_parallel.sh @@ -0,0 +1,97 @@ +#!/bin/bash + +cmd=queue.pl +nj=40 +stage=0 +speed_perturb=true + +. ./path.sh +. utils/parse_options.sh + +if [ $# -ne 4 ]; then + echo "Usage: $0 " + exit 1 +fi + +utt_map=$1 +data=$2 +srcdir=$3 +dir=$4 + +mkdir -p $dir + +cp $srcdir/{phones.txt,tree,final.mdl} $dir || exit 1 +cp $srcdir/{final.alimdl,final.occs,splice_opts,cmvn_opts,delta_opts,final.mat,full.mat} 2>/dev/null || true + +nj_src=$(cat $srcdir/num_jobs) || exit 1 + +if [ $stage -le 1 ]; then + $cmd JOB=1:$nj_src $dir/log/copy_lats_orig.JOB.log \ + lattice-copy "ark:gunzip -c $srcdir/lat.JOB.gz |" \ + ark,scp:$dir/lat_orig.JOB.ark,$dir/lat_orig.JOB.scp || exit 1 +fi + +for n in $(seq $nj_src); do + cat $dir/lat_orig.$n.scp +done > $dir/lat_orig.scp || exit 1 + +if $speed_perturb; then + for s in 0.9 1.1; do + awk -v s=$s '{print "sp"s"-"$1" sp"s"-"$2}' $utt_map + done | cat - $utt_map | sort -k1,1 > $dir/utt_map + utt_map=$dir/utt_map +fi + +if [ $stage -le 2 ]; then + utils/filter_scp.pl -f 2 $dir/lat_orig.scp < $utt_map | \ + utils/apply_map.pl -f 2 $dir/lat_orig.scp > \ + $dir/lat.scp || exit 1 + + if [ ! -s $dir/lat.scp ]; then + echo "$0: $dir/lat.scp is empty. Something went wrong!" + exit 1 + fi +fi + +utils/split_data.sh $data $nj + +if [ $stage -le 3 ]; then + $cmd JOB=1:$nj $dir/log/copy_lats.JOB.log \ + lattice-copy "scp:utils/filter_scp.pl $data/split$nj/JOB/utt2spk $dir/lat.scp |" \ + "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1 +fi + +echo $nj > $dir/num_jobs + +if [ -f $srcdir/ali.1.gz ]; then + if [ $stage -le 4 ]; then + $cmd JOB=1:$nj_src $dir/log/copy_ali_orig.JOB.log \ + copy-int-vector "ark:gunzip -c $srcdir/ali.JOB.gz |" \ + ark,scp:$dir/ali_orig.JOB.ark,$dir/ali_orig.JOB.scp || exit 1 + fi + + for n in $(seq $nj_src); do + cat $dir/ali_orig.$n.scp + done > $dir/ali_orig.scp || exit 1 + + if [ $stage -le 5 ]; then + utils/filter_scp.pl -f 2 $dir/ali_orig.scp < $utt_map | \ + utils/apply_map.pl -f 2 $dir/ali_orig.scp > \ + $dir/ali.scp || exit 1 + + if [ ! -s $dir/ali.scp ]; then + echo "$0: $dir/ali.scp is empty. Something went wrong!" 
+ exit 1 + fi + fi + + utils/split_data.sh $data $nj + + if [ $stage -le 6 ]; then + $cmd JOB=1:$nj $dir/log/copy_ali.JOB.log \ + copy-int-vector "scp:utils/filter_scp.pl $data/split$nj/JOB/utt2spk $dir/ali.scp |" \ + "ark:|gzip -c > $dir/ali.JOB.gz" || exit 1 + fi +fi + +rm $dir/lat_orig.*.{ark,scp} $dir/ali_orig.*.{ark,scp} 2>/dev/null || true diff --git a/egs/chime5/s5b/local/distant_audio_list b/egs/chime5/s5b/local/distant_audio_list new file mode 100644 index 00000000000..fc7aff15cd0 --- /dev/null +++ b/egs/chime5/s5b/local/distant_audio_list @@ -0,0 +1,376 @@ +S03_U01.CH1 +S03_U01.CH2 +S03_U01.CH3 +S03_U01.CH4 +S03_U02.CH1 +S03_U02.CH2 +S03_U02.CH3 +S03_U02.CH4 +S03_U03.CH1 +S03_U03.CH2 +S03_U03.CH3 +S03_U03.CH4 +S03_U04.CH1 +S03_U04.CH2 +S03_U04.CH3 +S03_U04.CH4 +S03_U05.CH1 +S03_U05.CH2 +S03_U05.CH3 +S03_U05.CH4 +S03_U06.CH1 +S03_U06.CH2 +S03_U06.CH3 +S03_U06.CH4 +S04_U01.CH1 +S04_U01.CH2 +S04_U01.CH3 +S04_U01.CH4 +S04_U02.CH1 +S04_U02.CH2 +S04_U02.CH3 +S04_U02.CH4 +S04_U03.CH1 +S04_U03.CH2 +S04_U03.CH3 +S04_U03.CH4 +S04_U04.CH1 +S04_U04.CH2 +S04_U04.CH3 +S04_U04.CH4 +S04_U05.CH1 +S04_U05.CH2 +S04_U05.CH3 +S04_U05.CH4 +S04_U06.CH1 +S04_U06.CH2 +S04_U06.CH3 +S04_U06.CH4 +S05_U01.CH1 +S05_U01.CH2 +S05_U01.CH3 +S05_U01.CH4 +S05_U02.CH1 +S05_U02.CH2 +S05_U02.CH3 +S05_U02.CH4 +S05_U04.CH1 +S05_U04.CH2 +S05_U04.CH3 +S05_U04.CH4 +S05_U05.CH1 +S05_U05.CH2 +S05_U05.CH3 +S05_U05.CH4 +S05_U06.CH1 +S05_U06.CH2 +S05_U06.CH3 +S05_U06.CH4 +S06_U01.CH1 +S06_U01.CH2 +S06_U01.CH3 +S06_U01.CH4 +S06_U02.CH1 +S06_U02.CH2 +S06_U02.CH3 +S06_U02.CH4 +S06_U03.CH1 +S06_U03.CH2 +S06_U03.CH3 +S06_U03.CH4 +S06_U04.CH1 +S06_U04.CH2 +S06_U04.CH3 +S06_U04.CH4 +S06_U05.CH1 +S06_U05.CH2 +S06_U05.CH3 +S06_U05.CH4 +S06_U06.CH1 +S06_U06.CH2 +S06_U06.CH3 +S06_U06.CH4 +S07_U01.CH1 +S07_U01.CH2 +S07_U01.CH3 +S07_U01.CH4 +S07_U02.CH1 +S07_U02.CH2 +S07_U02.CH3 +S07_U02.CH4 +S07_U03.CH1 +S07_U03.CH2 +S07_U03.CH3 +S07_U03.CH4 +S07_U04.CH1 +S07_U04.CH2 +S07_U04.CH3 +S07_U04.CH4 +S07_U05.CH1 +S07_U05.CH2 +S07_U05.CH3 +S07_U05.CH4 +S07_U06.CH1 +S07_U06.CH2 +S07_U06.CH3 +S07_U06.CH4 +S08_U01.CH1 +S08_U01.CH2 +S08_U01.CH3 +S08_U01.CH4 +S08_U02.CH1 +S08_U02.CH2 +S08_U02.CH3 +S08_U02.CH4 +S08_U03.CH1 +S08_U03.CH2 +S08_U03.CH3 +S08_U03.CH4 +S08_U04.CH1 +S08_U04.CH2 +S08_U04.CH3 +S08_U04.CH4 +S08_U05.CH1 +S08_U05.CH2 +S08_U05.CH3 +S08_U05.CH4 +S08_U06.CH1 +S08_U06.CH2 +S08_U06.CH3 +S08_U06.CH4 +S12_U01.CH1 +S12_U01.CH2 +S12_U01.CH3 +S12_U01.CH4 +S12_U02.CH1 +S12_U02.CH2 +S12_U02.CH3 +S12_U02.CH4 +S12_U03.CH1 +S12_U03.CH2 +S12_U03.CH3 +S12_U03.CH4 +S12_U04.CH1 +S12_U04.CH2 +S12_U04.CH3 +S12_U04.CH4 +S12_U05.CH1 +S12_U05.CH2 +S12_U05.CH3 +S12_U05.CH4 +S12_U06.CH1 +S12_U06.CH2 +S12_U06.CH3 +S12_U06.CH4 +S13_U01.CH1 +S13_U01.CH2 +S13_U01.CH3 +S13_U01.CH4 +S13_U02.CH1 +S13_U02.CH2 +S13_U02.CH3 +S13_U02.CH4 +S13_U03.CH1 +S13_U03.CH2 +S13_U03.CH3 +S13_U03.CH4 +S13_U04.CH1 +S13_U04.CH2 +S13_U04.CH3 +S13_U04.CH4 +S13_U05.CH1 +S13_U05.CH2 +S13_U05.CH3 +S13_U05.CH4 +S13_U06.CH1 +S13_U06.CH2 +S13_U06.CH3 +S13_U06.CH4 +S16_U01.CH1 +S16_U01.CH2 +S16_U01.CH3 +S16_U01.CH4 +S16_U02.CH1 +S16_U02.CH2 +S16_U02.CH3 +S16_U02.CH4 +S16_U03.CH1 +S16_U03.CH2 +S16_U03.CH3 +S16_U03.CH4 +S16_U04.CH1 +S16_U04.CH2 +S16_U04.CH3 +S16_U04.CH4 +S16_U05.CH1 +S16_U05.CH2 +S16_U05.CH3 +S16_U05.CH4 +S16_U06.CH1 +S16_U06.CH2 +S16_U06.CH3 +S16_U06.CH4 +S17_U01.CH1 +S17_U01.CH2 +S17_U01.CH3 +S17_U01.CH4 +S17_U02.CH1 +S17_U02.CH2 +S17_U02.CH3 +S17_U02.CH4 +S17_U03.CH1 +S17_U03.CH2 +S17_U03.CH3 +S17_U03.CH4 +S17_U04.CH1 +S17_U04.CH2 +S17_U04.CH3 +S17_U04.CH4 +S17_U05.CH1 
+S17_U05.CH2 +S17_U05.CH3 +S17_U05.CH4 +S17_U06.CH1 +S17_U06.CH2 +S17_U06.CH3 +S17_U06.CH4 +S18_U01.CH1 +S18_U01.CH2 +S18_U01.CH3 +S18_U01.CH4 +S18_U02.CH1 +S18_U02.CH2 +S18_U02.CH3 +S18_U02.CH4 +S18_U03.CH1 +S18_U03.CH2 +S18_U03.CH3 +S18_U03.CH4 +S18_U04.CH1 +S18_U04.CH2 +S18_U04.CH3 +S18_U04.CH4 +S18_U05.CH1 +S18_U05.CH2 +S18_U05.CH3 +S18_U05.CH4 +S18_U06.CH1 +S18_U06.CH2 +S18_U06.CH3 +S18_U06.CH4 +S19_U01.CH1 +S19_U01.CH2 +S19_U01.CH3 +S19_U01.CH4 +S19_U02.CH1 +S19_U02.CH2 +S19_U02.CH3 +S19_U02.CH4 +S19_U03.CH1 +S19_U03.CH2 +S19_U03.CH3 +S19_U03.CH4 +S19_U04.CH1 +S19_U04.CH2 +S19_U04.CH3 +S19_U04.CH4 +S19_U05.CH1 +S19_U05.CH2 +S19_U05.CH3 +S19_U05.CH4 +S19_U06.CH1 +S19_U06.CH2 +S19_U06.CH3 +S19_U06.CH4 +S20_U01.CH1 +S20_U01.CH2 +S20_U01.CH3 +S20_U01.CH4 +S20_U02.CH1 +S20_U02.CH2 +S20_U02.CH3 +S20_U02.CH4 +S20_U03.CH1 +S20_U03.CH2 +S20_U03.CH3 +S20_U03.CH4 +S20_U04.CH1 +S20_U04.CH2 +S20_U04.CH3 +S20_U04.CH4 +S20_U05.CH1 +S20_U05.CH2 +S20_U05.CH3 +S20_U05.CH4 +S20_U06.CH1 +S20_U06.CH2 +S20_U06.CH3 +S20_U06.CH4 +S22_U01.CH1 +S22_U01.CH2 +S22_U01.CH3 +S22_U01.CH4 +S22_U02.CH1 +S22_U02.CH2 +S22_U02.CH3 +S22_U02.CH4 +S22_U04.CH1 +S22_U04.CH2 +S22_U04.CH3 +S22_U04.CH4 +S22_U05.CH1 +S22_U05.CH2 +S22_U05.CH3 +S22_U05.CH4 +S22_U06.CH1 +S22_U06.CH2 +S22_U06.CH3 +S22_U06.CH4 +S23_U01.CH1 +S23_U01.CH2 +S23_U01.CH3 +S23_U01.CH4 +S23_U02.CH1 +S23_U02.CH2 +S23_U02.CH3 +S23_U02.CH4 +S23_U03.CH1 +S23_U03.CH2 +S23_U03.CH3 +S23_U03.CH4 +S23_U04.CH1 +S23_U04.CH2 +S23_U04.CH3 +S23_U04.CH4 +S23_U05.CH1 +S23_U05.CH2 +S23_U05.CH3 +S23_U05.CH4 +S23_U06.CH1 +S23_U06.CH2 +S23_U06.CH3 +S23_U06.CH4 +S24_U01.CH1 +S24_U01.CH2 +S24_U01.CH3 +S24_U01.CH4 +S24_U02.CH1 +S24_U02.CH2 +S24_U02.CH3 +S24_U02.CH4 +S24_U03.CH1 +S24_U03.CH2 +S24_U03.CH3 +S24_U03.CH4 +S24_U04.CH1 +S24_U04.CH2 +S24_U04.CH3 +S24_U04.CH4 +S24_U05.CH1 +S24_U05.CH2 +S24_U05.CH3 +S24_U05.CH4 +S24_U06.CH1 +S24_U06.CH2 +S24_U06.CH3 +S24_U06.CH4 diff --git a/egs/chime5/s5b/local/extract_noises.py b/egs/chime5/s5b/local/extract_noises.py new file mode 100755 index 00000000000..f7b7f752d9e --- /dev/null +++ b/egs/chime5/s5b/local/extract_noises.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python3 + +import argparse +import json +import logging +import os +import sys +import scipy.io.wavfile as siw +import math +import numpy as np + + +def get_args(): + parser = argparse.ArgumentParser( + """Extract noises from the corpus based on the non-speech regions. + e.g. {} /export/corpora4/CHiME5/audio/train/ \\ + /export/corpora4/CHiME5/transcriptions/train/ \\ + /export/b05/zhiqiw/noise/""".format(sys.argv[0])) + + parser.add_argument("--segment-length", default=20) + parser.add_argument("audio_dir", help="""Location of the CHiME5 Audio files. e.g. /export/corpora4/CHiME5/audio/train/""") + parser.add_argument("trans_dir", help="""Location of the CHiME5 Transcriptions. e.g. /export/corpora4/CHiME5/transcriptions/train/""") + parser.add_argument("audio_list", help="""List of ids of the CHiME5 recordings from which noise is extracted. e.g. local/distant_audio_list""") + parser.add_argument("out_dir", help="Output directory to write noise files. e.g. 
/export/b05/zhiqiw/noise/") + + args = parser.parse_args() + return args + + +def Trans_time(time, fs): + units = time.split(':') + time_second = float(units[0]) * 3600 + float(units[1]) * 60 + float(units[2]) + return int(time_second*fs) + + +def Get_time(conf, tag, mic, fs): + for i in conf: + st = Trans_time(i['start_time'][mic], fs) + ed = Trans_time(i['end_time'][mic], fs) + tag[st:ed] = 0 + return tag + + +def write_noise(out_dir, seg, audio, sig, tag, fs, cnt): + sig_noise = sig[np.nonzero(tag)] + for i in range(math.floor(len(sig_noise)/(seg*fs))): + siw.write(out_dir +'/noise'+str(cnt)+'.wav', fs, sig_noise[i*seg*fs:(i+1)*seg*fs]) + cnt += 1 + return cnt + + +def main(): + args = get_args() + + if not os.path.exists(args.out_dir): + os.makedirs(args.out_dir) + + wav_list = open(args.audio_list).readlines() + + cnt = 1 + for i, audio in enumerate(wav_list): + parts = audio.strip().split('.') + if len(parts) == 2: + # Assuming distant mic with name like S03_U01.CH1 + session, mic = parts[0].split('_') + channel = parts[1] + base_name = session + "_" + mic + "." + channel + else: + # Assuming close talk mic with name like S03_P09 + session, mic = audio.strip().split('_') + base_name = session + "_" + mic + fs, sig = siw.read(args.audio_dir + "/" + base_name + '.wav') + tag = np.ones(len(sig)) + if i == 0 or session != session_p: + with open(args.trans_dir + "/" + session + '.json') as f: + conf = json.load(f) + tag = Get_time(conf, tag, mic, fs) + cnt = write_noise(args.out_dir, args.segment_length, audio, sig, tag, fs, cnt) + session_p = session + + +if __name__ == '__main__': + main() diff --git a/egs/chime5/s5b/local/extract_vad_weights.sh b/egs/chime5/s5b/local/extract_vad_weights.sh new file mode 100755 index 00000000000..250b021bd8f --- /dev/null +++ b/egs/chime5/s5b/local/extract_vad_weights.sh @@ -0,0 +1,86 @@ +#!/bin/bash + +# Copyright 2016 Johns Hopkins University (Author: Daniel Povey, Vijayaditya Peddinti) +# 2019 Vimal Manohar +# Apache 2.0. + +# This script converts lattices available from a first pass decode into a per-frame weights file +# The ctms generated from the lattices are filtered. Silence frames are assigned a low weight (e.g.0.00001) +# and voiced frames have a weight of 1. + +set -e + +stage=1 +cmd=run.pl +silence_weight=0.00001 +#end configuration section. + +. ./cmd.sh + +[ -f ./path.sh ] && . ./path.sh +. utils/parse_options.sh || exit 1; +if [ $# -ne 4 ]; then + echo "Usage: $0 [--cmd (run.pl|queue.pl...)] " + echo " Options:" + echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." + exit 1; +fi + +data_dir=$1 +lang=$2 # Note: may be graph directory not lang directory, but has the necessary stuff copied. +decode_dir=$3 +output_wts_file_gz=$4 + +if [ $stage -le 1 ]; then + echo "$0: generating CTM from input lattices" + steps/get_ctm_conf.sh --cmd "$cmd" \ + --use-segments false \ + $data_dir \ + $lang \ + $decode_dir +fi + +if [ $stage -le 2 ]; then + name=`basename $data_dir` + # we just take the ctm from LMWT 10, it doesn't seem to affect the results a lot + ctm=$decode_dir/score_10/$name.ctm + echo "$0: generating weights file from ctm $ctm" + + pad_frames=0 # this did not seem to be helpful but leaving it as an option. + feat-to-len scp:$data_dir/feats.scp ark,t:- >$decode_dir/utt.lengths + if [ ! 
-f $ctm ]; then echo "$0: expected ctm to exist: $ctm"; exit 1; fi + + cat $ctm | awk '$6 == 1.0 && $4 < 1.0' | \ + grep -v -w mm | grep -v -w mhm | grep -v -F '[noise]' | \ + grep -v -F '[laughter]' | grep -v -F '' | \ + perl -e ' $lengths=shift @ARGV; $pad_frames=shift @ARGV; $silence_weight=shift @ARGV; + $pad_frames >= 0 || die "bad pad-frames value $pad_frames"; + open(L, "<$lengths") || die "opening lengths file"; + @all_utts = (); + $utt2ref = { }; + while () { + ($utt, $len) = split(" ", $_); + push @all_utts, $utt; + $array_ref = [ ]; + for ($n = 0; $n < $len; $n++) { ${$array_ref}[$n] = $silence_weight; } + $utt2ref{$utt} = $array_ref; + } + while () { + @A = split(" ", $_); + @A == 6 || die "bad ctm line $_"; + $utt = $A[0]; $beg = $A[2]; $len = $A[3]; + $beg_int = int($beg * 100) - $pad_frames; + $len_int = int($len * 100) + 2*$pad_frames; + $array_ref = $utt2ref{$utt}; + !defined $array_ref && die "No length info for utterance $utt"; + for ($t = $beg_int; $t < $beg_int + $len_int; $t++) { + if ($t >= 0 && $t < @$array_ref) { + ${$array_ref}[$t] = 1; + } + } + } + foreach $utt (@all_utts) { $array_ref = $utt2ref{$utt}; + print $utt, " [ ", join(" ", @$array_ref), " ]\n"; + } ' $decode_dir/utt.lengths $pad_frames $silence_weight | \ + gzip -c > $output_wts_file_gz +fi diff --git a/egs/chime5/s5b/local/json2text.py b/egs/chime5/s5b/local/json2text.py new file mode 100755 index 00000000000..4df0160efb6 --- /dev/null +++ b/egs/chime5/s5b/local/json2text.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python3 + +# Copyright 2017 Johns Hopkins University (Shinji Watanabe) +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +import json +import argparse +import logging +import sys + + +def hms_to_seconds(hms): + hour = hms.split(':')[0] + minute = hms.split(':')[1] + second = hms.split(':')[2].split('.')[0] + + # .xx (10 ms order) + ms10 = hms.split(':')[2].split('.')[1] + + # total seconds + seconds = int(hour) * 3600 + int(minute) * 60 + int(second) + + return '{:07d}'.format(int(str(seconds) + ms10)) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('json', type=str, help='JSON transcription file') + parser.add_argument('--mictype', type=str, + choices=['ref', 'worn', 'u01', 'u02', 'u03', 'u04', 'u05', 'u06'], + help='Type of microphones') + args = parser.parse_args() + + # logging info + log_format = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s:%(message)s" + logging.basicConfig(level=logging.INFO, format=log_format) + + logging.debug("reading %s", args.json) + with open(args.json, 'rt', encoding="utf-8") as f: + j = json.load(f) + + for x in j: + if '[redacted]' not in x['words']: + session_id = x['session_id'] + speaker_id = x['speaker'] + if args.mictype == 'ref': + mictype = x['ref'] + elif args.mictype == 'worn': + mictype = 'original' + else: + mictype = args.mictype.upper() # convert from u01 to U01 + + # add location tag for scoring (only for dev and eval sets) + if 'location' in x.keys(): + location = x['location'].upper() + else: + location = 'NOLOCATION' + + start_time = x['start_time'][mictype] + end_time = x['end_time'][mictype] + + # remove meta chars and convert to lower + words = x['words'].replace('"', '')\ + .replace('.', '')\ + .replace('?', '')\ + .replace(',', '')\ + .replace(':', '')\ + .replace(';', '')\ + .replace('!', '').lower() + + # remove multiple spaces + words = " ".join(words.split()) + + # convert to seconds, e.g., 1:10:05.55 -> 3600 + 600 + 5.55 = 4205.55 + start_time = hms_to_seconds(start_time) + 
end_time = hms_to_seconds(end_time) + + uttid = speaker_id + '_' + session_id + if not args.mictype == 'worn': + uttid += '_' + mictype + uttid += '_' + location + '-' + start_time + '-' + end_time + + if end_time > start_time: + sys.stdout.buffer.write((uttid + ' ' + words + '\n').encode("utf-8")) diff --git a/egs/chime5/s5b/local/make_noise_list.py b/egs/chime5/s5b/local/make_noise_list.py new file mode 100755 index 00000000000..5aaf7fa4062 --- /dev/null +++ b/egs/chime5/s5b/local/make_noise_list.py @@ -0,0 +1,17 @@ +#!/usr/bin/env python3 + +import glob +import os +import sys + + +if len(sys.argv) != 2: + print ("Usage: {} ".format(sys.argv[0])) + raise SystemExit(1) + + +for line in glob.glob("{}/*.wav".format(sys.argv[1])): + fname = os.path.basename(line.strip()) + + print ("--noise-id {} --noise-type point-source " + "--bg-fg-type foreground {}".format(fname, line.strip())) diff --git a/egs/chime5/s5b/local/nnet3/compare_wer.sh b/egs/chime5/s5b/local/nnet3/compare_wer.sh new file mode 100644 index 00000000000..fa627acd27b --- /dev/null +++ b/egs/chime5/s5b/local/nnet3/compare_wer.sh @@ -0,0 +1,133 @@ +#!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. local/chain/compare_wer.sh exp/chain/tdnn_{c,d}_sp +# For use with discriminatively trained systems you specify the epochs after a colon: +# for instance, +# local/chain/compare_wer.sh exp/chain/tdnn_c_sp exp/chain/tdnn_c_sp_smbr:{1,2,3} + + +if [ $# == 0 ]; then + echo "Usage: $0: [--looped] [--online] [ ... ]" + echo "e.g.: $0 exp/chain/tdnn_{b,c}_sp" + echo "or (with epoch numbers for discriminative training):" + echo "$0 exp/chain/tdnn_b_sp_disc:{1,2,3}" + exit 1 +fi + +echo "# $0 $*" + +include_looped=false +if [ "$1" == "--looped" ]; then + include_looped=true + shift +fi +include_online=false +if [ "$1" == "--online" ]; then + include_online=true + shift +fi + + +used_epochs=false + +# this function set_names is used to separate the epoch-related parts of the name +# [for discriminative training] and the regular parts of the name. 
+# If called with a colon-free directory name, like: +# set_names exp/chain/tdnn_lstm1e_sp_bi_smbr +# it will set dir=exp/chain/tdnn_lstm1e_sp_bi_smbr and epoch_infix="" +# If called with something like: +# set_names exp/chain/tdnn_d_sp_smbr:3 +# it will set dir=exp/chain/tdnn_d_sp_smbr and epoch_infix="_epoch3" + + +set_names() { + if [ $# != 1 ]; then + echo "compare_wer_general.sh: internal error" + exit 1 # exit the program + fi + dirname=$(echo $1 | cut -d: -f1) + epoch=$(echo $1 | cut -s -d: -f2) + if [ -z $epoch ]; then + epoch_infix="" + else + used_epochs=true + epoch_infix=_epoch${epoch} + fi +} + + + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +strings=( + "#WER dev_clean_2 (tgsmall) " + "#WER dev_clean_2 (tglarge) ") + +for n in 0 1; do + echo -n "${strings[$n]}" + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + decode_names=(tgsmall_dev_clean_2 tglarge_dev_clean_2) + + wer=$(cat $dirname/decode_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + if $include_looped; then + echo -n "# [looped:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + wer=$(cat $dirname/decode_looped_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi + if $include_online; then + echo -n "# [online:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + wer=$(cat ${dirname}_online/decode_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi +done + + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. +fi + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.{final,combined}.log 2>/dev/null | grep log-like | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.{final,combined}.log 2>/dev/null | grep log-like | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train acc " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.{final,combined}.log 2>/dev/null | grep accuracy | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid acc " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.{final,combined}.log 2>/dev/null | grep accuracy | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo + diff --git a/egs/chime5/s5b/local/nnet3/decode.sh b/egs/chime5/s5b/local/nnet3/decode.sh new file mode 100755 index 00000000000..8fa54e0d4a6 --- /dev/null +++ b/egs/chime5/s5b/local/nnet3/decode.sh @@ -0,0 +1,164 @@ +#!/bin/bash + +# Copyright 2016 Johns Hopkins University (Author: Daniel Povey, Vijayaditya Peddinti) +# 2019 Vimal Manohar +# Apache 2.0. + +# This script does 2-stage decoding where the first stage is used to get +# reliable frames for i-vector extraction. 
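For reference, the per-frame weights used for the second-pass i-vector extraction are derived from the first-pass CTM as in local/extract_vad_weights.sh above: frames covered by a confident, non-hesitation word get weight 1.0, and every remaining frame keeps a small silence weight. Below is a minimal Python sketch of that mapping, not code used by the recipe (it assumes 10 ms frames and the usual CTM fields utt/channel/start/duration/word/confidence; the helper name ctm_to_weights is hypothetical):

def ctm_to_weights(ctm_lines, utt_lengths, silence_weight=0.00001, pad_frames=0):
    # Start every frame of every utterance at the silence weight.
    weights = {utt: [silence_weight] * length for utt, length in utt_lengths.items()}
    for line in ctm_lines:  # lines assumed already filtered to confident words
        utt, _chan, start, dur = line.split()[:4]
        beg = int(float(start) * 100) - pad_frames           # 10 ms frames
        end = beg + int(float(dur) * 100) + 2 * pad_frames
        frames = weights[utt]
        for t in range(max(beg, 0), min(end, len(frames))):
            frames[t] = 1.0                                   # voiced frame
    return weights

# Example: a 30 ms word starting at 0.10 s in a 100-frame utterance.
w = ctm_to_weights(["utt1 1 0.10 0.03 hello 1.0"], {"utt1": 100})
assert w["utt1"][10:13] == [1.0, 1.0, 1.0]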
+ +set -e + +# general opts +iter= +stage=0 +nj=30 +affix= # affix for decode directory + +# ivector opts +max_count=75 # parameter for extract_ivectors.sh +sub_speaker_frames=6000 +ivector_scale=0.75 +get_weights_from_ctm=true +weights_file= # use weights from this archive (must be compressed using gunzip) +silence_weight=0.00001 # apply this weight to silence frames during i-vector extraction +ivector_dir=exp/nnet3 + +# decode opts +pass2_decode_opts="--min-active 1000" +lattice_beam=8 +extra_left_context=0 # change for (B)LSTM +extra_right_context=0 # change for BLSTM +frames_per_chunk=50 # change for (B)LSTM +acwt=0.1 # important to change this when using chain models +post_decode_acwt=1.0 # important to change this when using chain models +extra_left_context_initial=0 +extra_right_context_final=0 + +graph_affix= + +score_opts="--min-lmwt 6 --max-lmwt 13" + +. ./cmd.sh +[ -f ./path.sh ] && . ./path.sh +. utils/parse_options.sh || exit 1; + +if [ $# -ne 4 ]; then + echo "Usage: $0 [options] " + echo " Options:" + echo " --stage (0|1|2) # start scoring script from part-way through." + echo "e.g.:" + echo "$0 data/dev data/lang exp/tri5a/graph_pp exp/nnet3/tdnn" + exit 1; +fi + +data=$1 # data directory +lang=$2 # data/lang +graph=$3 #exp/tri5a/graph_pp +dir=$4 # exp/nnet3/tdnn + +model_affix=`basename $dir` +ivector_affix=${affix:+_$affix}_chain_${model_affix}${iter:+_iter$iter} +affix=${affix:+_${affix}}${iter:+_iter${iter}} + +if [ $stage -le 1 ]; then + if [ ! -s ${data}_hires/feats.scp ]; then + utils/copy_data_dir.sh $data ${data}_hires + steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj $nj --cmd "$train_cmd" ${data}_hires + steps/compute_cmvn_stats.sh ${data}_hires + utils/fix_data_dir.sh ${data}_hires + fi +fi + +data_set=$(basename $data) +if [ $stage -le 2 ]; then + echo "Extracting i-vectors, stage 1" + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj $nj \ + --max-count $max_count \ + ${data}_hires $ivector_dir/extractor \ + $ivector_dir/ivectors_${data_set}${ivector_affix}_stage1; + # float comparisons are hard in bash + if [ `bc <<< "$ivector_scale != 1"` -eq 1 ]; then + ivector_scale_affix=_scale$ivector_scale + else + ivector_scale_affix= + fi + + if [ ! -z "$ivector_scale_affix" ]; then + echo "$0: Scaling iVectors, stage 1" + srcdir=$ivector_dir/ivectors_${data_set}${ivector_affix}_stage1 + outdir=$ivector_dir/ivectors_${data_set}${ivector_affix}${ivector_scale_affix}_stage1 + mkdir -p $outdir + $train_cmd $outdir/log/scale_ivectors.log \ + copy-matrix --scale=$ivector_scale scp:$srcdir/ivector_online.scp ark:- \| \ + copy-feats --compress=true ark:- ark,scp:$outdir/ivector_online.ark,$outdir/ivector_online.scp; + cp $srcdir/ivector_period $outdir/ivector_period + fi +fi + +decode_dir=$dir/decode${graph_affix}_${data_set}${affix} +# generate the lattices +if [ $stage -le 3 ]; then + echo "Generating lattices, stage 1" + steps/nnet3/decode.sh --nj $nj --cmd "$decode_cmd" \ + --acwt $acwt --post-decode-acwt $post_decode_acwt \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir $ivector_dir/ivectors_${data_set}${ivector_affix}${ivector_scale_affix}_stage1 \ + --skip-scoring true ${iter:+--iter $iter} \ + $graph ${data}_hires ${decode_dir}_stage1; +fi + +if [ $stage -le 4 ]; then + if $get_weights_from_ctm; then + if [ ! 
-z $weights_file ]; then + echo "$0: Using provided vad weights file $weights_file" + ivector_extractor_weights=$weights_file + else + echo "$0 : Generating vad weights file" + ivector_extractor_weights=${decode_dir}_stage1/weights${affix}.gz + local/extract_vad_weights.sh --silence-weight $silence_weight \ + --cmd "$decode_cmd" ${iter:+--iter $iter} \ + ${data}_hires $lang \ + ${decode_dir}_stage1 $ivector_extractor_weights + fi + else + # get weights from best path decoding + ivector_extractor_weights=${decode_dir}_stage1 + fi +fi + +if [ $stage -le 5 ]; then + echo "Extracting i-vectors, stage 2 with weights from $ivector_extractor_weights" + # this does offline decoding, except we estimate the iVectors per + # speaker, excluding silence (based on alignments from a DNN decoding), with a + # different script. This is just to demonstrate that script. + # the --sub-speaker-frames is optional; if provided, it will divide each speaker + # up into "sub-speakers" of at least that many frames... can be useful if + # acoustic conditions drift over time within the speaker's data. + steps/online/nnet2/extract_ivectors.sh --cmd "$train_cmd" --nj $nj \ + --silence-weight $silence_weight \ + --sub-speaker-frames $sub_speaker_frames --max-count $max_count \ + ${data}_hires $lang $ivector_dir/extractor \ + $ivector_extractor_weights $ivector_dir/ivectors_${data_set}${ivector_affix}; +fi + +if [ $stage -le 6 ]; then + echo "Generating lattices, stage 2 with --acwt $acwt" + rm -f ${decode_dir}/.error + steps/nnet3/decode.sh --nj $nj --cmd "$decode_cmd" $pass2_decode_opts \ + --acwt $acwt --post-decode-acwt $post_decode_acwt \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk" \ + --skip-scoring false ${iter:+--iter $iter} --lattice-beam $lattice_beam \ + --online-ivector-dir $ivector_dir/ivectors_${data_set}${ivector_affix} \ + $graph ${data}_hires ${decode_dir} || touch ${decode_dir}/.error + [ -f ${decode_dir}/.error ] && echo "$0: Error decoding" && exit 1; +fi +exit 0 diff --git a/egs/chime5/s5b/local/nnet3/run_ivector_common.sh b/egs/chime5/s5b/local/nnet3/run_ivector_common.sh new file mode 100755 index 00000000000..3910e1812a3 --- /dev/null +++ b/egs/chime5/s5b/local/nnet3/run_ivector_common.sh @@ -0,0 +1,151 @@ +#!/bin/bash + +set -euo pipefail + +# This script is called from local/nnet3/run_tdnn.sh and +# local/chain/run_tdnn.sh (and may eventually be called by more +# scripts). It contains the common feature preparation and +# iVector-related parts of the script. See those scripts for examples +# of usage. + +stage=0 +train_set=train_worn_u100k +test_sets="dev_worn dev_beamformit_ref" +gmm=tri3 +nj=96 + +nnet3_affix=_train_worn_u100k + +. ./cmd.sh +. ./path.sh +. utils/parse_options.sh + +gmm_dir=exp/${gmm} +ali_dir=exp/${gmm}_ali_${train_set}_sp + +for f in ${gmm_dir}/final.mdl; do + if [ ! 
-f $f ]; then + echo "$0: expected file $f to exist" + exit 1 + fi +done + +if [ $stage -le 1 ]; then + # Although the nnet will be trained by high resolution data, we still have to + # perturb the normal data to get the alignment _sp stands for speed-perturbed + echo "$0: preparing directory for low-resolution speed-perturbed data (for alignment)" + utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp + echo "$0: making MFCC features for low-resolution speed-perturbed data" + steps/make_mfcc.sh --cmd "$train_cmd" --nj 20 data/${train_set}_sp || exit 1; + steps/compute_cmvn_stats.sh data/${train_set}_sp || exit 1; + utils/fix_data_dir.sh data/${train_set}_sp +fi + +if [ $stage -le 2 ]; then + echo "$0: aligning with the perturbed low-resolution data" + steps/align_fmllr.sh --nj ${nj} --cmd "$train_cmd" \ + data/${train_set}_sp data/lang $gmm_dir $ali_dir || exit 1 +fi + +if [ $stage -le 3 ]; then + # Create high-resolution MFCC features (with 40 cepstra instead of 13). + # this shows how you can split across multiple file-systems. + echo "$0: creating high-resolution MFCC features" + mfccdir=data/${train_set}_sp_hires/data + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + utils/create_split_dir.pl /export/b1{5,6,8,9}/$USER/kaldi-data/mfcc/chime5-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage + fi + + for datadir in ${train_set}_sp ${test_sets}; do + utils/copy_data_dir.sh data/$datadir data/${datadir}_hires + done + + # do volume-perturbation on the training data prior to extracting hires + # features; this helps make trained nnets more invariant to test data volume. + utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires || exit 1; + + for datadir in ${train_set}_sp ${test_sets}; do + steps/make_mfcc.sh --nj 20 --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${datadir}_hires || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_hires || exit 1; + utils/fix_data_dir.sh data/${datadir}_hires || exit 1; + done +fi + +if [ $stage -le 4 ]; then + echo "$0: computing a subset of data to train the diagonal UBM." + # We'll use about a quarter of the data. + mkdir -p exp/nnet3${nnet3_affix}/diag_ubm + temp_data_root=exp/nnet3${nnet3_affix}/diag_ubm + + num_utts_total=$(wc -l &2 "$0" "$@" +if [ $# -ne 3 ] ; then + echo >&2 "$0" "$@" + echo >&2 "$0: Error: wrong number of arguments" + echo -e >&2 "Usage:\n $0 [opts] " + echo -e >&2 "eg:\n $0 /corpora/chime5/audio/train /corpora/chime5/transcriptions/train data/train" + exit 1 +fi + +set -e -o pipefail + +adir=$1 +jdir=$2 +dir=$3 + +json_count=$(find -L $jdir -name "*.json" | wc -l) +wav_count=$(find -L $adir -name "*.wav" | wc -l) + +if [ "$json_count" -eq 0 ]; then + echo >&2 "We expect that the directory $jdir will contain json files." + echo >&2 "That implies you have supplied a wrong path to the data." + exit 1 +fi +if [ "$wav_count" -eq 0 ]; then + echo >&2 "We expect that the directory $adir will contain wav files." + echo >&2 "That implies you have supplied a wrong path to the data." 
+ exit 1 +fi + +echo "$0: Converting transcription to text" + +mkdir -p $dir +for file in $jdir/*json; do + ./local/json2text.py --mictype $mictype $file +done | \ + sed -e "s/\[inaudible[- 0-9]*\]/[inaudible]/g" |\ + sed -e 's/ - / /g' |\ + sed -e 's/mm-/mm/g' > $dir/text.orig + +echo "$0: Creating datadir $dir for type=\"$mictype\"" + +if [ $mictype == "worn" ]; then + # convert the filenames to wav.scp format, use the basename of the file + # as a the wav.scp key, add .L and .R for left and right channel + # i.e. each file will have two entries (left and right channel) + find -L $adir -name "S[0-9]*_P[0-9]*.wav" | \ + perl -ne '{ + chomp; + $path = $_; + next unless $path; + @F = split "/", $path; + ($f = $F[@F-1]) =~ s/.wav//; + @F = split "_", $f; + print "${F[1]}_${F[0]}.L sox $path -t wav - remix 1 |\n"; + print "${F[1]}_${F[0]}.R sox $path -t wav - remix 2 |\n"; + }' | sort > $dir/wav.scp + + # generate the transcripts for both left and right channel + # from the original transcript in the form + # P09_S03-0006072-0006147 gimme the baker + # create left and right channel transcript + # P09_S03.L-0006072-0006147 gimme the baker + # P09_S03.R-0006072-0006147 gimme the baker + sed -n 's/ *$//; h; s/-/\.L-/p; g; s/-/\.R-/p' $dir/text.orig | sort > $dir/text +elif [ $mictype == "ref" ]; then + # fixed reference array + + # first get a text, which will be used to extract reference arrays + perl -ne 's/-/.ENH-/;print;' $dir/text.orig | sort > $dir/text + + find -L $adir | grep "\.wav" | sort > $dir/wav.flist + # following command provide the argument for grep to extract only reference arrays + grep `cut -f 1 -d"-" $dir/text | awk -F"_" '{print $2 "_" $3}' | sed -e "s/\.ENH//" | sort | uniq | sed -e "s/^/ -e /" | tr "\n" " "` $dir/wav.flist > $dir/wav.flist2 + paste -d" " \ + <(awk -F "/" '{print $NF}' $dir/wav.flist2 | sed -e "s/\.wav/.ENH/") \ + $dir/wav.flist2 | sort > $dir/wav.scp +else + # array mic case + # convert the filenames to wav.scp format, use the basename of the file + # as a the wav.scp key + find -L $adir -name "*.wav" -ipath "*${mictype}*" |\ + perl -ne '$p=$_;chomp $_;@F=split "/";$F[$#F]=~s/\.wav//;print "$F[$#F] $p";' |\ + sort -u > $dir/wav.scp + + # convert the transcripts from + # P09_S03-0006072-0006147 gimme the baker + # to the per-channel transcripts + # P09_S03_U01_NOLOCATION.CH1-0006072-0006147 gimme the baker + # P09_S03_U01_NOLOCATION.CH2-0006072-0006147 gimme the baker + # P09_S03_U01_NOLOCATION.CH3-0006072-0006147 gimme the baker + # P09_S03_U01_NOLOCATION.CH4-0006072-0006147 gimme the baker + perl -ne '$l=$_; + for($i=1; $i<=4; $i++) { + ($x=$l)=~ s/-/.CH\Q$i\E-/; + print $x;}' $dir/text.orig | sort > $dir/text + +fi +$cleanup && rm -f $dir/text.* $dir/wav.scp.* $dir/wav.flist + +# Prepare 'segments', 'utt2spk', 'spk2utt' +if [ $mictype == "worn" ]; then + cut -d" " -f 1 $dir/text | \ + awk -F"-" '{printf("%s %s %08.2f %08.2f\n", $0, $1, $2/100.0, $3/100.0)}' |\ + sed -e "s/_[A-Z]*\././2" \ + > $dir/segments +elif [ $mictype == "ref" ]; then + cut -d" " -f 1 $dir/text | \ + awk -F"-" '{printf("%s %s %08.2f %08.2f\n", $0, $1, $2/100.0, $3/100.0)}' |\ + sed -e "s/_[A-Z]*\././2" |\ + sed -e "s/ P.._/ /" > $dir/segments +else + cut -d" " -f 1 $dir/text | \ + awk -F"-" '{printf("%s %s %08.2f %08.2f\n", $0, $1, $2/100.0, $3/100.0)}' |\ + sed -e "s/_[A-Z]*\././2" |\ + sed -e 's/ P.._/ /' > $dir/segments +fi +cut -f 1 -d ' ' $dir/segments | \ + perl -ne 'chomp;$utt=$_;s/_.*//;print "$utt $_\n";' > $dir/utt2spk + +utils/utt2spk_to_spk2utt.pl $dir/utt2spk > 
$dir/spk2utt + +# Check that data dirs are okay! +utils/validate_data_dir.sh --no-feats $dir || exit 1 diff --git a/egs/chime5/s5b/local/prepare_dict.sh b/egs/chime5/s5b/local/prepare_dict.sh new file mode 100755 index 00000000000..09083d0e795 --- /dev/null +++ b/egs/chime5/s5b/local/prepare_dict.sh @@ -0,0 +1,124 @@ +#!/bin/bash +# Copyright (c) 2018, Johns Hopkins University (Jan "Yenda" Trmal) +# License: Apache 2.0 + +# Begin configuration section. +# End configuration section +. ./utils/parse_options.sh + +. ./path.sh + +set -e -o pipefail +set -o nounset # Treat unset variables as an error + + +# The parts of the output of this that will be needed are +# [in data/local/dict/ ] +# lexicon.txt +# extra_questions.txt +# nonsilence_phones.txt +# optional_silence.txt +# silence_phones.txt + + +# check existing directories +[ $# != 0 ] && echo "Usage: $0" && exit 1; + +dir=data/local/dict + +mkdir -p $dir +echo "$0: Getting CMU dictionary" +if [ ! -f $dir/cmudict.done ]; then + [ -d $dir/cmudict ] && rm -rf $dir/cmudict + svn co https://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict $dir/cmudict + touch $dir/cmudict.done +fi + +# silence phones, one per line. +for w in sil spn inaudible laughs noise; do + echo $w; +done > $dir/silence_phones.txt +echo sil > $dir/optional_silence.txt + +# For this setup we're discarding stress. +cat $dir/cmudict/cmudict-0.7b.symbols | \ + perl -ne 's:[0-9]::g; s:\r::; print lc($_)' | \ + sort -u > $dir/nonsilence_phones.txt + +# An extra question will be added by including the silence phones in one class. +paste -d ' ' -s $dir/silence_phones.txt > $dir/extra_questions.txt + +grep -v ';;;' $dir/cmudict/cmudict-0.7b |\ + uconv -f latin1 -t utf-8 -x Any-Lower |\ + perl -ne 's:(\S+)\(\d+\) :$1 :; s: : :; print;' |\ + perl -ne '@F = split " ",$_,2; $F[1] =~ s/[0-9]//g; print "$F[0] $F[1]";' \ + > $dir/lexicon1_raw_nosil.txt || exit 1; + +# Add prons for laughter, noise, oov +for w in `grep -v sil $dir/silence_phones.txt`; do + echo "[$w] $w" +done | cat - $dir/lexicon1_raw_nosil.txt > $dir/lexicon2_raw.txt || exit 1; + +# we keep all words from the cmudict in the lexicon +# might reduce OOV rate on dev and eval +cat $dir/lexicon2_raw.txt \ + <( echo "mm m" + echo " spn" + echo "cuz k aa z" + echo "cuz k ah z" + echo "cuz k ao z" + echo "mmm m"; \ + echo "hmm hh m"; \ + ) | sort -u | sed 's/[\t ]/\t/' > $dir/iv_lexicon.txt + + +cat data/train*/text | \ + awk '{for (n=2;n<=NF;n++){ count[$n]++; } } END { for(n in count) { print count[n], n; }}' | \ + sort -nr > $dir/word_counts + +cat $dir/word_counts | awk '{print $2}' > $dir/word_list + +awk '{print $1}' $dir/iv_lexicon.txt | \ + perl -e '($word_counts)=@ARGV; + open(W, "<$word_counts")||die "opening word-counts $word_counts"; + while() { chop; $seen{$_}=1; } + while() { + ($c,$w) = split; + if (!defined $seen{$w}) { print; } + } ' $dir/word_counts > $dir/oov_counts.txt + +echo "*Highest-count OOVs (including fragments) are:" +head -n 10 $dir/oov_counts.txt +echo "*Highest-count OOVs (excluding fragments) are:" +grep -v -E '^-|-$' $dir/oov_counts.txt | head -n 10 || true + +echo "*Training a G2P and generating missing pronunciations" +mkdir -p $dir/g2p/ +phonetisaurus-align --input=$dir/iv_lexicon.txt --ofile=$dir/g2p/aligned_lexicon.corpus +ngram-count -order 4 -kn-modify-counts-at-end -ukndiscount\ + -gt1min 0 -gt2min 0 -gt3min 0 -gt4min 0 \ + -text $dir/g2p/aligned_lexicon.corpus -lm $dir/g2p/aligned_lexicon.arpa +phonetisaurus-arpa2wfst --lm=$dir/g2p/aligned_lexicon.arpa --ofile=$dir/g2p/g2p.fst +awk '{print 
$2}' $dir/oov_counts.txt > $dir/oov_words.txt +phonetisaurus-apply --nbest 2 --model $dir/g2p/g2p.fst --thresh 5 --accumulate \ + --word_list $dir/oov_words.txt > $dir/oov_lexicon.txt + +## The next section is again just for debug purposes +## to show words for which the G2P failed +cat $dir/oov_lexicon.txt $dir/iv_lexicon.txt | sort -u > $dir/lexicon.txt +rm -f $dir/lexiconp.txt 2>/dev/null; # can confuse later script if this exists. +awk '{print $1}' $dir/lexicon.txt | \ + perl -e '($word_counts)=@ARGV; + open(W, "<$word_counts")||die "opening word-counts $word_counts"; + while() { chop; $seen{$_}=1; } + while() { + ($c,$w) = split; + if (!defined $seen{$w}) { print; } + } ' $dir/word_counts > $dir/oov_counts.g2p.txt + +echo "*Highest-count OOVs (including fragments) after G2P are:" +head -n 10 $dir/oov_counts.g2p.txt + +utils/validate_dict_dir.pl $dir +exit 0; + diff --git a/egs/chime5/s5b/local/reverberate_lat_dir.sh b/egs/chime5/s5b/local/reverberate_lat_dir.sh new file mode 100755 index 00000000000..f601a37c0e1 --- /dev/null +++ b/egs/chime5/s5b/local/reverberate_lat_dir.sh @@ -0,0 +1,93 @@ +#!/bin/bash + +# Copyright 2018 Vimal Manohar +# Apache 2.0 + +num_data_reps=1 +cmd=run.pl +nj=20 +include_clean=false + +. utils/parse_options.sh +. ./path.sh + +if [ $# -ne 4 ]; then + echo "Usage: $0 " + exit 1 +fi + +train_data_dir=$1 +noisy_latdir=$2 +clean_latdir=$3 +dir=$4 + +clean_nj=$(cat $clean_latdir/num_jobs) + +$cmd JOB=1:$clean_nj $dir/copy_clean_lattices.JOB.log \ + lattice-copy "ark:gunzip -c $clean_latdir/lat.JOB.gz |" \ + ark,scp:$dir/lats_clean.JOB.ark,$dir/lats_clean.JOB.scp || exit 1 + +for n in $(seq $clean_nj); do + cat $dir/lats_clean.$n.scp +done > $dir/lats_clean.scp + +for i in $(seq $num_data_reps); do + cat $dir/lats_clean.scp | awk -vi=$i '{print "rev"i"_"$0}' +done > $dir/lats_rvb.scp + +noisy_nj=$(cat $noisy_latdir/num_jobs) +$cmd JOB=1:$noisy_nj $dir/copy_noisy_lattices.JOB>log \ + lattice-copy "ark:gunzip -c $noisy_latdir/lat.JOB.gz |" \ + ark,scp:$dir/lats_noisy.JOB.ark,$dir/lats_noisy.JOB.scp || exit 1 + +optional_clean= +if $include_clean; then + optional_clean=$dir/lats_clean.scp +fi + +for n in $(seq $noisy_nj); do + cat $dir/lats_noisy.$n.scp +done | cat - $dir/lats_rvb.scp ${optional_clean} | sort -k1,1 > $dir/lats.scp + +utils/split_data.sh $train_data_dir $nj +$cmd JOB=1:$nj $dir/copy_lattices.JOB.log \ + lattice-copy "scp:utils/filter_scp.pl $train_data_dir/split$nj/JOB/utt2spk $dir/lats.scp |" \ + "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1 + +echo $nj > $dir/num_jobs + +if [ -f $clean_latdir/ali.1.gz ]; then + $cmd JOB=1:$clean_nj $dir/copy_clean_alignments.JOB.log \ + copy-int-vector "ark:gunzip -c $clean_latdir/ali.JOB.gz |" \ + ark,scp:$dir/ali_clean.JOB.ark,$dir/ali_clean.JOB.scp + + for n in $(seq $clean_nj); do + cat $dir/ali_clean.$n.scp + done > $dir/ali_clean.scp + + for i in $(seq $num_data_reps); do + cat $dir/ali_clean.scp | awk -vi=$i '{print "rev"i"_"$0}' + done > $dir/ali_rvb.scp + + optional_clean= + if $include_clean; then + optional_clean=$dir/ali_clean.scp + fi + + $cmd JOB=1:$noisy_nj $dir/copy_noisy_alignments.JOB.log \ + copy-int-vector "ark:gunzip -c $noisy_latdir/ali.JOB.gz |" \ + ark,scp:$dir/ali_noisy.JOB.ark,$dir/ali_noisy.JOB.scp + + for n in $(seq $noisy_nj); do + cat $dir/ali_noisy.$n.scp + done | cat - $dir/ali_rvb.scp $optional_clean | sort -k1,1 > $dir/ali.scp + + utils/split_data.sh $train_data_dir $nj || exit 1 + $cmd JOB=1:$nj $dir/copy_rvb_alignments.JOB.log \ + copy-int-vector "scp:utils/filter_scp.pl 
$train_data_dir/split$nj/JOB/utt2spk $dir/ali.scp |" \ + "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1 +fi + +cp $clean_latdir/{final.*,tree,*.mat,*opts,*.txt} $dir || true + +rm $dir/lats_{clean,noisy}.*.{ark,scp} $dir/ali_{clean,noisy}.*.{ark,scp} || true # save space diff --git a/egs/chime5/s5b/local/run_beamformit.sh b/egs/chime5/s5b/local/run_beamformit.sh new file mode 100755 index 00000000000..aa3badd90d8 --- /dev/null +++ b/egs/chime5/s5b/local/run_beamformit.sh @@ -0,0 +1,87 @@ +#!/bin/bash + +# Copyright 2015, Mitsubishi Electric Research Laboratories, MERL (Author: Shinji Watanabe) + +. ./cmd.sh +. ./path.sh + +# Config: +cmd=run.pl +bmf="1 2 3 4" + +. utils/parse_options.sh || exit 1; + +if [ $# != 3 ]; then + echo "Wrong #arguments ($#, expected 3)" + echo "Usage: local/run_beamformit.sh [options] " + echo "main options (for others, see top of script file)" + echo " --cmd # Command to run in parallel with" + echo " --bmf \"1 2 3 4\" # microphones used for beamforming" + exit 1; +fi + +sdir=$1 +odir=$2 +array=$3 +expdir=exp/enhan/`echo $odir | awk -F '/' '{print $NF}'`_`echo $bmf | tr ' ' '_'` + +if ! command -v BeamformIt &>/dev/null ; then + echo "Missing BeamformIt, run 'cd $KALDI_ROOT/tools/; ./extras/install_beamformit.sh; cd -;'" && exit 1 +fi + +# Set bash to 'debug' mode, it will exit on : +# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', +set -e +set -u +set -o pipefail + +mkdir -p $odir +mkdir -p $expdir/log + +echo "Will use the following channels: $bmf" +# number of channels +numch=`echo $bmf | tr ' ' '\n' | wc -l` +echo "the number of channels: $numch" + +# wavfiles.list can be used as the name of the output files +output_wavfiles=$expdir/wavfiles.list +find -L ${sdir} | grep -i ${array} | awk -F "/" '{print $NF}' | sed -e "s/\.CH.\.wav//" | sort | uniq > $expdir/wavfiles.list + +# this is an input file list of the microphones +# format: 1st_wav 2nd_wav ... nth_wav +input_arrays=$expdir/channels_$numch +for x in `cat $output_wavfiles`; do + echo -n "$x" + for ch in $bmf; do + echo -n " $x.CH$ch.wav" + done + echo "" +done > $input_arrays + +# split the list for parallel processing +# number of jobs are set by the number of WAV files +nj=`wc -l $expdir/wavfiles.list | awk '{print $1}'` +split_wavfiles="" +for n in `seq $nj`; do + split_wavfiles="$split_wavfiles $output_wavfiles.$n" +done +utils/split_scp.pl $output_wavfiles $split_wavfiles || exit 1; + +echo -e "Beamforming\n" +# making a shell script for each job +for n in `seq $nj`; do +cat << EOF > $expdir/log/beamform.$n.sh +while read line; do + $BEAMFORMIT/BeamformIt -s \$line -c $input_arrays \ + --config_file `pwd`/conf/beamformit.cfg \ + --source_dir $sdir \ + --result_dir $odir +done < $output_wavfiles.$n +EOF +done + +chmod a+x $expdir/log/beamform.*.sh +$cmd JOB=1:$nj $expdir/log/beamform.JOB.log \ + $expdir/log/beamform.JOB.sh + +echo "`basename $0` Done." diff --git a/egs/chime5/s5b/local/run_recog.sh b/egs/chime5/s5b/local/run_recog.sh new file mode 100755 index 00000000000..989a5f95d01 --- /dev/null +++ b/egs/chime5/s5b/local/run_recog.sh @@ -0,0 +1,156 @@ +#!/bin/bash +# +# Based mostly on the TED-LIUM and Switchboard recipe +# +# Copyright 2017 Johns Hopkins University (Author: Shinji Watanabe and Yenda Trmal) +# Apache 2.0 +# +# This is a subset of run.sh to only perform recognition experiments with evaluation data + +# Begin configuration section. 
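For clarity, the per-array channel list that local/run_beamformit.sh above writes to $expdir/channels_$numch has one line per recording: the base name followed by the selected channel wav files. A minimal Python sketch of that layout (illustrative only; the function name and example recording id are hypothetical, not part of the recipe):

def beamformit_channel_lines(base_names, channels=(1, 2, 3, 4)):
    # One line per recording: "<base> <base>.CH1.wav <base>.CH2.wav ..."
    return [" ".join([base] + ["{}.CH{}.wav".format(base, ch) for ch in channels])
            for base in base_names]

print(beamformit_channel_lines(["S02_U02"])[0])
# S02_U02 S02_U02.CH1.wav S02_U02.CH2.wav S02_U02.CH3.wav S02_U02.CH4.wav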
+decode_nj=20 +stage=0 +enhancement=beamformit # for a new enhancement method, + # change this variable and stage 4 +# End configuration section +. ./utils/parse_options.sh + +. ./cmd.sh +. ./path.sh + + +set -e # exit on error + +# chime5 main directory path +# please change the path accordingly +chime5_corpus=/export/corpora4/CHiME5 +json_dir=${chime5_corpus}/transcriptions +audio_dir=${chime5_corpus}/audio + +# training and test data +train_set=train_worn_simu_u400k +test_sets="eval_${enhancement}_dereverb_ref" + +# This script also needs the phonetisaurus g2p, srilm, beamformit +./local/check_tools.sh || exit 1 + +if [ $stage -le 4 ]; then + # Beamforming using reference arrays + # enhanced WAV directory + enhandir=enhan + dereverb_dir=${PWD}/wav/wpe/ + for dset in eval; do + for mictype in u01 u02 u03 u04 u05 u06; do + local/run_wpe.sh --nj 4 --cmd "$train_cmd --mem 120G" \ + ${audio_dir}/${dset} \ + ${dereverb_dir}/${dset} \ + ${mictype} + done + done + for dset in dev eval; do + for mictype in u01 u02 u03 u04 u05 u06; do + local/run_beamformit.sh --cmd "$train_cmd" \ + ${dereverb_dir}/${dset} \ + ${enhandir}/${dset}_${enhancement}_${mictype} \ + ${mictype} + done + done + + for dset in eval; do + local/prepare_data.sh --mictype ref "$PWD/${enhandir}/${dset}_${enhancement}_u0*" \ + ${json_dir}/${dset} data/${dset}_${enhancement}_dereverb_ref + done +fi + +if [ $stage -le 6 ]; then + # fix speaker ID issue (thanks to Dr. Naoyuki Kanda) + # add array ID to the speaker ID to avoid the use of other array information to meet regulations + # Before this fix + # $ head -n 2 data/eval_beamformit_ref_nosplit/utt2spk + # P01_S01_U02_KITCHEN.ENH-0000192-0001278 P01 + # P01_S01_U02_KITCHEN.ENH-0001421-0001481 P01 + # After this fix + # $ head -n 2 data/eval_beamformit_ref_nosplit_fix/utt2spk + # P01_S01_U02_KITCHEN.ENH-0000192-0001278 P01_U02 + # P01_S01_U02_KITCHEN.ENH-0001421-0001481 P01_U02 + for dset in ${test_sets}; do + utils/copy_data_dir.sh data/${dset} data/${dset}_nosplit + mkdir -p data/${dset}_nosplit_fix + cp data/${dset}_nosplit/{segments,text,wav.scp} data/${dset}_nosplit_fix/ + awk -F "_" '{print $0 "_" $3}' data/${dset}_nosplit/utt2spk > data/${dset}_nosplit_fix/utt2spk + utils/utt2spk_to_spk2utt.pl data/${dset}_nosplit_fix/utt2spk > data/${dset}_nosplit_fix/spk2utt + done + + # Split speakers up into 3-minute chunks. This doesn't hurt adaptation, and + # lets us use more jobs for decoding etc. + for dset in ${test_sets}; do + utils/data/modify_speaker_info.sh --seconds-per-spk-max 180 data/${dset}_nosplit_fix data/${dset} + done +fi + +if [ $stage -le 7 ]; then + # Now make MFCC features. + # mfccdir should be some place with a largish disk where you + # want to store MFCC features. + mfccdir=mfcc + for x in ${test_sets}; do + steps/make_mfcc.sh --nj 20 --cmd "$train_cmd" \ + data/$x exp/make_mfcc/$x $mfccdir + steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir + utils/fix_data_dir.sh data/$x + done +fi + +nnet3_affix=_${train_set}_cleaned_rvb + +lm_suffix= + +if [ $stage -le 18 ]; then + # First the options that are passed through to run_ivector_common.sh + # (some of which are also used in this script directly). + + # The rest are configs specific to this script. Most of the parameters + # are just hardcoded at this level, in the commands below. 
+ affix=1a # affix for the TDNN directory name + tree_affix= + tree_dir=exp/chain${nnet3_affix}/tree_sp${tree_affix:+_$tree_affix} + dir=exp/chain${nnet3_affix}/tdnn${affix}_sp + + # training options + # training chunk-options + chunk_width=140,100,160 + # we don't need extra left/right context for TDNN systems. + chunk_left_context=0 + chunk_right_context=0 + + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang${lm_suffix}/ \ + $tree_dir $tree_dir/graph${lm_suffix} || exit 1; + + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + local/nnet3/decode.sh --affix 2stage --pass2-decode-opts "--min-active 1000" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk 150 --nj $decode_nj \ + --ivector-dir exp/nnet3${nnet3_affix} \ + --graph-affix ${lm_suffix} \ + data/${data} data/lang${lm_suffix} \ + $tree_dir/graph${lm_suffix} \ + exp/chain${nnet3_affix}/tdnn1b_sp + ) || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 +fi + +if [ $stage -le 20 ]; then + # final scoring to get the official challenge result + # please specify both dev and eval set directories so that the search parameters + # (insertion penalty and language model weight) will be tuned using the dev set + local/score_for_submit.sh \ + --dev exp/chain${nnet3_affix}/tdnn1b_sp/decode${lm_suffix}_dev_${enhancement}_dereverb_ref_2stage \ + --eval exp/chain${nnet3_affix}/tdnn1b_sp/decode${lm_suffix}_eval_${enhancement}_dereverb_ref_2stage +fi diff --git a/egs/chime5/s5b/local/run_wpe.py b/egs/chime5/s5b/local/run_wpe.py new file mode 100755 index 00000000000..2f3818f9c42 --- /dev/null +++ b/egs/chime5/s5b/local/run_wpe.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python +# Copyright 2018 Johns Hopkins University (Author: Aswin Shanmugam Subramanian) +# Apache 2.0 +# Works with both python2 and python3 +# This script assumes that WPE (nara_wpe) is installed locally using miniconda. +# ../../../tools/extras/install_miniconda.sh and ../../../tools/extras/install_wpe.sh +# needs to be run and this script needs to be launched run with that version of +# python. +# See local/run_wpe.sh for example. 
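The script below splits the --files argument list in half: the first half are input wav paths and the second half the matching output paths. In local/run_wpe.sh each invocation passes a single input/output pair, so dereverberation is applied channel by channel. A minimal sketch of that convention (example paths are illustrative only):

def split_files(files):
    # First half of --files are inputs, second half the corresponding outputs.
    half = len(files) // 2
    return files[:half], files[half:]

ins, outs = split_files(["audio/dev/S02_U02.CH1.wav", "wav/wpe/dev/S02_U02.CH1.wav"])
assert ins == ["audio/dev/S02_U02.CH1.wav"]
assert outs == ["wav/wpe/dev/S02_U02.CH1.wav"]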
+ +import numpy as np +import soundfile as sf +import time +import os, errno +from tqdm import tqdm +import argparse + +from nara_wpe.wpe import wpe +from nara_wpe.utils import stft, istft +from nara_wpe import project_root + +parser = argparse.ArgumentParser() +parser.add_argument('--files', '-f', nargs='+') +args = parser.parse_args() + +input_files = args.files[:len(args.files)//2] +output_files = args.files[len(args.files)//2:] +out_dir = os.path.dirname(output_files[0]) +try: + os.makedirs(out_dir) +except OSError as e: + if e.errno != errno.EEXIST: + raise + +stft_options = dict( + size=512, + shift=128, + window_length=None, + fading=True, + pad=True, + symmetric_window=False +) + +sampling_rate = 16000 +delay = 3 +iterations = 5 +taps = 10 + +signal_list = [ + sf.read(f)[0] + for f in input_files +] +y = np.stack(signal_list, axis=0) +Y = stft(y, **stft_options).transpose(2, 0, 1) +Z = wpe(Y, iterations=iterations, statistics_mode='full').transpose(1, 2, 0) +z = istft(Z, size=stft_options['size'], shift=stft_options['shift']) + +for d in range(len(signal_list)): + sf.write(output_files[d], z[d,:], sampling_rate) diff --git a/egs/chime5/s5b/local/run_wpe.sh b/egs/chime5/s5b/local/run_wpe.sh new file mode 100755 index 00000000000..ed512e69aae --- /dev/null +++ b/egs/chime5/s5b/local/run_wpe.sh @@ -0,0 +1,86 @@ +#!/bin/bash +# Copyright 2018 Johns Hopkins University (Author: Aswin Shanmugam Subramanian) +# Apache 2.0 + +. ./cmd.sh +. ./path.sh + +# Config: +nj=4 +cmd=run.pl + +. utils/parse_options.sh || exit 1; + +if [ $# != 3 ]; then + echo "Wrong #arguments ($#, expected 3)" + echo "Usage: local/run_wpe.sh [options] " + echo "main options (for others, see top of script file)" + echo " --cmd # Command to run in parallel with" + echo " --nj 50 # number of jobs for parallel processing" + exit 1; +fi + +sdir=$1 +odir=$2 +array=$3 +task=`basename $sdir` +expdir=exp/wpe/${task}_${array} +# Set bash to 'debug' mode, it will exit on : +# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', +set -e +set -u +set -o pipefail + +miniconda_dir=$HOME/miniconda3/ +if [ ! -d $miniconda_dir ]; then + echo "$miniconda_dir does not exist. Please run '$KALDI_ROOT/tools/extras/install_miniconda.sh'." + exit 1 +fi + +# check if WPE is installed +result=`$miniconda_dir/bin/python -c "\ +try: + import nara_wpe + print('1') +except ImportError: + print('0')"` + +if [ "$result" == "1" ]; then + echo "WPE is installed" +else + echo "WPE is not installed. 
Please run ../../../tools/extras/install_wpe.sh" + exit 1 +fi + +mkdir -p $odir +mkdir -p $expdir/log + +# wavfiles.list can be used as the name of the output files +output_wavfiles=$expdir/wavfiles.list +find -L ${sdir} | grep -i ${array} > $expdir/channels_input +cat $expdir/channels_input | awk -F '/' '{print $NF}' | sed "s@S@$odir\/S@g" > $expdir/channels_output +paste -d" " $expdir/channels_input $expdir/channels_output > $output_wavfiles + +# split the list for parallel processing +split_wavfiles="" +for n in `seq $nj`; do + split_wavfiles="$split_wavfiles $output_wavfiles.$n" +done +utils/split_scp.pl $output_wavfiles $split_wavfiles || exit 1; + +echo -e "Dereverberation - $task - $array\n" +# making a shell script for each job +for n in `seq $nj`; do +cat <<-EOF > $expdir/log/wpe.$n.sh +while read line; do + $miniconda_dir/bin/python local/run_wpe.py \ + --file \$line +done < $output_wavfiles.$n +EOF +done + +chmod a+x $expdir/log/wpe.*.sh +$cmd JOB=1:$nj $expdir/log/wpe.JOB.log \ + $expdir/log/wpe.JOB.sh + +echo "`basename $0` Done." diff --git a/egs/chime5/s5b/local/score.sh b/egs/chime5/s5b/local/score.sh new file mode 120000 index 00000000000..6a200b42ed3 --- /dev/null +++ b/egs/chime5/s5b/local/score.sh @@ -0,0 +1 @@ +../steps/scoring/score_kaldi_wer.sh \ No newline at end of file diff --git a/egs/chime5/s5b/local/score_for_submit.sh b/egs/chime5/s5b/local/score_for_submit.sh new file mode 100755 index 00000000000..23121d68b93 --- /dev/null +++ b/egs/chime5/s5b/local/score_for_submit.sh @@ -0,0 +1,119 @@ +#!/bin/bash +# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey, Yenda Trmal) +# Apache 2.0 +# +# This script provides official CHiME-5 challenge submission scores per room and session. +# It first calculates the best search parameter configurations by using the dev set +# and also create the transcriptions for dev and eval sets to be submitted. +# The default setup does not calculate scores of the evaluation set since +# the evaluation transcription is not distributed (July 9 2018) + +cmd=run.pl +dev=exp/chain_train_worn_u100k_cleaned/tdnn1a_sp/decode_dev_beamformit_ref +eval=exp/chain_train_worn_u100k_cleaned/tdnn1a_sp/decode_eval_beamformit_ref +do_eval=false + +echo "$0 $@" # Print the command line for logging +[ -f ./path.sh ] && . ./path.sh +. parse_options.sh || exit 1; + +if [ $# -ne 0 ]; then + echo "Usage: $0 [--cmd (run.pl|queue.pl...)]" + echo "This script provides official CHiME-5 challenge submission scores" + echo " Options:" + echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." 
+ echo " --dev # dev set decoding directory" + echo " --eval # eval set decoding directory" + exit 1; +fi + +# get language model weight and word insertion penalty from the dev set +best_lmwt=`cat $dev/scoring_kaldi/wer_details/lmwt` +best_wip=`cat $dev/scoring_kaldi/wer_details/wip` + +echo "best LM weight: $best_lmwt" +echo "insertion penalty weight: $best_wip" + +echo "==== development set ====" +# development set +# get the scoring result per utterance +score_result=$dev/scoring_kaldi/wer_details/per_utt +for session in S02 S09; do + for room in DINING KITCHEN LIVING; do + # get nerror + nerr=`grep "\#csid" $score_result | grep $room | grep $session | awk '{sum+=$4+$5+$6} END {print sum}'` + # get nwords from references (NF-2 means to exclude utterance id and " ref ") + nwrd=`grep "\#csid" $score_result | grep $room | grep $session | awk '{sum+=$3+$4+$6} END {print sum}'` + # compute wer with scale=2 + wer=`echo "scale=2; 100 * $nerr / $nwrd" | bc` + + # report the results + echo -n "session $session " + echo -n "room $room: " + echo -n "#words $nwrd, " + echo -n "#errors $nerr, " + echo "wer $wer %" + done +done +echo -n "overall: " +# get nerror +nerr=`grep "\#csid" $score_result | awk '{sum+=$4+$5+$6} END {print sum}'` +# get nwords from references (NF-2 means to exclude utterance id and " ref ") +nwrd=`grep "\#csid" $score_result | awk '{sum+=$3+$4+$6} END {print sum}'` +# compute wer with scale=2 +wer=`echo "scale=2; 100 * $nerr / $nwrd" | bc` +echo -n "#words $nwrd, " +echo -n "#errors $nerr, " +echo "wer $wer %" + +echo "==== evaluation set ====" +# evaluation set +# get the scoring result per utterance. Copied from local/score.sh +mkdir -p $eval/scoring_kaldi/wer_details_devbest +$cmd $eval/scoring_kaldi/log/stats1.log \ + cat $eval/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \| \ + align-text --special-symbol="'***'" ark:$eval/scoring_kaldi/test_filt.txt ark:- ark,t:- \| \ + utils/scoring/wer_per_utt_details.pl --special-symbol "'***'" \> $eval/scoring_kaldi/wer_details_devbest/per_utt +score_result=$eval/scoring_kaldi/wer_details_devbest/per_utt +for session in S01 S21; do + for room in DINING KITCHEN LIVING; do + if $do_eval; then + # get nerror + nerr=`grep "\#csid" $score_result | grep $room | grep $session | awk '{sum+=$4+$5+$6} END {print sum}'` + # get nwords from references (NF-2 means to exclude utterance id and " ref ") + nwrd=`grep "\#csid" $score_result | grep $room | grep $session | awk '{sum+=$3+$4+$6} END {print sum}'` + # compute wer with scale=2 + wer=`echo "scale=2; 100 * $nerr / $nwrd" | bc` + + # report the results + echo -n "session $session " + echo -n "room $room: " + echo -n "#words $nwrd, " + echo -n "#errors $nerr, " + echo "wer $wer %" + fi + done +done +if $do_eval; then + # get nerror + nerr=`grep "\#csid" $score_result | awk '{sum+=$4+$5+$6} END {print sum}'` + # get nwords from references (NF-2 means to exclude utterance id and " ref ") + nwrd=`grep "\#csid" $score_result | awk '{sum+=$3+$4+$6} END {print sum}'` + # compute wer with scale=2 + wer=`echo "scale=2; 100 * $nerr / $nwrd" | bc` + echo -n "overall: " + echo -n "#words $nwrd, " + echo -n "#errors $nerr, " + echo "wer $wer %" +else + echo "skip evaluation scoring" + echo "" + echo "==== when you submit your result to the CHiME-5 challenge ====" + echo "Please rename your recognition results of " + echo "$dev/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt" + echo "$eval/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt" + echo "with {dev,eval}__.txt, e.g., dev_watanabe_jhu.txt and 
eval_watanabe_jhu.txt, " + echo "and submit both of them as your final challenge result" + echo "==================================================================" +fi + diff --git a/egs/chime5/s5b/local/train_lms_srilm.sh b/egs/chime5/s5b/local/train_lms_srilm.sh new file mode 100755 index 00000000000..5a1d56d24b3 --- /dev/null +++ b/egs/chime5/s5b/local/train_lms_srilm.sh @@ -0,0 +1,261 @@ +#!/bin/bash +# Copyright (c) 2017 Johns Hopkins University (Author: Yenda Trmal, Shinji Watanabe) +# Apache 2.0 + +export LC_ALL=C + +# Begin configuration section. +words_file= +train_text= +dev_text= +oov_symbol="" +# End configuration section + +echo "$0 $@" + +[ -f path.sh ] && . ./path.sh +. ./utils/parse_options.sh || exit 1 + +echo "-------------------------------------" +echo "Building an SRILM language model " +echo "-------------------------------------" + +if [ $# -ne 2 ] ; then + echo "Incorrect number of parameters. " + echo "Script has to be called like this:" + echo " $0 [switches] " + echo "For example: " + echo " $0 data data/srilm" + echo "The allowed switches are: " + echo " words_file= word list file -- data/lang/words.txt by default" + echo " train_text= data/train/text is used in case when not specified" + echo " dev_text= last 10 % of the train text is used by default" + echo " oov_symbol=> symbol to use for oov modeling -- by default" + exit 1 +fi + +datadir=$1 +tgtdir=$2 + +##End of configuration +loc=`which ngram-count`; +if [ -z $loc ]; then + echo >&2 "You appear to not have SRILM tools installed, either on your path," + echo >&2 "Use the script \$KALDI_ROOT/tools/install_srilm.sh to install it." + exit 1 +fi + +# Prepare the destination directory +mkdir -p $tgtdir + +for f in $words_file $train_text $dev_text; do + [ ! -s $f ] && echo "No such file $f" && exit 1; +done + +[ -z $words_file ] && words_file=$datadir/lang/words.txt +if [ ! -z "$train_text" ] && [ -z "$dev_text" ] ; then + nr=`cat $train_text | wc -l` + nr_dev=$(($nr / 10 )) + nr_train=$(( $nr - $nr_dev )) + orig_train_text=$train_text + head -n $nr_train $train_text > $tgtdir/train_text + tail -n $nr_dev $train_text > $tgtdir/dev_text + + train_text=$tgtdir/train_text + dev_text=$tgtdir/dev_text + echo "Using words file: $words_file" + echo "Using train text: 9/10 of $orig_train_text" + echo "Using dev text : 1/10 of $orig_train_text" +elif [ ! -z "$train_text" ] && [ ! -z "$dev_text" ] ; then + echo "Using words file: $words_file" + echo "Using train text: $train_text" + echo "Using dev text : $dev_text" + train_text=$train_text + dev_text=$dev_text +else + train_text=$datadir/train/text + dev_text=$datadir/dev2h/text + echo "Using words file: $words_file" + echo "Using train text: $train_text" + echo "Using dev text : $dev_text" + +fi + +[ ! -f $words_file ] && echo >&2 "File $words_file must exist!" && exit 1 +[ ! -f $train_text ] && echo >&2 "File $train_text must exist!" && exit 1 +[ ! -f $dev_text ] && echo >&2 "File $dev_text must exist!" 
&& exit 1 + + +# Extract the word list from the training dictionary; exclude special symbols +sort $words_file | awk '{print $1}' | grep -v '\#0' | grep -v '' | grep -v -F "$oov_symbol" > $tgtdir/vocab +if (($?)); then + echo "Failed to create vocab from $words_file" + exit 1 +else + # wc vocab # doesn't work due to some encoding issues + echo vocab contains `cat $tgtdir/vocab | perl -ne 'BEGIN{$l=$w=0;}{split; $w+=$#_; $w++; $l++;}END{print "$l lines, $w words\n";}'` +fi + +# Kaldi transcript files contain Utterance_ID as the first word; remove it +# We also have to avoid skewing the LM by incorporating the same sentences +# from different channels +sed -e "s/\.CH.//" -e "s/_.\-./_/" -e "s/NOLOCATION\(\.[LR]\)*-//" -e "s/U[0-9][0-9]_//" $train_text | sort -u | \ + perl -ane 'print join(" ", @F[1..$#F]) . "\n" if @F > 1' > $tgtdir/train.txt +if (($?)); then + echo "Failed to create $tgtdir/train.txt from $train_text" + exit 1 +else + echo "Removed first word (uid) from every line of $train_text" + # wc text.train train.txt # doesn't work due to some encoding issues + echo $train_text contains `cat $train_text | perl -ane 'BEGIN{$w=$s=0;}{$w+=@F; $w--; $s++;}END{print "$w words, $s sentences\n";}'` + echo train.txt contains `cat $tgtdir/train.txt | perl -ane 'BEGIN{$w=$s=0;}{$w+=@F; $s++;}END{print "$w words, $s sentences\n";}'` +fi + +# Kaldi transcript files contain Utterance_ID as the first word; remove it +sed -e "s/\.CH.//" -e "s/_.\-./_/" $dev_text | sort -u | \ + perl -ane 'print join(" ", @F[1..$#F]) . "\n" if @F > 1' > $tgtdir/dev.txt +if (($?)); then + echo "Failed to create $tgtdir/dev.txt from $dev_text" + exit 1 +else + echo "Removed first word (uid) from every line of $dev_text" + # wc text.train train.txt # doesn't work due to some encoding issues + echo $dev_text contains `cat $dev_text | perl -ane 'BEGIN{$w=$s=0;}{$w+=@F; $w--; $s++;}END{print "$w words, $s sentences\n";}'` + echo $tgtdir/dev.txt contains `cat $tgtdir/dev.txt | perl -ane 'BEGIN{$w=$s=0;}{$w+=@F; $s++;}END{print "$w words, $s sentences\n";}'` +fi + + +echo "-------------------" +echo "Good-Turing 3grams" +echo "-------------------" +ngram-count -lm $tgtdir/3gram.gt011.gz -gt1min 0 -gt2min 1 -gt3min 1 -order 3 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.gt012.gz -gt1min 0 -gt2min 1 -gt3min 2 -order 3 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.gt022.gz -gt1min 0 -gt2min 2 -gt3min 2 -order 3 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.gt023.gz -gt1min 0 -gt2min 2 -gt3min 3 -order 3 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" + +echo "-------------------" +echo "Kneser-Ney 3grams" +echo "-------------------" +ngram-count -lm $tgtdir/3gram.kn011.gz -kndiscount1 -gt1min 0 \ + -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -order 3 -interpolate \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.kn012.gz -kndiscount1 -gt1min 0 \ + -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 2 -order 3 -interpolate \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.kn022.gz -kndiscount1 -gt1min 0 \ + -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 2 -order 3 -interpolate \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm 
$tgtdir/3gram.kn023.gz -kndiscount1 -gt1min 0 \ + -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 3 -order 3 -interpolate \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.kn111.gz -kndiscount1 -gt1min 1 \ + -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -order 3 -interpolate \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.kn112.gz -kndiscount1 -gt1min 1 \ + -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 2 -order 3 -interpolate \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.kn122.gz -kndiscount1 -gt1min 1 \ + -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 2 -order 3 -interpolate \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.kn123.gz -kndiscount1 -gt1min 1 \ + -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 3 -order 3 -interpolate \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" + + +echo "-------------------" +echo "Good-Turing 4grams" +echo "-------------------" +ngram-count -lm $tgtdir/4gram.gt0111.gz \ + -gt1min 0 -gt2min 1 -gt3min 1 -gt4min 1 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0112.gz \ + -gt1min 0 -gt2min 1 -gt3min 1 -gt4min 2 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0122.gz \ + -gt1min 0 -gt2min 1 -gt3min 2 -gt4min 2 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0123.gz \ + -gt1min 0 -gt2min 1 -gt3min 2 -gt4min 3 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0113.gz \ + -gt1min 0 -gt2min 1 -gt3min 1 -gt4min 3 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0222.gz \ + -gt1min 0 -gt2min 2 -gt3min 2 -gt4min 2 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0223.gz \ + -gt1min 0 -gt2min 2 -gt3min 2 -gt4min 3 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" + +echo "-------------------" +echo "Kneser-Ney 4grams" +echo "-------------------" +ngram-count -lm $tgtdir/4gram.kn0111.gz \ + -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -kndiscount4 -gt4min 1 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0112.gz \ + -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -kndiscount4 -gt4min 2 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0113.gz \ + -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -kndiscount4 -gt4min 3 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0122.gz \ + -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 2 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0123.gz \ + -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 3 -order 4 \ + -text 
$tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0222.gz \ + -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 2 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0223.gz \ + -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 3 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" + +if [ ! -z ${LIBLBFGS} ]; then + #please note that if the switch -map-unk "$oov_symbol" is used with -maxent-convert-to-arpa, ngram-count will segfault + #instead of that, we simply output the model in the maxent format and convert it using the "ngram" + echo "-------------------" + echo "Maxent 3grams" + echo "-------------------" + sed 's/'${oov_symbol}'/<unk>/g' $tgtdir/train.txt | \ + ngram-count -lm - -order 3 -text - -vocab $tgtdir/vocab -unk -sort -maxent -maxent-convert-to-arpa|\ + ngram -lm - -order 3 -unk -map-unk "$oov_symbol" -prune-lowprobs -write-lm - |\ + sed 's/<unk>/'${oov_symbol}'/g' | gzip -c > $tgtdir/3gram.me.gz || exit 1 + + echo "-------------------" + echo "Maxent 4grams" + echo "-------------------" + sed 's/'${oov_symbol}'/<unk>/g' $tgtdir/train.txt | \ + ngram-count -lm - -order 4 -text - -vocab $tgtdir/vocab -unk -sort -maxent -maxent-convert-to-arpa|\ + ngram -lm - -order 4 -unk -map-unk "$oov_symbol" -prune-lowprobs -write-lm - |\ + sed 's/<unk>/'${oov_symbol}'/g' | gzip -c > $tgtdir/4gram.me.gz || exit 1 +else + echo >&2 "SRILM is not compiled with the support of MaxEnt models." + echo >&2 "You should use the script in \$KALDI_ROOT/tools/install_srilm.sh" + echo >&2 "which will take care of compiling the SRILM with MaxEnt support" + exit 1; +fi + + +echo "--------------------" +echo "Computing perplexity" +echo "--------------------" +( + for f in $tgtdir/3gram* ; do ( echo $f; ngram -order 3 -lm $f -unk -map-unk "$oov_symbol" -prune-lowprobs -ppl $tgtdir/dev.txt ) | paste -s -d ' ' ; done + for f in $tgtdir/4gram* ; do ( echo $f; ngram -order 4 -lm $f -unk -map-unk "$oov_symbol" -prune-lowprobs -ppl $tgtdir/dev.txt ) | paste -s -d ' ' ; done +) | sort -r -n -k 15,15g | column -t | tee $tgtdir/perplexities.txt + +echo "The perplexity scores report is stored in $tgtdir/perplexities.txt " +echo "" + +for best_ngram in {3,4}gram ; do + outlm=best_${best_ngram}.gz + lmfilename=$(grep "${best_ngram}" $tgtdir/perplexities.txt | head -n 1 | cut -f 1 -d ' ') + echo "$outlm -> $lmfilename" + (cd $tgtdir; rm -f $outlm; ln -sf $(basename $lmfilename) $outlm ) +done diff --git a/egs/chime5/s5b/local/wer_output_filter b/egs/chime5/s5b/local/wer_output_filter new file mode 100755 index 00000000000..6f4b6400716 --- /dev/null +++ b/egs/chime5/s5b/local/wer_output_filter @@ -0,0 +1,25 @@ +#!/bin/bash +# Copyright (c) 2017 Johns Hopkins University (Author: Yenda Trmal ) +# Apache 2.0 + + +## Filter for scoring of the STT results. Convert everything to lowercase +## and add some ad-hoc fixes for the hesitations + +perl -e ' + while(<>) { + @A = split(" ", $_); + $id = shift @A; print "$id "; + foreach $a (@A) { + print lc($a) . 
" " unless $a =~ /\[.*\]/; + } + print "\n"; + }' | \ +sed -e ' + s/\/hmm/g; + s/\/hmm/g; + s/\/hmm/g; +' + +#| uconv -f utf-8 -t utf-8 -x Latin-ASCII + diff --git a/egs/chime5/s5b/local/worn_audio_list b/egs/chime5/s5b/local/worn_audio_list new file mode 100644 index 00000000000..fc7a44ad77d --- /dev/null +++ b/egs/chime5/s5b/local/worn_audio_list @@ -0,0 +1,64 @@ +/export/corpora4/CHiME5/audio/train/S03_P09.wav +/export/corpora4/CHiME5/audio/train/S03_P10.wav +/export/corpora4/CHiME5/audio/train/S03_P11.wav +/export/corpora4/CHiME5/audio/train/S03_P12.wav +/export/corpora4/CHiME5/audio/train/S04_P09.wav +/export/corpora4/CHiME5/audio/train/S04_P10.wav +/export/corpora4/CHiME5/audio/train/S04_P11.wav +/export/corpora4/CHiME5/audio/train/S04_P12.wav +/export/corpora4/CHiME5/audio/train/S05_P13.wav +/export/corpora4/CHiME5/audio/train/S05_P14.wav +/export/corpora4/CHiME5/audio/train/S05_P15.wav +/export/corpora4/CHiME5/audio/train/S05_P16.wav +/export/corpora4/CHiME5/audio/train/S06_P13.wav +/export/corpora4/CHiME5/audio/train/S06_P14.wav +/export/corpora4/CHiME5/audio/train/S06_P15.wav +/export/corpora4/CHiME5/audio/train/S06_P16.wav +/export/corpora4/CHiME5/audio/train/S07_P17.wav +/export/corpora4/CHiME5/audio/train/S07_P18.wav +/export/corpora4/CHiME5/audio/train/S07_P19.wav +/export/corpora4/CHiME5/audio/train/S07_P20.wav +/export/corpora4/CHiME5/audio/train/S08_P21.wav +/export/corpora4/CHiME5/audio/train/S08_P22.wav +/export/corpora4/CHiME5/audio/train/S08_P23.wav +/export/corpora4/CHiME5/audio/train/S08_P24.wav +/export/corpora4/CHiME5/audio/train/S12_P33.wav +/export/corpora4/CHiME5/audio/train/S12_P34.wav +/export/corpora4/CHiME5/audio/train/S12_P35.wav +/export/corpora4/CHiME5/audio/train/S12_P36.wav +/export/corpora4/CHiME5/audio/train/S13_P33.wav +/export/corpora4/CHiME5/audio/train/S13_P34.wav +/export/corpora4/CHiME5/audio/train/S13_P35.wav +/export/corpora4/CHiME5/audio/train/S13_P36.wav +/export/corpora4/CHiME5/audio/train/S16_P21.wav +/export/corpora4/CHiME5/audio/train/S16_P22.wav +/export/corpora4/CHiME5/audio/train/S16_P23.wav +/export/corpora4/CHiME5/audio/train/S16_P24.wav +/export/corpora4/CHiME5/audio/train/S17_P17.wav +/export/corpora4/CHiME5/audio/train/S17_P18.wav +/export/corpora4/CHiME5/audio/train/S17_P19.wav +/export/corpora4/CHiME5/audio/train/S17_P20.wav +/export/corpora4/CHiME5/audio/train/S18_P41.wav +/export/corpora4/CHiME5/audio/train/S18_P42.wav +/export/corpora4/CHiME5/audio/train/S18_P43.wav +/export/corpora4/CHiME5/audio/train/S18_P44.wav +/export/corpora4/CHiME5/audio/train/S19_P49.wav +/export/corpora4/CHiME5/audio/train/S19_P50.wav +/export/corpora4/CHiME5/audio/train/S19_P51.wav +/export/corpora4/CHiME5/audio/train/S19_P52.wav +/export/corpora4/CHiME5/audio/train/S20_P49.wav +/export/corpora4/CHiME5/audio/train/S20_P50.wav +/export/corpora4/CHiME5/audio/train/S20_P51.wav +/export/corpora4/CHiME5/audio/train/S20_P52.wav +/export/corpora4/CHiME5/audio/train/S22_P41.wav +/export/corpora4/CHiME5/audio/train/S22_P42.wav +/export/corpora4/CHiME5/audio/train/S22_P43.wav +/export/corpora4/CHiME5/audio/train/S22_P44.wav +/export/corpora4/CHiME5/audio/train/S23_P53.wav +/export/corpora4/CHiME5/audio/train/S23_P54.wav +/export/corpora4/CHiME5/audio/train/S23_P55.wav +/export/corpora4/CHiME5/audio/train/S23_P56.wav +/export/corpora4/CHiME5/audio/train/S24_P53.wav +/export/corpora4/CHiME5/audio/train/S24_P54.wav +/export/corpora4/CHiME5/audio/train/S24_P55.wav +/export/corpora4/CHiME5/audio/train/S24_P56.wav diff --git a/egs/chime5/s5b/path.sh 
b/egs/chime5/s5b/path.sh new file mode 100644 index 00000000000..fb1c0489386 --- /dev/null +++ b/egs/chime5/s5b/path.sh @@ -0,0 +1,7 @@ +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C + diff --git a/egs/chime5/s5b/run.sh b/egs/chime5/s5b/run.sh new file mode 100755 index 00000000000..37bc5c2c94e --- /dev/null +++ b/egs/chime5/s5b/run.sh @@ -0,0 +1,297 @@ +#!/bin/bash +# +# Based mostly on the TED-LIUM and Switchboard recipe +# +# Copyright 2017 Johns Hopkins University (Author: Shinji Watanabe and Yenda Trmal) +# Apache 2.0 +# + +# Begin configuration section. +nj=96 +decode_nj=20 +stage=0 +nnet_stage=-10 +num_data_reps=4 +snrs="20:10:15:5:0" +foreground_snrs="20:10:15:5:0" +background_snrs="20:10:15:5:0" +enhancement=beamformit # for a new enhancement method, + # change this variable and stage 4 +# End configuration section +. ./utils/parse_options.sh + +. ./cmd.sh +. ./path.sh + + +set -e # exit on error + +# chime5 main directory path +# please change the path accordingly +chime5_corpus=/export/corpora4/CHiME5 +json_dir=${chime5_corpus}/transcriptions +audio_dir=${chime5_corpus}/audio + +# training and test data +train_set=train_worn_simu_u400k +test_sets="dev_${enhancement}_dereverb_ref" #"dev_worn dev_addition_dereverb_ref" +#test_sets="dev_${enhancement}_ref" #"dev_worn dev_addition_dereverb_ref" + +# This script also needs the phonetisaurus g2p, srilm, beamformit +./local/check_tools.sh || exit 1 + +if [ $stage -le 1 ]; then + # skip u03 as they are missing + for mictype in worn u01 u02 u04 u05 u06; do + local/prepare_data.sh --mictype ${mictype} \ + ${audio_dir}/train ${json_dir}/train data/train_${mictype} + done + for dataset in dev; do + for mictype in worn; do + local/prepare_data.sh --mictype ${mictype} \ + ${audio_dir}/${dataset} ${json_dir}/${dataset} \ + data/${dataset}_${mictype} + done + done +fi + +if [ $stage -le 2 ]; then + local/prepare_dict.sh + + utils/prepare_lang.sh \ + data/local/dict "<unk>" data/local/lang data/lang + + local/train_lms_srilm.sh \ + --train-text data/train_worn/text --dev-text data/dev_worn/text \ + --oov-symbol "<unk>" --words-file data/lang/words.txt \ + data/ data/srilm +fi + +LM=data/srilm/best_3gram.gz +if [ $stage -le 3 ]; then + # Compiles G for chime5 trigram LM + utils/format_lm.sh \ + data/lang $LM data/local/dict/lexicon.txt data/lang + +fi + +if [ $stage -le 4 ]; then + # Beamforming using reference arrays + # enhanced WAV directory + enhandir=enhan + dereverb_dir=${PWD}/wav/wpe/ + for dset in dev eval; do + for mictype in u01 u02 u03 u04 u06; do + local/run_wpe.sh --nj 4 --cmd "$train_cmd --mem 120G" \ + ${audio_dir}/${dset} \ + ${dereverb_dir}/${dset} \ + ${mictype} + done + done + + for dset in dev eval; do + for mictype in u01 u02 u03 u04 u06; do + local/run_beamformit.sh --cmd "$train_cmd" \ + ${dereverb_dir}/${dset} \ + ${enhandir}/${dset}_${enhancement}_${mictype} \ + ${mictype} + done + done + + for dset in dev eval; do + local/prepare_data.sh --mictype ref "$PWD/${enhandir}/${dset}_${enhancement}_u0*" \ + ${json_dir}/${dset} data/${dset}_${enhancement}_dereverb_ref + done +fi + +if [ $stage -le 5 ]; then + # remove possibly bad sessions (P11_S03, P52_S19, P53_S24, P54_S24) + # see 
http://spandh.dcs.shef.ac.uk/chime_challenge/data.html for more details + utils/copy_data_dir.sh data/train_worn data/train_worn_org # back up + grep -v -e "^P11_S03" -e "^P52_S19" -e "^P53_S24" -e "^P54_S24" data/train_worn_org/text > data/train_worn/text + utils/fix_data_dir.sh data/train_worn +fi + +if [ $stage -le 6 ]; then + local/extract_noises.py $chime5_corpus/audio/train $chime5_corpus/transcriptions/train \ + local/distant_audio_list distant_noises + local/make_noise_list.py distant_noises > distant_noise_list + + noise_list=distant_noise_list + + if [ ! -d RIRS_NOISES/ ]; then + # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises + wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip + unzip rirs_noises.zip + fi + + # This is the config for the system using simulated RIRs and point-source noises + rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/smallroom/rir_list") + rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/mediumroom/rir_list") + rvb_opts+=(--noise-set-parameters $noise_list) + + steps/data/reverberate_data_dir.py \ + "${rvb_opts[@]}" \ + --prefix "rev" \ + --foreground-snrs $foreground_snrs \ + --background-snrs $background_snrs \ + --speech-rvb-probability 1 \ + --pointsource-noise-addition-probability 1 \ + --isotropic-noise-addition-probability 1 \ + --num-replications $num_data_reps \ + --max-noises-per-minute 1 \ + --source-sampling-rate 16000 \ + data/train_worn data/train_worn_rvb +fi + +if [ $stage -le 7 ]; then + # combine mix array and worn mics + # randomly extract first 100k utterances from all mics + # if you want to include more training data, you can increase the number of array mic utterances + utils/combine_data.sh data/train_uall data/train_u01 data/train_u02 data/train_u04 data/train_u05 data/train_u06 + utils/subset_data_dir.sh data/train_uall 400000 data/train_u400k + utils/combine_data.sh data/${train_set} data/train_worn data/train_worn_rvb data/train_u400k + + # only use left channel for worn mic recognition + # you can use both left and right channels for training + for dset in train dev; do + utils/copy_data_dir.sh data/${dset}_worn data/${dset}_worn_stereo + grep "\.L-" data/${dset}_worn_stereo/text > data/${dset}_worn/text + utils/fix_data_dir.sh data/${dset}_worn + done +fi + +if [ $stage -le 8 ]; then + # fix speaker ID issue (thanks to Dr. Naoyuki Kanda) + # add array ID to the speaker ID to avoid the use of other array information to meet regulations + # Before this fix + # $ head -n 2 data/eval_beamformit_ref_nosplit/utt2spk + # P01_S01_U02_KITCHEN.ENH-0000192-0001278 P01 + # P01_S01_U02_KITCHEN.ENH-0001421-0001481 P01 + # After this fix + # $ head -n 2 data/eval_beamformit_ref_nosplit_fix/utt2spk + # P01_S01_U02_KITCHEN.ENH-0000192-0001278 P01_U02 + # P01_S01_U02_KITCHEN.ENH-0001421-0001481 P01_U02 + for dset in dev_${enhancement}_dereverb_ref eval_${enhancement}_dereverb_ref; do + utils/copy_data_dir.sh data/${dset} data/${dset}_nosplit + mkdir -p data/${dset}_nosplit_fix + cp data/${dset}_nosplit/{segments,text,wav.scp} data/${dset}_nosplit_fix/ + awk -F "_" '{print $0 "_" $3}' data/${dset}_nosplit/utt2spk > data/${dset}_nosplit_fix/utt2spk + utils/utt2spk_to_spk2utt.pl data/${dset}_nosplit_fix/utt2spk > data/${dset}_nosplit_fix/spk2utt + done + + # Split speakers up into 3-minute chunks. This doesn't hurt adaptation, and + # lets us use more jobs for decoding etc. 
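  # Illustration only (not part of the recipe): the awk command in the speaker-ID
  # fix above appends the array ID, i.e. the third "_"-separated field of the
  # utterance ID, to the speaker ID. A quick sanity check on a single utt2spk line:
  #   echo "P01_S01_U02_KITCHEN.ENH-0000192-0001278 P01" | awk -F "_" '{print $0 "_" $3}'
  #   # -> P01_S01_U02_KITCHEN.ENH-0000192-0001278 P01_U02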
+ for dset in ${train_set} dev_worn; do + utils/copy_data_dir.sh data/${dset} data/${dset}_nosplit + utils/data/modify_speaker_info.sh --seconds-per-spk-max 180 data/${dset}_nosplit data/${dset} + done + for dset in dev_${enhancement}_dereverb_ref eval_${enhancement}_dereverb_ref; do + utils/data/modify_speaker_info.sh --seconds-per-spk-max 180 data/${dset}_nosplit_fix data/${dset} + done +fi + +if [ $stage -le 8 ]; then + # Now make MFCC features. + # mfccdir should be some place with a largish disk where you + # want to store MFCC features. + mfccdir=mfcc + for x in ${train_set} ${test_sets}; do + steps/make_mfcc.sh --nj 20 --cmd "$train_cmd" \ + data/$x exp/make_mfcc/$x $mfccdir + steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir + utils/fix_data_dir.sh data/$x + done +fi + +if [ $stage -le 9 ]; then + # make a subset for monophone training + utils/subset_data_dir.sh --shortest data/${train_set} 100000 data/${train_set}_100kshort + utils/subset_data_dir.sh data/${train_set}_100kshort 30000 data/${train_set}_30kshort +fi + +if [ $stage -le 10 ]; then + # Starting basic training on MFCC features + steps/train_mono.sh --nj $nj --cmd "$train_cmd" \ + data/${train_set}_30kshort data/lang exp/mono +fi + +if [ $stage -le 11 ]; then + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/${train_set} data/lang exp/mono exp/mono_ali + + steps/train_deltas.sh --cmd "$train_cmd" \ + 2500 30000 data/${train_set} data/lang exp/mono_ali exp/tri1 +fi + +if [ $stage -le 12 ]; then + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/${train_set} data/lang exp/tri1 exp/tri1_ali + + steps/train_lda_mllt.sh --cmd "$train_cmd" \ + 4000 50000 data/${train_set} data/lang exp/tri1_ali exp/tri2 +fi + +if [ $stage -le 13 ]; then + utils/mkgraph.sh data/lang exp/tri2 exp/tri2/graph + for dset in ${test_sets}; do + steps/decode.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \ + exp/tri2/graph data/${dset} exp/tri2/decode_${dset} & + done + wait +fi + +if [ $stage -le 14 ]; then + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/${train_set} data/lang exp/tri2 exp/tri2_ali + + steps/train_sat.sh --cmd "$train_cmd" \ + 5000 100000 data/${train_set} data/lang exp/tri2_ali exp/tri3 +fi + +if [ $stage -le 15 ]; then + utils/mkgraph.sh data/lang exp/tri3 exp/tri3/graph + for dset in ${test_sets}; do + steps/decode_fmllr.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \ + exp/tri3/graph data/${dset} exp/tri3/decode_${dset} & + done + wait +fi + +if [ $stage -le 16 ]; then + # The following script cleans the data and produces cleaned data + steps/cleanup/clean_and_segment_data.sh --nj ${nj} --cmd "$train_cmd" \ + --segmentation-opts "--min-segment-length 0.3 --min-new-segment-length 0.6" \ + data/${train_set} data/lang exp/tri3 exp/tri3_cleaned data/${train_set}_cleaned +fi + +if [ $stage -le 17 ]; then + # chain TDNN + local/chain/tuning/run_tdnn_1b.sh --nj ${nj} \ + --stage $nnet_stage \ + --train-set ${train_set}_cleaned \ + --test-sets "$test_sets" \ + --gmm tri3_cleaned --nnet3-affix _${train_set}_cleaned_rvb +fi + +if [ $stage -le 18 ]; then + # 2-stage decoding + for test_set in $test_sets; do + local/nnet3/decode.sh --affix 2stage --pass2-decode-opts "--min-active 1000" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk 150 --nj $decode_nj \ + --ivector-dir exp/nnet3_${train_set}_cleaned_rvb \ + data/${test_set} data/lang_chain \ + exp/chain_${train_set}_cleaned_rvb/tree_sp/graph \ + exp/chain_${train_set}_cleaned_rvb/tdnn1b_sp + done +fi + +if [ $stage -le 19 ]; 
then + # final scoring to get the official challenge result + # please specify both dev and eval set directories so that the search parameters + # (insertion penalty and language model weight) will be tuned using the dev set + local/score_for_submit.sh \ + --dev exp/chain_${train_set}_cleaned_rvb/tdnn1b_sp/decode_dev_${enhancement}_dereverb_ref \ + --eval exp/chain_${train_set}_cleaned_rvb/tdnn1b_sp/decode_eval_${enhancement}_dereverb_ref +fi diff --git a/egs/chime5/s5b/steps b/egs/chime5/s5b/steps new file mode 120000 index 00000000000..1b186770dd1 --- /dev/null +++ b/egs/chime5/s5b/steps @@ -0,0 +1 @@ +../../wsj/s5/steps/ \ No newline at end of file diff --git a/egs/chime5/s5b/utils b/egs/chime5/s5b/utils new file mode 120000 index 00000000000..a3279dc8679 --- /dev/null +++ b/egs/chime5/s5b/utils @@ -0,0 +1 @@ +../../wsj/s5/utils/ \ No newline at end of file diff --git a/egs/chime6/README.txt b/egs/chime6/README.txt new file mode 100644 index 00000000000..9fb48c26822 --- /dev/null +++ b/egs/chime6/README.txt @@ -0,0 +1,6 @@ +This is a kaldi recipe for the 6th CHiME Speech Separation and Recognition Challenge (CHiME-6). + +See http://spandh.dcs.shef.ac.uk/chime_challenge/ for more detailed information. + +s5_track1 : Track 1 of the challenge (oracle segments and speaker label is provided) +s5_track2 : Track 2 of the challenge (only raw audio is provided) diff --git a/egs/chime6/s5_track1/RESULTS b/egs/chime6/s5_track1/RESULTS new file mode 100644 index 00000000000..73b47ddf3cc --- /dev/null +++ b/egs/chime6/s5_track1/RESULTS @@ -0,0 +1,21 @@ + +# tri2 +%WER 88.52 [ 52121 / 58881, 2023 ins, 30285 del, 19813 sub ] exp/tri2/decode_dev_gss/wer_17_0.5 + +# tri3 +%WER 85.72 [ 50471 / 58881, 3079 ins, 23787 del, 23605 sub ] exp/tri3/decode_dev_gss/wer_17_0.5 + +# nnet3 tdnn+chain +%WER 41.21 [ 24267 / 58881, 2428 ins, 7606 del, 14233 sub ] exp/chain_train_worn_simu_u400k_cleaned_rvb/tdnn1b_sp/decode_dev_worn_2stage/wer_11_0.0 +%WER 51.76 [ 30474 / 58881, 2665 ins, 11749 del, 16060 sub ] exp/chain_train_worn_simu_u400k_cleaned_rvb/tdnn1b_sp/decode_dev_gss_multiarray_2stage/wer_10_0.0 + +# result with the challenge submission format (Nov 17, 2019) +# after the fix of speaker ID across arrays +==== development set ==== +session S02 room DINING: #words 8288, #errors 4459, wer 53.80 % +session S02 room KITCHEN: #words 12696, #errors 7170, wer 56.47 % +session S02 room LIVING: #words 15460, #errors 7388, wer 47.78 % +session S09 room DINING: #words 5766, #errors 3100, wer 53.76 % +session S09 room KITCHEN: #words 8911, #errors 4483, wer 50.30 % +session S09 room LIVING: #words 7760, #errors 3874, wer 49.92 % +overall: #words 58881, #errors 30474, wer 51.75 % diff --git a/egs/chime6/s5_track1/cmd.sh b/egs/chime6/s5_track1/cmd.sh new file mode 100644 index 00000000000..9702501f1a7 --- /dev/null +++ b/egs/chime6/s5_track1/cmd.sh @@ -0,0 +1,15 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. 
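# For example (an illustrative sketch, not part of the original file), running
# everything on the local machine with no queueing system would simply be:
#   export train_cmd="run.pl"
#   export decode_cmd="run.pl"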
Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="retry.pl queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" + diff --git a/egs/chime6/s5_track1/conf/beamformit.cfg b/egs/chime6/s5_track1/conf/beamformit.cfg new file mode 100755 index 00000000000..70fdd858651 --- /dev/null +++ b/egs/chime6/s5_track1/conf/beamformit.cfg @@ -0,0 +1,50 @@ +#BeamformIt sample configuration file for AMI data (http://groups.inf.ed.ac.uk/ami/download/) + +# scrolling size to compute the delays +scroll_size = 250 + +# cross correlation computation window size +window_size = 500 + +#amount of maximum points for the xcorrelation taken into account +nbest_amount = 4 + +#flag whether to apply an automatic noise thresholding +do_noise_threshold = 1 + +#Percentage of frames with lower xcorr taken as noisy +noise_percent = 10 + +######## acoustic modelling parameters + +#transition probabilities weight for multichannel decoding +trans_weight_multi = 25 +trans_weight_nbest = 25 + +### + +#flag whether to print the features after setting them, or not +print_features = 1 + +#flag whether to use the bad frames in the sum process +do_avoid_bad_frames = 1 + +#flag to use the best channel (SNR) as a reference +#defined from command line +do_compute_reference = 1 + +#flag whether to use a uem file or not (process all the file) +do_use_uem_file = 0 + +#flag whether to use an adaptive weights scheme or fixed weights +do_adapt_weights = 1 + +#flag whether to output the sph files or just run the system to create the auxiliary files +do_write_sph_files = 1 + +####directories where to store/retrieve info#### +#channels_file = ./cfg-files/channels + +#show needs to be passed as argument normally, here a default one is given just in case +#show_id = Ttmp + diff --git a/egs/chime6/s5_track1/conf/mfcc.conf b/egs/chime6/s5_track1/conf/mfcc.conf new file mode 100644 index 00000000000..32988403b00 --- /dev/null +++ b/egs/chime6/s5_track1/conf/mfcc.conf @@ -0,0 +1,2 @@ +--use-energy=false +--sample-frequency=16000 diff --git a/egs/chime6/s5_track1/conf/mfcc_hires.conf b/egs/chime6/s5_track1/conf/mfcc_hires.conf new file mode 100644 index 00000000000..fd64b62eb16 --- /dev/null +++ b/egs/chime6/s5_track1/conf/mfcc_hires.conf @@ -0,0 +1,10 @@ +# config for high-resolution MFCC features, intended for neural network training. +# Note: we keep all cepstra, so it has the same info as filterbank features, +# but MFCC is more easily compressible (because less correlated) which is why +# we prefer this method. +--use-energy=false # use average of log energy, not energy. 
+--sample-frequency=16000 +--num-mel-bins=40 +--num-ceps=40 +--low-freq=40 +--high-freq=-400 diff --git a/egs/chime6/s5_track1/conf/online_cmvn.conf b/egs/chime6/s5_track1/conf/online_cmvn.conf new file mode 100644 index 00000000000..7748a4a4dd3 --- /dev/null +++ b/egs/chime6/s5_track1/conf/online_cmvn.conf @@ -0,0 +1 @@ +# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh diff --git a/egs/chime6/s5_track1/conf/queue.conf b/egs/chime6/s5_track1/conf/queue.conf new file mode 100644 index 00000000000..73103195684 --- /dev/null +++ b/egs/chime6/s5_track1/conf/queue.conf @@ -0,0 +1,10 @@ +command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* +option mem=* -l mem_free=$0,ram_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -pe smp $0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l hostname='!b19*' +option gpu=* -l gpu=$0 -q g.q -l hostname='!b19*' + diff --git a/egs/chime6/s5_track1/local/add_location_to_uttid.sh b/egs/chime6/s5_track1/local/add_location_to_uttid.sh new file mode 100755 index 00000000000..91bd0c0dd37 --- /dev/null +++ b/egs/chime6/s5_track1/local/add_location_to_uttid.sh @@ -0,0 +1,37 @@ +#!/bin/bash +# Author: Ashish Arora +# Apache 2.0 + +. ./cmd.sh +. ./path.sh + +enhancement=gss +. utils/parse_options.sh || exit 1; + +if [ $# != 3 ]; then + echo "Wrong #arguments ($#, expected 3)" + echo "Usage: local/add_location_to_uttid.sh [options] " + echo " " + echo "main options (for others, see top of script file)" + echo " --enhancement # enhancement type (gss or beamformit)" + exit 1; +fi + +jdir=$1 +puttdir=$2 +utt_loc_file=$3 + +# Set bash to 'debug' mode, it will exit on : +# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', +set -e +set -u +set -o pipefail + +if [[ ${enhancement} == *gss* ]]; then + local/get_location.py $jdir > $utt_loc_file + local/replace_uttid.py $utt_loc_file $puttdir/per_utt > $puttdir/per_utt_loc +fi + +if [[ ${enhancement} == *beamformit* ]]; then + cat $puttdir/per_utt > $puttdir/per_utt_loc +fi diff --git a/egs/chime5/s5/local/chain/compare_wer.sh b/egs/chime6/s5_track1/local/chain/compare_wer.sh similarity index 100% rename from egs/chime5/s5/local/chain/compare_wer.sh rename to egs/chime6/s5_track1/local/chain/compare_wer.sh diff --git a/egs/tedlium/s5_r3/local/chain/run_tdnnf.sh b/egs/chime6/s5_track1/local/chain/run_tdnn.sh similarity index 100% rename from egs/tedlium/s5_r3/local/chain/run_tdnnf.sh rename to egs/chime6/s5_track1/local/chain/run_tdnn.sh diff --git a/egs/chime6/s5_track1/local/chain/tuning/run_tdnn_1a.sh b/egs/chime6/s5_track1/local/chain/tuning/run_tdnn_1a.sh new file mode 100755 index 00000000000..daad37e2cd7 --- /dev/null +++ b/egs/chime6/s5_track1/local/chain/tuning/run_tdnn_1a.sh @@ -0,0 +1,270 @@ +#!/bin/bash + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=96 +train_set=train_worn_u100k +test_sets="dev_worn dev_beamformit_ref" +gmm=tri3 +nnet3_affix=_train_worn_u100k +lm_suffix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. 
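# A usage sketch (not from the original script): thanks to the utils/parse_options.sh
# call further below, every variable in this configuration section can be overridden
# on the command line, with '_' in the variable name written as '-', e.g.:
#   local/chain/tuning/run_tdnn_1a.sh --stage 14 --train-set train_worn_u100k \
#     --gmm tri3 --nnet3-affix _train_worn_u100k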
+affix=1a # affix for the TDNN directory name +tree_affix= +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +# training options +# training chunk-options +chunk_width=140,100,160 +common_egs_dir= +xent_regularize=0.1 + +# training options +srand=0 +remove_egs=true + +#decode options +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj ${nj} --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 13 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.05" + output_opts="l2-regularize=0.01 bottleneck-dim=320" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 $opts dim=512 + relu-batchnorm-layer name=tdnn2 $opts dim=512 input=Append(-1,0,1) + relu-batchnorm-layer name=tdnn3 $opts dim=512 + relu-batchnorm-layer name=tdnn4 $opts dim=512 input=Append(-1,0,1) + relu-batchnorm-layer name=tdnn5 $opts dim=512 + relu-batchnorm-layer name=tdnn6 $opts dim=512 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn7 $opts dim=512 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn8 $opts dim=512 input=Append(-6,-3,0) + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain $opts dim=512 target-rms=0.5 + output-layer name=output include-log-softmax=false $output_opts dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn8 $opts dim=512 target-rms=0.5 + output-layer name=output-xent $output_opts dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/chime5-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=10 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=4 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=256,128,64 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang${lm_suffix}/ \ + $tree_dir $tree_dir/graph${lm_suffix} || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + steps/nnet3/decode.sh \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk $frames_per_chunk \ + --nj 8 --cmd "$decode_cmd" --num-threads 4 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${data}_hires \ + $tree_dir/graph${lm_suffix} data/${data}_hires ${dir}/decode${lm_suffix}_${data} || exit 1 + ) || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 +fi + +# Not testing the 'looped' decoding separately, because for +# TDNN systems it would give exactly the same results as the +# normal decoding. + +if $test_online_decoding && [ $stage -le 17 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. 
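  # For instance (a sketch only; the pitch-related option names are assumed from
  # other Kaldi online recipes, not taken from this patch), adding pitch would
  # look roughly like:
  #   steps/online/nnet3/prepare_online_decoding.sh \
  #     --mfcc-config conf/mfcc_hires.conf \
  #     --add-pitch true --online-pitch-config conf/online_pitch.conf \
  #     $lang exp/nnet3${nnet3_affix}/extractor ${dir} ${dir}_online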
+ steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3${nnet3_affix}/extractor ${dir} ${dir}_online + + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l 2776 combine=-0.134->-0.133 (over 3) xent:train/valid[285,428,final]=(-2.37,-1.95,-1.95/-2.19,-1.90,-1.91) logprob:train/valid[285,428,final]=(-0.201,-0.125,-0.124/-0.198,-0.147,-0.148) + +set -e + +# configs for 'chain' +stage=0 +nj=96 +train_set=train_worn_u400k +test_sets="dev_worn dev_beamformit_ref" +gmm=tri3 +nnet3_affix=_train_worn_u400k +lm_suffix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=1b # affix for the TDNN directory name +tree_affix= +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +num_epochs=4 +common_egs_dir= +# training options +# training chunk-options +chunk_width=140,100,160 +xent_regularize=0.1 +dropout_schedule='0,0@0.20,0.5@0.50,0' + +# training options +srand=0 +remove_egs=true + +#decode options +test_online_decoding=false # if true, it will run the last decoding stage. +skip_decoding=true +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj ${nj} --cmd "$train_cmd" --generate-ali-from-lats true \ + ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $lat_dir $tree_dir +fi + +if [ $stage -le 13 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + affine_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true" + tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.01" + output_opts="l2-regularize=0.002" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $affine_opts dim=1536 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf14 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf15 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + linear-component name=prefinal-l dim=256 $linear_opts + + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/chime5-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$train_cmd --mem 4G" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule "$dropout_schedule" \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.00025 \ + --trainer.optimization.final-effective-lrate 0.000025 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang${lm_suffix}/ \ + $tree_dir $tree_dir/graph${lm_suffix} || exit 1; +fi + +if [ $stage -le 16 ] && [[ $skip_decoding == "false" ]]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + steps/nnet3/decode.sh \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk $frames_per_chunk \ + --nj 8 --cmd "$decode_cmd" --num-threads 4 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${data}_hires \ + $tree_dir/graph${lm_suffix} data/${data}_hires ${dir}/decode${lm_suffix}_${data} || exit 1 + ) || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 +fi + +exit 0; diff --git a/egs/chime6/s5_track1/local/check_tools.sh b/egs/chime6/s5_track1/local/check_tools.sh new file mode 100755 index 00000000000..8e80e25ca33 --- /dev/null +++ b/egs/chime6/s5_track1/local/check_tools.sh @@ -0,0 +1,76 @@ +#!/bin/bash -u + +# Copyright 2015 (c) Johns Hopkins University (Jan Trmal ) + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + +[ -f ./path.sh ] && . ./path.sh + +command -v uconv &>/dev/null \ + || { echo >&2 "uconv not found on PATH. You will have to install ICU4C"; exit 1; } + +command -v ngram &>/dev/null \ + || { echo >&2 "srilm not found on PATH. 
Please use the script $KALDI_ROOT/tools/extras/install_srilm.sh to install it"; exit 1; } + +if [ -z ${LIBLBFGS} ]; then + echo >&2 "SRILM is not compiled with the support of MaxEnt models." + echo >&2 "You should use the script in \$KALDI_ROOT/tools/install_srilm.sh" + echo >&2 "which will take care of compiling the SRILM with MaxEnt support" + exit 1; +fi + +sox=`command -v sox 2>/dev/null` \ + || { echo >&2 "sox not found on PATH. Please install it manually (you will need version 14.4.0 and higher)."; exit 1; } + +# If sox is found on path, check if the version is correct +if [ ! -z "$sox" ]; then + sox_version=`$sox --version 2>&1| head -1 | sed -e 's?.*: ??' -e 's?.* ??'` + if [[ ! $sox_version =~ v14.4.* ]]; then + echo "Unsupported sox version $sox_version found on path. You will need version v14.4.0 and higher." + exit 1 + fi +fi + +command -v phonetisaurus-align &>/dev/null \ + || { echo >&2 "Phonetisaurus not found on PATH. Please use the script $KALDI_ROOT/tools/extras/install_phonetisaurus.sh to install it"; exit 1; } + +command -v BeamformIt &>/dev/null \ + || { echo >&2 "BeamformIt not found on PATH. Please use the script $KALDI_ROOT/tools/extras/install_beamformit.sh to install it"; exit 1; } + +miniconda_dir=$HOME/miniconda3/ +if [ ! -d $miniconda_dir ]; then + echo "$miniconda_dir does not exist. Please run '../../../tools/extras/install_miniconda.sh'" +fi + +# check if WPE is installed +result=`$miniconda_dir/bin/python -c "\ +try: + import nara_wpe + print('1') +except ImportError: + print('0')"` + +if [ "$result" != "1" ]; then + echo "WPE is not installed. Please run ../../../tools/extras/install_wpe.sh" + exit 1 +fi + +# this is used for the audio synchronization +sox_conda=`command -v ${miniconda_dir}/bin/sox 2>/dev/null` +if [ -z "${sox_conda}" ]; then + echo "install conda sox (v14.4.2)" + ${miniconda_dir}/bin/conda install -c conda-forge sox +fi + +exit 0 diff --git a/egs/chime6/s5_track1/local/copy_lat_dir_parallel.sh b/egs/chime6/s5_track1/local/copy_lat_dir_parallel.sh new file mode 100755 index 00000000000..82839604c9e --- /dev/null +++ b/egs/chime6/s5_track1/local/copy_lat_dir_parallel.sh @@ -0,0 +1,97 @@ +#!/bin/bash + +cmd=queue.pl +nj=40 +stage=0 +speed_perturb=true + +. ./path.sh +. utils/parse_options.sh + +if [ $# -ne 4 ]; then + echo "Usage: $0 <utt-map> <data-dir> <src-lat-dir> <dest-lat-dir>" + exit 1 +fi + +utt_map=$1 +data=$2 +srcdir=$3 +dir=$4 + +mkdir -p $dir + +cp $srcdir/{phones.txt,tree,final.mdl} $dir || exit 1 +cp $srcdir/{final.alimdl,final.occs,splice_opts,cmvn_opts,delta_opts,final.mat,full.mat} $dir 2>/dev/null || true + +nj_src=$(cat $srcdir/num_jobs) || exit 1 + +if [ $stage -le 1 ]; then + $cmd JOB=1:$nj_src $dir/log/copy_lats_orig.JOB.log \ + lattice-copy "ark:gunzip -c $srcdir/lat.JOB.gz |" \ + ark,scp:$dir/lat_orig.JOB.ark,$dir/lat_orig.JOB.scp || exit 1 +fi + +for n in $(seq $nj_src); do + cat $dir/lat_orig.$n.scp +done > $dir/lat_orig.scp || exit 1 + +if $speed_perturb; then + for s in 0.9 1.1; do + awk -v s=$s '{print "sp"s"-"$1" sp"s"-"$2}' $utt_map + done | cat - $utt_map | sort -k1,1 > $dir/utt_map + utt_map=$dir/utt_map +fi + +if [ $stage -le 2 ]; then + utils/filter_scp.pl -f 2 $dir/lat_orig.scp < $utt_map | \ + utils/apply_map.pl -f 2 $dir/lat_orig.scp > \ + $dir/lat.scp || exit 1 + + if [ ! -s $dir/lat.scp ]; then + echo "$0: $dir/lat.scp is empty. Something went wrong!" 
+ exit 1 + fi +fi + +utils/split_data.sh $data $nj + +if [ $stage -le 3 ]; then + $cmd JOB=1:$nj $dir/log/copy_lats.JOB.log \ + lattice-copy "scp:utils/filter_scp.pl $data/split$nj/JOB/utt2spk $dir/lat.scp |" \ + "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1 +fi + +echo $nj > $dir/num_jobs + +if [ -f $srcdir/ali.1.gz ]; then + if [ $stage -le 4 ]; then + $cmd JOB=1:$nj_src $dir/log/copy_ali_orig.JOB.log \ + copy-int-vector "ark:gunzip -c $srcdir/ali.JOB.gz |" \ + ark,scp:$dir/ali_orig.JOB.ark,$dir/ali_orig.JOB.scp || exit 1 + fi + + for n in $(seq $nj_src); do + cat $dir/ali_orig.$n.scp + done > $dir/ali_orig.scp || exit 1 + + if [ $stage -le 5 ]; then + utils/filter_scp.pl -f 2 $dir/ali_orig.scp < $utt_map | \ + utils/apply_map.pl -f 2 $dir/ali_orig.scp > \ + $dir/ali.scp || exit 1 + + if [ ! -s $dir/ali.scp ]; then + echo "$0: $dir/ali.scp is empty. Something went wrong!" + exit 1 + fi + fi + + utils/split_data.sh $data $nj + + if [ $stage -le 6 ]; then + $cmd JOB=1:$nj $dir/log/copy_ali.JOB.log \ + copy-int-vector "scp:utils/filter_scp.pl $data/split$nj/JOB/utt2spk $dir/ali.scp |" \ + "ark:|gzip -c > $dir/ali.JOB.gz" || exit 1 + fi +fi + +rm $dir/lat_orig.*.{ark,scp} $dir/ali_orig.*.{ark,scp} 2>/dev/null || true diff --git a/egs/chime6/s5_track1/local/decode.sh b/egs/chime6/s5_track1/local/decode.sh new file mode 100755 index 00000000000..7283a171000 --- /dev/null +++ b/egs/chime6/s5_track1/local/decode.sh @@ -0,0 +1,253 @@ +#!/bin/bash +# +# Based mostly on the TED-LIUM and Switchboard recipe +# +# Copyright 2017 Johns Hopkins University (Author: Shinji Watanabe and Yenda Trmal) +# Apache 2.0 +# +# This script only performs recognition experiments with evaluation data +# This script can be run from run.sh or standalone.  +# To run it standalone, you can download a pretrained chain ASR model using: +# wget http://kaldi-asr.org/models/12/0012_asr_v1.tar.gz +# Once it is downloaded, extract using: tar -xvzf 0012_asr_v1.tar.gz +# and copy the contents of the {data/ exp/} directory to your {data/ exp/} + +# Begin configuration section. +decode_nj=20 +gss_nj=50 +stage=0 +enhancement=gss # for a new enhancement method, + # change this variable and stage 4 + +# training data +train_set=train_worn_simu_u400k +# End configuration section +. ./utils/parse_options.sh + +. ./cmd.sh +. ./path.sh + + +set -e # exit on error + +# chime5 main directory path +# please change the path accordingly +chime5_corpus=/export/corpora4/CHiME5 +# chime6 data directories, which are generated from ${chime5_corpus}, +# to synchronize audio files across arrays and modify the annotation (JSON) file accordingly +chime6_corpus=${PWD}/CHiME6 +json_dir=${chime6_corpus}/transcriptions +audio_dir=${chime6_corpus}/audio + +enhanced_dir=enhanced +if [[ ${enhancement} == *gss* ]]; then + enhanced_dir=${enhanced_dir}_multiarray + enhancement=${enhancement}_multiarray +fi + +if [[ ${enhancement} == *beamformit* ]]; then + enhanced_dir=${enhanced_dir} + enhancement=${enhancement} +fi + +enhanced_dir=$(utils/make_absolute.sh $enhanced_dir) || exit 1 +test_sets="dev_${enhancement} eval_${enhancement}" + +# This script also needs the phonetisaurus g2p, srilm, beamformit +./local/check_tools.sh || exit 1 + +########################################################################### +# We first generate the synchronized audio files across arrays and +# corresponding JSON files. 
Note that this requires sox v14.4.2, +# which is installed via miniconda in ./local/check_tools.sh +########################################################################### + +if [ $stage -le 0 ]; then + local/generate_chime6_data.sh \ + --cmd "$train_cmd" \ + ${chime5_corpus} \ + ${chime6_corpus} +fi + +######################################################################################### +# In stage 1, we perform GSS based enhancement or beamformit for the test sets. multiarray = true +#can take around 10hrs for dev and eval set. +######################################################################################### + +if [ $stage -le 1 ] && [[ ${enhancement} == *gss* ]]; then + echo "$0: enhance data..." + # Guided Source Separation (GSS) from Paderborn University + # http://spandh.dcs.shef.ac.uk/chime_workshop/papers/CHiME_2018_paper_boeddecker.pdf + # @Article{PB2018CHiME5, + # author = {Boeddeker, Christoph and Heitkaemper, Jens and Schmalenstroeer, Joerg and Drude, Lukas and Heymann, Jahn and Haeb-Umbach, Reinhold}, + # title = {{Front-End Processing for the CHiME-5 Dinner Party Scenario}}, + # year = {2018}, + # booktitle = {CHiME5 Workshop}, + # } + + if [ ! -d pb_chime5/ ]; then + local/install_pb_chime5.sh + fi + + if [ ! -f pb_chime5/cache/chime6.json ]; then + ( + cd pb_chime5 + miniconda_dir=$HOME/miniconda3/ + export PATH=$miniconda_dir/bin:$PATH + export CHIME6_DIR=$chime6_corpus + make cache/chime6.json + ) + fi + + for dset in dev eval; do + local/run_gss.sh \ + --cmd "$train_cmd --max-jobs-run $gss_nj" --nj 160 \ + ${dset} \ + ${enhanced_dir} \ + ${enhanced_dir} || exit 1 + done + + for dset in dev eval; do + local/prepare_data.sh --mictype gss ${enhanced_dir}/audio/${dset} \ + ${json_dir}/${dset} data/${dset}_${enhancement} || exit 1 + done +fi + +####################################################################### +# Prepare the dev and eval data with dereverberation (WPE) and +# beamforming. +####################################################################### + +if [ $stage -le 1 ] && [[ ${enhancement} == *beamformit* ]]; then + # Beamforming using reference arrays + # enhanced WAV directory + enhanced_dir=enhan + dereverb_dir=${PWD}/wav/wpe/ + for dset in dev eval; do + for mictype in u01 u02 u03 u04 u05 u06; do + local/run_wpe.sh --nj 4 --cmd "$train_cmd --mem 20G" \ + ${audio_dir}/${dset} \ + ${dereverb_dir}/${dset} \ + ${mictype} + done + done + + for dset in dev eval; do + for mictype in u01 u02 u03 u04 u05 u06; do + local/run_beamformit.sh --cmd "$train_cmd" \ + ${dereverb_dir}/${dset} \ + ${enhanced_dir}/${dset}_${enhancement}_${mictype} \ + ${mictype} + done + done + + for dset in dev eval; do + local/prepare_data.sh --mictype ref "$PWD/${enhanced_dir}/${dset}_${enhancement}_u0*" \ + ${json_dir}/${dset} data/${dset}_${enhancement} + done +fi + +# In GSS enhancement, we do not have array information in utterance ID +if [ $stage -le 2 ] && [[ ${enhancement} == *gss* ]]; then + # Split speakers up into 3-minute chunks. This doesn't hurt adaptation, and + # lets us use more jobs for decoding etc. + for dset in ${test_sets}; do + utils/copy_data_dir.sh data/${dset} data/${dset}_orig + done + + for dset in ${test_sets}; do + utils/data/modify_speaker_info.sh --seconds-per-spk-max 180 data/${dset}_orig data/${dset} + done +fi + +if [ $stage -le 2 ] && [[ ${enhancement} == *beamformit* ]]; then + # fix speaker ID issue (thanks to Dr. 
Naoyuki Kanda) + # add array ID to the speaker ID to avoid the use of other array information to meet regulations + # Before this fix + # $ head -n 2 data/eval_beamformit_ref_nosplit/utt2spk + # P01_S01_U02_KITCHEN.ENH-0000192-0001278 P01 + # P01_S01_U02_KITCHEN.ENH-0001421-0001481 P01 + # After this fix + # $ head -n 2 data/eval_beamformit_ref_nosplit_fix/utt2spk + # P01_S01_U02_KITCHEN.ENH-0000192-0001278 P01_U02 + # P01_S01_U02_KITCHEN.ENH-0001421-0001481 P01_U02 + echo "$0: fix data..." + for dset in ${test_sets}; do + utils/copy_data_dir.sh data/${dset} data/${dset}_nosplit + mkdir -p data/${dset}_nosplit_fix + for f in segments text wav.scp; do + if [ -f data/${dset}_nosplit/$f ]; then + cp data/${dset}_nosplit/$f data/${dset}_nosplit_fix + fi + done + awk -F "_" '{print $0 "_" $3}' data/${dset}_nosplit/utt2spk > data/${dset}_nosplit_fix/utt2spk + utils/utt2spk_to_spk2utt.pl data/${dset}_nosplit_fix/utt2spk > data/${dset}_nosplit_fix/spk2utt + done + + # Split speakers up into 3-minute chunks. This doesn't hurt adaptation, and + # lets us use more jobs for decoding etc. + for dset in ${test_sets}; do + utils/data/modify_speaker_info.sh --seconds-per-spk-max 180 data/${dset}_nosplit_fix data/${dset} + done +fi + +########################################################################## +# DECODING: we perform 2 stage decoding. +########################################################################## + +nnet3_affix=_${train_set}_cleaned_rvb +lm_suffix= + +if [ $stage -le 3 ]; then + # First the options that are passed through to run_ivector_common.sh + # (some of which are also used in this script directly). + + # The rest are configs specific to this script. Most of the parameters + # are just hardcoded at this level, in the commands below. + echo "$0: decode data..." + affix=1b # affix for the TDNN directory name + tree_affix= + tree_dir=exp/chain${nnet3_affix}/tree_sp${tree_affix:+_$tree_affix} + dir=exp/chain${nnet3_affix}/tdnn${affix}_sp + + # training options + # training chunk-options + chunk_width=140,100,160 + # we don't need extra left/right context for TDNN systems. 
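  # (Illustration only, not part of the patch) the frames_per_chunk value computed
  # below is simply the first entry of chunk_width:
  #   echo "140,100,160" | cut -d, -f1   # -> 140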
+ chunk_left_context=0 + chunk_right_context=0 + + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang${lm_suffix}/ \ + $tree_dir $tree_dir/graph${lm_suffix} || exit 1; + + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + local/nnet3/decode.sh --affix 2stage --pass2-decode-opts "--min-active 1000" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk 150 --nj $decode_nj \ + --ivector-dir exp/nnet3${nnet3_affix} \ + data/${data} data/lang${lm_suffix} \ + $tree_dir/graph${lm_suffix} \ + exp/chain${nnet3_affix}/tdnn${affix}_sp + ) || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 +fi + +########################################################################## +# Scoring: here we obtain wer per session per location and overall WER +########################################################################## + +if [ $stage -le 4 ]; then + # final scoring to get the official challenge result + # please specify both dev and eval set directories so that the search parameters + # (insertion penalty and language model weight) will be tuned using the dev set + local/score_for_submit.sh --enhancement $enhancement --json $json_dir \ + --dev exp/chain${nnet3_affix}/tdnn1b_sp/decode${lm_suffix}_dev_${enhancement}_2stage \ + --eval exp/chain${nnet3_affix}/tdnn1b_sp/decode${lm_suffix}_eval_${enhancement}_2stage +fi diff --git a/egs/chime6/s5_track1/local/distant_audio_list b/egs/chime6/s5_track1/local/distant_audio_list new file mode 100644 index 00000000000..710945b014b --- /dev/null +++ b/egs/chime6/s5_track1/local/distant_audio_list @@ -0,0 +1,372 @@ +S03_U01.CH1 +S03_U01.CH2 +S03_U01.CH3 +S03_U01.CH4 +S03_U02.CH1 +S03_U02.CH2 +S03_U02.CH3 +S03_U02.CH4 +S03_U03.CH1 +S03_U03.CH2 +S03_U03.CH3 +S03_U03.CH4 +S03_U04.CH1 +S03_U04.CH2 +S03_U04.CH3 +S03_U04.CH4 +S03_U05.CH1 +S03_U05.CH2 +S03_U05.CH3 +S03_U05.CH4 +S03_U06.CH1 +S03_U06.CH2 +S03_U06.CH3 +S03_U06.CH4 +S04_U01.CH1 +S04_U01.CH2 +S04_U01.CH3 +S04_U01.CH4 +S04_U02.CH1 +S04_U02.CH2 +S04_U02.CH3 +S04_U02.CH4 +S04_U03.CH1 +S04_U03.CH2 +S04_U03.CH3 +S04_U03.CH4 +S04_U04.CH1 +S04_U04.CH2 +S04_U04.CH3 +S04_U04.CH4 +S04_U05.CH1 +S04_U05.CH2 +S04_U05.CH3 +S04_U05.CH4 +S04_U06.CH1 +S04_U06.CH2 +S04_U06.CH3 +S04_U06.CH4 +S05_U01.CH1 +S05_U01.CH2 +S05_U01.CH3 +S05_U01.CH4 +S05_U02.CH1 +S05_U02.CH2 +S05_U02.CH3 +S05_U02.CH4 +S05_U05.CH1 +S05_U05.CH2 +S05_U05.CH3 +S05_U05.CH4 +S05_U06.CH1 +S05_U06.CH2 +S05_U06.CH3 +S05_U06.CH4 +S06_U01.CH1 +S06_U01.CH2 +S06_U01.CH3 +S06_U01.CH4 +S06_U02.CH1 +S06_U02.CH2 +S06_U02.CH3 +S06_U02.CH4 +S06_U03.CH1 +S06_U03.CH2 +S06_U03.CH3 +S06_U03.CH4 +S06_U04.CH1 +S06_U04.CH2 +S06_U04.CH3 +S06_U04.CH4 +S06_U05.CH1 +S06_U05.CH2 +S06_U05.CH3 +S06_U05.CH4 +S06_U06.CH1 +S06_U06.CH2 +S06_U06.CH3 +S06_U06.CH4 +S07_U01.CH1 +S07_U01.CH2 +S07_U01.CH3 +S07_U01.CH4 +S07_U02.CH1 +S07_U02.CH2 +S07_U02.CH3 +S07_U02.CH4 +S07_U03.CH1 +S07_U03.CH2 +S07_U03.CH3 +S07_U03.CH4 +S07_U04.CH1 +S07_U04.CH2 +S07_U04.CH3 +S07_U04.CH4 +S07_U05.CH1 +S07_U05.CH2 +S07_U05.CH3 +S07_U05.CH4 +S07_U06.CH1 +S07_U06.CH2 +S07_U06.CH3 +S07_U06.CH4 +S08_U01.CH1 +S08_U01.CH2 +S08_U01.CH3 +S08_U01.CH4 +S08_U02.CH1 +S08_U02.CH2 +S08_U02.CH3 +S08_U02.CH4 +S08_U03.CH1 +S08_U03.CH2 +S08_U03.CH3 +S08_U03.CH4 +S08_U04.CH1 +S08_U04.CH2 +S08_U04.CH3 +S08_U04.CH4 +S08_U05.CH1 +S08_U05.CH2 +S08_U05.CH3 +S08_U05.CH4 +S08_U06.CH1 +S08_U06.CH2 +S08_U06.CH3 +S08_U06.CH4 +S12_U01.CH1 +S12_U01.CH2 +S12_U01.CH3 +S12_U01.CH4 +S12_U02.CH1 
+S12_U02.CH2 +S12_U02.CH3 +S12_U02.CH4 +S12_U03.CH1 +S12_U03.CH2 +S12_U03.CH3 +S12_U03.CH4 +S12_U04.CH1 +S12_U04.CH2 +S12_U04.CH3 +S12_U04.CH4 +S12_U05.CH1 +S12_U05.CH2 +S12_U05.CH3 +S12_U05.CH4 +S12_U06.CH1 +S12_U06.CH2 +S12_U06.CH3 +S12_U06.CH4 +S13_U01.CH1 +S13_U01.CH2 +S13_U01.CH3 +S13_U01.CH4 +S13_U02.CH1 +S13_U02.CH2 +S13_U02.CH3 +S13_U02.CH4 +S13_U03.CH1 +S13_U03.CH2 +S13_U03.CH3 +S13_U03.CH4 +S13_U04.CH1 +S13_U04.CH2 +S13_U04.CH3 +S13_U04.CH4 +S13_U05.CH1 +S13_U05.CH2 +S13_U05.CH3 +S13_U05.CH4 +S13_U06.CH1 +S13_U06.CH2 +S13_U06.CH3 +S13_U06.CH4 +S16_U01.CH1 +S16_U01.CH2 +S16_U01.CH3 +S16_U01.CH4 +S16_U02.CH1 +S16_U02.CH2 +S16_U02.CH3 +S16_U02.CH4 +S16_U03.CH1 +S16_U03.CH2 +S16_U03.CH3 +S16_U03.CH4 +S16_U04.CH1 +S16_U04.CH2 +S16_U04.CH3 +S16_U04.CH4 +S16_U05.CH1 +S16_U05.CH2 +S16_U05.CH3 +S16_U05.CH4 +S16_U06.CH1 +S16_U06.CH2 +S16_U06.CH3 +S16_U06.CH4 +S17_U01.CH1 +S17_U01.CH2 +S17_U01.CH3 +S17_U01.CH4 +S17_U02.CH1 +S17_U02.CH2 +S17_U02.CH3 +S17_U02.CH4 +S17_U03.CH1 +S17_U03.CH2 +S17_U03.CH3 +S17_U03.CH4 +S17_U04.CH1 +S17_U04.CH2 +S17_U04.CH3 +S17_U04.CH4 +S17_U05.CH1 +S17_U05.CH2 +S17_U05.CH3 +S17_U05.CH4 +S17_U06.CH1 +S17_U06.CH2 +S17_U06.CH3 +S17_U06.CH4 +S18_U01.CH1 +S18_U01.CH2 +S18_U01.CH3 +S18_U01.CH4 +S18_U02.CH1 +S18_U02.CH2 +S18_U02.CH3 +S18_U02.CH4 +S18_U03.CH1 +S18_U03.CH2 +S18_U03.CH3 +S18_U03.CH4 +S18_U04.CH1 +S18_U04.CH2 +S18_U04.CH3 +S18_U04.CH4 +S18_U05.CH1 +S18_U05.CH2 +S18_U05.CH3 +S18_U05.CH4 +S18_U06.CH1 +S18_U06.CH2 +S18_U06.CH3 +S18_U06.CH4 +S19_U01.CH1 +S19_U01.CH2 +S19_U01.CH3 +S19_U01.CH4 +S19_U02.CH1 +S19_U02.CH2 +S19_U02.CH3 +S19_U02.CH4 +S19_U03.CH1 +S19_U03.CH2 +S19_U03.CH3 +S19_U03.CH4 +S19_U04.CH1 +S19_U04.CH2 +S19_U04.CH3 +S19_U04.CH4 +S19_U05.CH1 +S19_U05.CH2 +S19_U05.CH3 +S19_U05.CH4 +S19_U06.CH1 +S19_U06.CH2 +S19_U06.CH3 +S19_U06.CH4 +S20_U01.CH1 +S20_U01.CH2 +S20_U01.CH3 +S20_U01.CH4 +S20_U02.CH1 +S20_U02.CH2 +S20_U02.CH3 +S20_U02.CH4 +S20_U03.CH1 +S20_U03.CH2 +S20_U03.CH3 +S20_U03.CH4 +S20_U04.CH1 +S20_U04.CH2 +S20_U04.CH3 +S20_U04.CH4 +S20_U05.CH1 +S20_U05.CH2 +S20_U05.CH3 +S20_U05.CH4 +S20_U06.CH1 +S20_U06.CH2 +S20_U06.CH3 +S20_U06.CH4 +S22_U01.CH1 +S22_U01.CH2 +S22_U01.CH3 +S22_U01.CH4 +S22_U02.CH1 +S22_U02.CH2 +S22_U02.CH3 +S22_U02.CH4 +S22_U04.CH1 +S22_U04.CH2 +S22_U04.CH3 +S22_U04.CH4 +S22_U05.CH1 +S22_U05.CH2 +S22_U05.CH3 +S22_U05.CH4 +S22_U06.CH1 +S22_U06.CH2 +S22_U06.CH3 +S22_U06.CH4 +S23_U01.CH1 +S23_U01.CH2 +S23_U01.CH3 +S23_U01.CH4 +S23_U02.CH1 +S23_U02.CH2 +S23_U02.CH3 +S23_U02.CH4 +S23_U03.CH1 +S23_U03.CH2 +S23_U03.CH3 +S23_U03.CH4 +S23_U04.CH1 +S23_U04.CH2 +S23_U04.CH3 +S23_U04.CH4 +S23_U05.CH1 +S23_U05.CH2 +S23_U05.CH3 +S23_U05.CH4 +S23_U06.CH1 +S23_U06.CH2 +S23_U06.CH3 +S23_U06.CH4 +S24_U01.CH1 +S24_U01.CH2 +S24_U01.CH3 +S24_U01.CH4 +S24_U02.CH1 +S24_U02.CH2 +S24_U02.CH3 +S24_U02.CH4 +S24_U03.CH1 +S24_U03.CH2 +S24_U03.CH3 +S24_U03.CH4 +S24_U04.CH1 +S24_U04.CH2 +S24_U04.CH3 +S24_U04.CH4 +S24_U05.CH1 +S24_U05.CH2 +S24_U05.CH3 +S24_U05.CH4 +S24_U06.CH1 +S24_U06.CH2 +S24_U06.CH3 +S24_U06.CH4 diff --git a/egs/chime6/s5_track1/local/extract_noises.py b/egs/chime6/s5_track1/local/extract_noises.py new file mode 100755 index 00000000000..8f617752f2d --- /dev/null +++ b/egs/chime6/s5_track1/local/extract_noises.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python3 + +import argparse +import json +import logging +import os +import sys +import scipy.io.wavfile as siw +import math +import numpy as np + + +def get_args(): + parser = argparse.ArgumentParser( + """Extract noises from the corpus based on the non-speech regions. + e.g. 
{} /export/corpora4/CHiME5/audio/train/ \\ + /export/corpora4/CHiME5/transcriptions/train/ \\ + /export/b05/zhiqiw/noise/""".format(sys.argv[0])) + + parser.add_argument("--segment-length", default=20) + parser.add_argument("audio_dir", help="""Location of the CHiME5 Audio files. e.g. /export/corpora4/CHiME5/audio/train/""") + parser.add_argument("trans_dir", help="""Location of the CHiME5 Transcriptions. e.g. /export/corpora4/CHiME5/transcriptions/train/""") + parser.add_argument("audio_list", help="""List of ids of the CHiME5 recordings from which noise is extracted. e.g. local/distant_audio_list""") + parser.add_argument("out_dir", help="Output directory to write noise files. e.g. /export/b05/zhiqiw/noise/") + + args = parser.parse_args() + return args + + +def Trans_time(time, fs): + units = time.split(':') + time_second = float(units[0]) * 3600 + float(units[1]) * 60 + float(units[2]) + return int(time_second*fs) + + +# remove mic dependency for CHiME-6 +def Get_time(conf, tag, fs): + for i in conf: + st = Trans_time(i['start_time'], fs) + ed = Trans_time(i['end_time'], fs) + tag[st:ed] = 0 + return tag + + +def write_noise(out_dir, seg, audio, sig, tag, fs, cnt): + sig_noise = sig[np.nonzero(tag)] + for i in range(math.floor(len(sig_noise)/(seg*fs))): + siw.write(out_dir +'/noise'+str(cnt)+'.wav', fs, sig_noise[i*seg*fs:(i+1)*seg*fs]) + cnt += 1 + return cnt + + +def main(): + args = get_args() + + if not os.path.exists(args.out_dir): + os.makedirs(args.out_dir) + + wav_list = open(args.audio_list).readlines() + + cnt = 1 + for i, audio in enumerate(wav_list): + parts = audio.strip().split('.') + if len(parts) == 2: + # Assuming distant mic with name like S03_U01.CH1 + session, mic = parts[0].split('_') + channel = parts[1] + base_name = session + "_" + mic + "." + channel + else: + # Assuming close talk mic with name like S03_P09 + session, mic = audio.strip().split('_') + base_name = session + "_" + mic + fs, sig = siw.read(args.audio_dir + "/" + base_name + '.wav') + tag = np.ones(len(sig)) + if i == 0 or session != session_p: + with open(args.trans_dir + "/" + session + '.json') as f: + conf = json.load(f) + tag = Get_time(conf, tag, fs) + cnt = write_noise(args.out_dir, args.segment_length, audio, sig, tag, fs, cnt) + session_p = session + + +if __name__ == '__main__': + main() diff --git a/egs/chime6/s5_track1/local/extract_vad_weights.sh b/egs/chime6/s5_track1/local/extract_vad_weights.sh new file mode 100755 index 00000000000..250b021bd8f --- /dev/null +++ b/egs/chime6/s5_track1/local/extract_vad_weights.sh @@ -0,0 +1,86 @@ +#!/bin/bash + +# Copyright 2016 Johns Hopkins University (Author: Daniel Povey, Vijayaditya Peddinti) +# 2019 Vimal Manohar +# Apache 2.0. + +# This script converts lattices available from a first pass decode into a per-frame weights file +# The ctms generated from the lattices are filtered. Silence frames are assigned a low weight (e.g.0.00001) +# and voiced frames have a weight of 1. + +set -e + +stage=1 +cmd=run.pl +silence_weight=0.00001 +#end configuration section. + +. ./cmd.sh + +[ -f ./path.sh ] && . ./path.sh +. utils/parse_options.sh || exit 1; +if [ $# -ne 4 ]; then + echo "Usage: $0 [--cmd (run.pl|queue.pl...)] " + echo " Options:" + echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." + exit 1; +fi + +data_dir=$1 +lang=$2 # Note: may be graph directory not lang directory, but has the necessary stuff copied. 
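+# The remaining arguments are the first-pass decode directory and the gzipped
+# output archive.  The output is a Kaldi text-format vector archive with one
+# weight per frame, e.g. an illustrative entry (utterance id made up):
+#   P05_S02.ENH-0000192-0001278 [ 0.00001 0.00001 1 1 1 0.00001 ]
+# where voiced frames get weight 1 and everything else gets $silence_weight.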
+decode_dir=$3 +output_wts_file_gz=$4 + +if [ $stage -le 1 ]; then + echo "$0: generating CTM from input lattices" + steps/get_ctm_conf.sh --cmd "$cmd" \ + --use-segments false \ + $data_dir \ + $lang \ + $decode_dir +fi + +if [ $stage -le 2 ]; then + name=`basename $data_dir` + # we just take the ctm from LMWT 10, it doesn't seem to affect the results a lot + ctm=$decode_dir/score_10/$name.ctm + echo "$0: generating weights file from ctm $ctm" + + pad_frames=0 # this did not seem to be helpful but leaving it as an option. + feat-to-len scp:$data_dir/feats.scp ark,t:- >$decode_dir/utt.lengths + if [ ! -f $ctm ]; then echo "$0: expected ctm to exist: $ctm"; exit 1; fi + + cat $ctm | awk '$6 == 1.0 && $4 < 1.0' | \ + grep -v -w mm | grep -v -w mhm | grep -v -F '[noise]' | \ + grep -v -F '[laughter]' | grep -v -F '' | \ + perl -e ' $lengths=shift @ARGV; $pad_frames=shift @ARGV; $silence_weight=shift @ARGV; + $pad_frames >= 0 || die "bad pad-frames value $pad_frames"; + open(L, "<$lengths") || die "opening lengths file"; + @all_utts = (); + $utt2ref = { }; + while () { + ($utt, $len) = split(" ", $_); + push @all_utts, $utt; + $array_ref = [ ]; + for ($n = 0; $n < $len; $n++) { ${$array_ref}[$n] = $silence_weight; } + $utt2ref{$utt} = $array_ref; + } + while () { + @A = split(" ", $_); + @A == 6 || die "bad ctm line $_"; + $utt = $A[0]; $beg = $A[2]; $len = $A[3]; + $beg_int = int($beg * 100) - $pad_frames; + $len_int = int($len * 100) + 2*$pad_frames; + $array_ref = $utt2ref{$utt}; + !defined $array_ref && die "No length info for utterance $utt"; + for ($t = $beg_int; $t < $beg_int + $len_int; $t++) { + if ($t >= 0 && $t < @$array_ref) { + ${$array_ref}[$t] = 1; + } + } + } + foreach $utt (@all_utts) { $array_ref = $utt2ref{$utt}; + print $utt, " [ ", join(" ", @$array_ref), " ]\n"; + } ' $decode_dir/utt.lengths $pad_frames $silence_weight | \ + gzip -c > $output_wts_file_gz +fi diff --git a/egs/chime6/s5_track1/local/generate_chime6_data.sh b/egs/chime6/s5_track1/local/generate_chime6_data.sh new file mode 100755 index 00000000000..93106cf605a --- /dev/null +++ b/egs/chime6/s5_track1/local/generate_chime6_data.sh @@ -0,0 +1,121 @@ +#!/bin/bash + +# Copyright 2019, Johns Hopkins University (Author: Shinji Watanabe) +# Apache 2.0 +# +# This script generates synchronized audio data across arrays by considering +# the frame dropping, clock drift etc. done by Prof. Jon Barker at University of +# Sheffield. This script first downloads the synchronization tool and generate +# the synchronized audios and corresponding JSON transcription files +# Note that +# 1) the JSON format is slightly changed from the original CHiME-5 one (simplified +# thanks to the synchronization) +# 2) it requires sox v.14.4.2 and Python 3.6.7 +# Unfortunately, the generated files would be different depending on the sox +# and Python versions and to generate the exactly same audio files, this script uses +# the fixed versions of sox and Python installed in the miniconda instead of system ones + +. ./cmd.sh +. ./path.sh + +# Config: +cmd=run.pl + +. utils/parse_options.sh || exit 1; + +if [ $# != 2 ]; then + echo "Wrong #arguments ($#, expected 2)" + echo "Usage: local/generate_chime6_data.sh [options] " + echo "main options (for others, see top of script file)" + echo " --cmd # Command to run in parallel with" + exit 1; +fi + +sdir=$1 +odir=$2 +expdir=${PWD}/exp/chime6_data + +# Set bash to 'debug' mode, it will exit on : +# -e 'error', -u 'undefined variable', -o ... 
'error in pipeline', -x 'print commands', +set -e +set -u +set -o pipefail + +# get chime6-synchronisation tools +SYNC_PATH=${PWD}/chime6-synchronisation +if [ ! -d ${SYNC_PATH} ]; then + git clone https://github.com/chimechallenge/chime6-synchronisation.git +fi + +mkdir -p ${odir} +mkdir -p ${expdir}/log + +# split the session to avoid too much disk access +sessions1="S01 S02 S03 S04 S05 S06 S07" +sessions2="S08 S09 S12 S13 S16 S17 S18" +sessions3="S19 S20 S21 S22 S23 S24" + +CONDA_PATH=${HOME}/miniconda3/bin +IN_PATH=${sdir}/audio +OUT_PATH=${odir}/audio +TMP_PATH=${odir}/audio_tmp + +if [ ! -d "${IN_PATH}" ]; then + echo "please specify the CHiME-5 data path correctly" + exit 1 +fi +mkdir -p $OUT_PATH/train $OUT_PATH/eval $OUT_PATH/dev +mkdir -p $TMP_PATH/train $TMP_PATH/eval $TMP_PATH/dev + +if [ -f ${odir}/audio/dev/S02_P05.wav ]; then + echo "CHiME-6 date already exists" + exit 0 +fi + +pushd ${SYNC_PATH} +echo "Correct for frame dropping" +for session in ${sessions1}; do + $cmd ${expdir}/correct_signals_for_frame_drops.${session}.log \ + ${CONDA_PATH}/python correct_signals_for_frame_drops.py --session=${session} chime6_audio_edits.json $IN_PATH $TMP_PATH & +done +wait +for session in ${sessions2}; do + $cmd ${expdir}/correct_signals_for_frame_drops.${session}.log \ + ${CONDA_PATH}/python correct_signals_for_frame_drops.py --session=${session} chime6_audio_edits.json $IN_PATH $TMP_PATH & +done +wait +for session in ${sessions3}; do + $cmd ${expdir}/correct_signals_for_frame_drops.${session}.log \ + ${CONDA_PATH}/python correct_signals_for_frame_drops.py --session=${session} chime6_audio_edits.json $IN_PATH $TMP_PATH & +done +wait + +echo "Sox processing for correcting clock drift" +for session in ${sessions1}; do + $cmd ${expdir}/correct_signals_for_clock_drift.${session}.log \ + ${CONDA_PATH}/python correct_signals_for_clock_drift.py --session=${session} --sox_path $CONDA_PATH chime6_audio_edits.json $TMP_PATH $OUT_PATH & +done +wait +for session in ${sessions2}; do + $cmd ${expdir}/correct_signals_for_clock_drift.${session}.log \ + ${CONDA_PATH}/python correct_signals_for_clock_drift.py --session=${session} --sox_path $CONDA_PATH chime6_audio_edits.json $TMP_PATH $OUT_PATH & +done +wait +for session in ${sessions3}; do + $cmd ${expdir}/correct_signals_for_clock_drift.${session}.log \ + ${CONDA_PATH}/python correct_signals_for_clock_drift.py --session=${session} --sox_path $CONDA_PATH chime6_audio_edits.json $TMP_PATH $OUT_PATH & +done +wait + +echo "adjust the JSON files" +mkdir -p ${odir}/transcriptions/eval ${odir}/transcriptions/dev ${odir}/transcriptions/train +${CONDA_PATH}/python correct_transcript_for_clock_drift.py --clock_drift_data chime6_audio_edits.json ${sdir}/transcriptions ${odir}/transcriptions +popd + +# finally check md5sum +pushd ${odir} +echo "check MD5 hash value for generated audios" +md5sum -c ${SYNC_PATH}/audio_md5sums.txt || echo "check https://github.com/chimechallenge/chime6-synchronisation" +popd + +echo "`basename $0` Done." diff --git a/egs/chime6/s5_track1/local/get_location.py b/egs/chime6/s5_track1/local/get_location.py new file mode 100755 index 00000000000..92351e72e65 --- /dev/null +++ b/egs/chime6/s5_track1/local/get_location.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python3 +# Copyright Ashish Arora +# Apache 2.0 +# This script create a utterance and location mapping file +# It is used in score_for_submit script to get locationwise WER. 
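+# The mapping has one "<uttid> <LOCATION>" line per utterance, for example
+# (utterance id illustrative):
+#   P05_S02-0004060-0004382 KITCHEN
+# The utterance ids follow the key format used by the decode directories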
+# for GSS enhancement + +import json +from datetime import timedelta +from glob import glob +import sys, io +from decimal import Decimal + +SAMPLE_RATE = 16000 + +def to_samples(time: str): + "mapping time in string to int, as mapped in pb_chime5" + "see https://github.com/fgnt/pb_chime5/blob/master/pb_chime5/database/chime5/get_speaker_activity.py" + hours, minutes, seconds = [t for t in time.split(':')] + hours = int(hours) + minutes = int(minutes) + seconds = Decimal(seconds) + + seconds_samples = seconds * SAMPLE_RATE + samples = ( + hours * 3600 * SAMPLE_RATE + + minutes * 60 * SAMPLE_RATE + + seconds_samples + ) + return int(samples) + + +def main(): + output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') + json_file_location= sys.argv[1] + '/*.json' + json_files = glob(json_file_location) + + json_file_location= sys.argv[1] + '/*.json' + json_files = glob(json_file_location) + location_dict = {} + json_file_location= sys.argv[1] + '/*.json' + json_files = glob(json_file_location) + location_dict = {} + for file in json_files: + with open(file, 'r') as f: + session_dict = json.load(f) + + for uttid in session_dict: + try: + ref=uttid['ref'] + speaker_id = uttid['speaker'] + location = uttid['location'] + location=location.upper() + session_id=uttid['session_id'] + words = uttid['words'] + end_sample=to_samples(str(uttid['end_time'])) + start_sample=to_samples(str(uttid['start_time'])) + start_sample_str = str(int(start_sample * 100 / SAMPLE_RATE)).zfill(7) + end_sample_str = str(int(end_sample * 100 / SAMPLE_RATE)).zfill(7) + utt = "{0}_{1}-{2}-{3}".format(speaker_id, session_id, start_sample_str, end_sample_str) + location_dict[utt]=(location) + except: + continue + + for key in sorted(location_dict.keys()): + utt= "{0} {1}".format(key, location_dict[key]) + output.write(utt+ '\n') + +if __name__ == '__main__': + main() diff --git a/egs/chime6/s5_track1/local/install_pb_chime5.sh b/egs/chime6/s5_track1/local/install_pb_chime5.sh new file mode 100755 index 00000000000..a151dc60f12 --- /dev/null +++ b/egs/chime6/s5_track1/local/install_pb_chime5.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +# Installs pb_chime5 +# miniconda should be installed in $HOME/miniconda3/ + +miniconda_dir=$HOME/miniconda3/ + +if [ ! -d $miniconda_dir ]; then + echo "$miniconda_dir does not exist. Please run 'tools/extras/install_miniconda.sh" && exit 1; +fi + +git clone https://github.com/fgnt/pb_chime5.git +cd pb_chime5 +# Download submodule dependencies # https://stackoverflow.com/a/3796947/5766934 +git submodule init +git submodule update + +$miniconda_dir/bin/python -m pip install cython +$miniconda_dir/bin/python -m pip install pymongo +$miniconda_dir/bin/python -m pip install fire +$miniconda_dir/bin/python -m pip install -e pb_bss/ +$miniconda_dir/bin/python -m pip install -e . 
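+
+# A minimal sanity check (a sketch, not required by the recipe): the install can
+# be verified by importing the package with the miniconda interpreter, e.g.
+#   $miniconda_dir/bin/python -c 'import pb_chime5'
+# local/run_gss.sh later expects this checkout to live in ./pb_chime5/, so run
+# this script from the s5_track1 directory.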
diff --git a/egs/chime6/s5_track1/local/json2text.py b/egs/chime6/s5_track1/local/json2text.py new file mode 100755 index 00000000000..34cf52f086b --- /dev/null +++ b/egs/chime6/s5_track1/local/json2text.py @@ -0,0 +1,92 @@ +#!/usr/bin/env python3 + +# Copyright 2017 Johns Hopkins University (Shinji Watanabe) +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +import json +import argparse +import logging +import sys + + +def hms_to_seconds(hms): + hour = hms.split(':')[0] + minute = hms.split(':')[1] + second = hms.split(':')[2].split('.')[0] + + # .xx (10 ms order) + ms10 = hms.split(':')[2].split('.')[1] + + # total seconds + seconds = int(hour) * 3600 + int(minute) * 60 + int(second) + + return '{:07d}'.format(int(str(seconds) + ms10)) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('json', type=str, help='JSON transcription file') + parser.add_argument('--mictype', type=str, + choices=['ref', 'worn', 'gss', 'u01', 'u02', 'u03', 'u04', 'u05', 'u06'], + help='Type of microphones') + args = parser.parse_args() + + # logging info + log_format = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s:%(message)s" + logging.basicConfig(level=logging.INFO, format=log_format) + + logging.debug("reading %s", args.json) + with open(args.json, 'rt', encoding="utf-8") as f: + j = json.load(f) + + for x in j: + if '[redacted]' not in x['words']: + session_id = x['session_id'] + speaker_id = x['speaker'] + if args.mictype == 'ref': + mictype = x['ref'] + elif args.mictype == 'worn' or args.mictype == 'gss': + mictype = 'original' + else: + mictype = args.mictype.upper() # convert from u01 to U01 + + # add location tag for scoring (only for dev and eval sets) + if 'location' in x.keys(): + location = x['location'].upper() + else: + location = 'NOLOCATION' + + # remove mic dependency for CHiME-6 + start_time = x['start_time'] + end_time = x['end_time'] + + # remove meta chars and convert to lower + words = x['words'].replace('"', '')\ + .replace('.', '')\ + .replace('?', '')\ + .replace(',', '')\ + .replace(':', '')\ + .replace(';', '')\ + .replace('!', '').lower() + + # remove multiple spaces + words = " ".join(words.split()) + + # convert to seconds, e.g., 1:10:05.55 -> 3600 + 600 + 5.55 = 4205.55 + start_time = hms_to_seconds(start_time) + end_time = hms_to_seconds(end_time) + + uttid = speaker_id + '_' + session_id + if not args.mictype in ['worn', 'gss']: + uttid += '_' + mictype + + if args.mictype == 'gss': + uttid += '-' + start_time + '-' + end_time + else: + uttid += '_' + location + '-' + start_time + '-' + end_time + + # In several utterances, there are inconsistency in the time stamp + # (the end time is earlier than the start time) + # We just ignored such utterances. 
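+            # For reference, the resulting keys look like the following
+            # (ids illustrative; times are in 10 ms units, zero-padded to 7
+            # digits, so the string comparison below behaves numerically):
+            #   gss       : P05_S02-0004060-0004382
+            #   worn      : P05_S02_DINING-0004060-0004382
+            #   array u01 : P05_S02_U01_DINING-0004060-0004382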
+ if end_time > start_time: + sys.stdout.buffer.write((uttid + ' ' + words + '\n').encode("utf-8")) diff --git a/egs/chime6/s5_track1/local/make_noise_list.py b/egs/chime6/s5_track1/local/make_noise_list.py new file mode 100755 index 00000000000..5aaf7fa4062 --- /dev/null +++ b/egs/chime6/s5_track1/local/make_noise_list.py @@ -0,0 +1,17 @@ +#!/usr/bin/env python3 + +import glob +import os +import sys + + +if len(sys.argv) != 2: + print ("Usage: {} ".format(sys.argv[0])) + raise SystemExit(1) + + +for line in glob.glob("{}/*.wav".format(sys.argv[1])): + fname = os.path.basename(line.strip()) + + print ("--noise-id {} --noise-type point-source " + "--bg-fg-type foreground {}".format(fname, line.strip())) diff --git a/egs/chime6/s5_track1/local/nnet3/compare_wer.sh b/egs/chime6/s5_track1/local/nnet3/compare_wer.sh new file mode 100755 index 00000000000..095e85cc338 --- /dev/null +++ b/egs/chime6/s5_track1/local/nnet3/compare_wer.sh @@ -0,0 +1,132 @@ +#!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. local/chain/compare_wer.sh exp/chain/tdnn_{c,d}_sp +# For use with discriminatively trained systems you specify the epochs after a colon: +# for instance, +# local/chain/compare_wer.sh exp/chain/tdnn_c_sp exp/chain/tdnn_c_sp_smbr:{1,2,3} + + +if [ $# == 0 ]; then + echo "Usage: $0: [--looped] [--online] [ ... ]" + echo "e.g.: $0 exp/chain/tdnn_{b,c}_sp" + echo "or (with epoch numbers for discriminative training):" + echo "$0 exp/chain/tdnn_b_sp_disc:{1,2,3}" + exit 1 +fi + +echo "# $0 $*" + +include_looped=false +if [ "$1" == "--looped" ]; then + include_looped=true + shift +fi +include_online=false +if [ "$1" == "--online" ]; then + include_online=true + shift +fi + + +used_epochs=false + +# this function set_names is used to separate the epoch-related parts of the name +# [for discriminative training] and the regular parts of the name. 
+# If called with a colon-free directory name, like: +# set_names exp/chain/tdnn_lstm1e_sp_bi_smbr +# it will set dir=exp/chain/tdnn_lstm1e_sp_bi_smbr and epoch_infix="" +# If called with something like: +# set_names exp/chain/tdnn_d_sp_smbr:3 +# it will set dir=exp/chain/tdnn_d_sp_smbr and epoch_infix="_epoch3" + + +set_names() { + if [ $# != 1 ]; then + echo "compare_wer_general.sh: internal error" + exit 1 # exit the program + fi + dirname=$(echo $1 | cut -d: -f1) + epoch=$(echo $1 | cut -s -d: -f2) + if [ -z $epoch ]; then + epoch_infix="" + else + used_epochs=true + epoch_infix=_epoch${epoch} + fi +} + + + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +strings=( + "#WER dev_clean_2 (tgsmall) " + "#WER dev_clean_2 (tglarge) ") + +for n in 0 1; do + echo -n "${strings[$n]}" + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + decode_names=(tgsmall_dev_clean_2 tglarge_dev_clean_2) + + wer=$(cat $dirname/decode_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + if $include_looped; then + echo -n "# [looped:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + wer=$(cat $dirname/decode_looped_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi + if $include_online; then + echo -n "# [online:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + wer=$(cat ${dirname}_online/decode_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi +done + + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. +fi + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.{final,combined}.log 2>/dev/null | grep log-like | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.{final,combined}.log 2>/dev/null | grep log-like | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train acc " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.{final,combined}.log 2>/dev/null | grep accuracy | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid acc " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.{final,combined}.log 2>/dev/null | grep accuracy | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo diff --git a/egs/chime6/s5_track1/local/nnet3/decode.sh b/egs/chime6/s5_track1/local/nnet3/decode.sh new file mode 100755 index 00000000000..8fa54e0d4a6 --- /dev/null +++ b/egs/chime6/s5_track1/local/nnet3/decode.sh @@ -0,0 +1,164 @@ +#!/bin/bash + +# Copyright 2016 Johns Hopkins University (Author: Daniel Povey, Vijayaditya Peddinti) +# 2019 Vimal Manohar +# Apache 2.0. + +# This script does 2-stage decoding where the first stage is used to get +# reliable frames for i-vector extraction. 
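+#
+# Roughly: stage 1 decodes with online i-vectors, a frame-level weights file is
+# derived from the first-pass CTM (silence down-weighted), per-speaker i-vectors
+# are re-estimated with those weights, and stage 2 re-decodes with them.
+# Illustrative invocation (the real arguments are passed by run.sh):
+#   local/nnet3/decode.sh --affix 2stage --acwt 1.0 --post-decode-acwt 10.0 \
+#     data/dev_gss data/lang exp/chain_xxx/tree_sp/graph exp/chain_xxx/tdnn1b_sp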
+ +set -e + +# general opts +iter= +stage=0 +nj=30 +affix= # affix for decode directory + +# ivector opts +max_count=75 # parameter for extract_ivectors.sh +sub_speaker_frames=6000 +ivector_scale=0.75 +get_weights_from_ctm=true +weights_file= # use weights from this archive (must be compressed using gunzip) +silence_weight=0.00001 # apply this weight to silence frames during i-vector extraction +ivector_dir=exp/nnet3 + +# decode opts +pass2_decode_opts="--min-active 1000" +lattice_beam=8 +extra_left_context=0 # change for (B)LSTM +extra_right_context=0 # change for BLSTM +frames_per_chunk=50 # change for (B)LSTM +acwt=0.1 # important to change this when using chain models +post_decode_acwt=1.0 # important to change this when using chain models +extra_left_context_initial=0 +extra_right_context_final=0 + +graph_affix= + +score_opts="--min-lmwt 6 --max-lmwt 13" + +. ./cmd.sh +[ -f ./path.sh ] && . ./path.sh +. utils/parse_options.sh || exit 1; + +if [ $# -ne 4 ]; then + echo "Usage: $0 [options] " + echo " Options:" + echo " --stage (0|1|2) # start scoring script from part-way through." + echo "e.g.:" + echo "$0 data/dev data/lang exp/tri5a/graph_pp exp/nnet3/tdnn" + exit 1; +fi + +data=$1 # data directory +lang=$2 # data/lang +graph=$3 #exp/tri5a/graph_pp +dir=$4 # exp/nnet3/tdnn + +model_affix=`basename $dir` +ivector_affix=${affix:+_$affix}_chain_${model_affix}${iter:+_iter$iter} +affix=${affix:+_${affix}}${iter:+_iter${iter}} + +if [ $stage -le 1 ]; then + if [ ! -s ${data}_hires/feats.scp ]; then + utils/copy_data_dir.sh $data ${data}_hires + steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj $nj --cmd "$train_cmd" ${data}_hires + steps/compute_cmvn_stats.sh ${data}_hires + utils/fix_data_dir.sh ${data}_hires + fi +fi + +data_set=$(basename $data) +if [ $stage -le 2 ]; then + echo "Extracting i-vectors, stage 1" + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj $nj \ + --max-count $max_count \ + ${data}_hires $ivector_dir/extractor \ + $ivector_dir/ivectors_${data_set}${ivector_affix}_stage1; + # float comparisons are hard in bash + if [ `bc <<< "$ivector_scale != 1"` -eq 1 ]; then + ivector_scale_affix=_scale$ivector_scale + else + ivector_scale_affix= + fi + + if [ ! -z "$ivector_scale_affix" ]; then + echo "$0: Scaling iVectors, stage 1" + srcdir=$ivector_dir/ivectors_${data_set}${ivector_affix}_stage1 + outdir=$ivector_dir/ivectors_${data_set}${ivector_affix}${ivector_scale_affix}_stage1 + mkdir -p $outdir + $train_cmd $outdir/log/scale_ivectors.log \ + copy-matrix --scale=$ivector_scale scp:$srcdir/ivector_online.scp ark:- \| \ + copy-feats --compress=true ark:- ark,scp:$outdir/ivector_online.ark,$outdir/ivector_online.scp; + cp $srcdir/ivector_period $outdir/ivector_period + fi +fi + +decode_dir=$dir/decode${graph_affix}_${data_set}${affix} +# generate the lattices +if [ $stage -le 3 ]; then + echo "Generating lattices, stage 1" + steps/nnet3/decode.sh --nj $nj --cmd "$decode_cmd" \ + --acwt $acwt --post-decode-acwt $post_decode_acwt \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir $ivector_dir/ivectors_${data_set}${ivector_affix}${ivector_scale_affix}_stage1 \ + --skip-scoring true ${iter:+--iter $iter} \ + $graph ${data}_hires ${decode_dir}_stage1; +fi + +if [ $stage -le 4 ]; then + if $get_weights_from_ctm; then + if [ ! 
-z $weights_file ]; then + echo "$0: Using provided vad weights file $weights_file" + ivector_extractor_weights=$weights_file + else + echo "$0 : Generating vad weights file" + ivector_extractor_weights=${decode_dir}_stage1/weights${affix}.gz + local/extract_vad_weights.sh --silence-weight $silence_weight \ + --cmd "$decode_cmd" ${iter:+--iter $iter} \ + ${data}_hires $lang \ + ${decode_dir}_stage1 $ivector_extractor_weights + fi + else + # get weights from best path decoding + ivector_extractor_weights=${decode_dir}_stage1 + fi +fi + +if [ $stage -le 5 ]; then + echo "Extracting i-vectors, stage 2 with weights from $ivector_extractor_weights" + # this does offline decoding, except we estimate the iVectors per + # speaker, excluding silence (based on alignments from a DNN decoding), with a + # different script. This is just to demonstrate that script. + # the --sub-speaker-frames is optional; if provided, it will divide each speaker + # up into "sub-speakers" of at least that many frames... can be useful if + # acoustic conditions drift over time within the speaker's data. + steps/online/nnet2/extract_ivectors.sh --cmd "$train_cmd" --nj $nj \ + --silence-weight $silence_weight \ + --sub-speaker-frames $sub_speaker_frames --max-count $max_count \ + ${data}_hires $lang $ivector_dir/extractor \ + $ivector_extractor_weights $ivector_dir/ivectors_${data_set}${ivector_affix}; +fi + +if [ $stage -le 6 ]; then + echo "Generating lattices, stage 2 with --acwt $acwt" + rm -f ${decode_dir}/.error + steps/nnet3/decode.sh --nj $nj --cmd "$decode_cmd" $pass2_decode_opts \ + --acwt $acwt --post-decode-acwt $post_decode_acwt \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk" \ + --skip-scoring false ${iter:+--iter $iter} --lattice-beam $lattice_beam \ + --online-ivector-dir $ivector_dir/ivectors_${data_set}${ivector_affix} \ + $graph ${data}_hires ${decode_dir} || touch ${decode_dir}/.error + [ -f ${decode_dir}/.error ] && echo "$0: Error decoding" && exit 1; +fi +exit 0 diff --git a/egs/chime6/s5_track1/local/nnet3/run_ivector_common.sh b/egs/chime6/s5_track1/local/nnet3/run_ivector_common.sh new file mode 100755 index 00000000000..cfa18cb7617 --- /dev/null +++ b/egs/chime6/s5_track1/local/nnet3/run_ivector_common.sh @@ -0,0 +1,141 @@ +#!/bin/bash + +set -euo pipefail + +# This script is called from local/nnet3/run_tdnn.sh and +# local/chain/run_tdnn.sh (and may eventually be called by more +# scripts). It contains the common feature preparation and +# iVector-related parts of the script. See those scripts for examples +# of usage. + +stage=0 +train_set=train_worn_u100k +test_sets="dev_worn dev_beamformit_ref" +gmm=tri3 +nj=96 + +nnet3_affix=_train_worn_u100k + +. ./cmd.sh +. ./path.sh +. utils/parse_options.sh + +gmm_dir=exp/${gmm} +ali_dir=exp/${gmm}_ali_${train_set}_sp + +for f in ${gmm_dir}/final.mdl; do + if [ ! 
-f $f ]; then + echo "$0: expected file $f to exist" + exit 1 + fi +done + +if [ $stage -le 1 ]; then + # Although the nnet will be trained by high resolution data, we still have to + # perturb the normal data to get the alignment _sp stands for speed-perturbed + echo "$0: preparing directory for low-resolution speed-perturbed data (for alignment)" + utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp + echo "$0: making MFCC features for low-resolution speed-perturbed data" + steps/make_mfcc.sh --cmd "$train_cmd" --nj 20 data/${train_set}_sp || exit 1; + steps/compute_cmvn_stats.sh data/${train_set}_sp || exit 1; + utils/fix_data_dir.sh data/${train_set}_sp +fi + +if [ $stage -le 2 ]; then + echo "$0: aligning with the perturbed low-resolution data" + steps/align_fmllr.sh --nj ${nj} --cmd "$train_cmd" \ + data/${train_set}_sp data/lang $gmm_dir $ali_dir || exit 1 +fi + +if [ $stage -le 3 ]; then + # Create high-resolution MFCC features (with 40 cepstra instead of 13). + # this shows how you can split across multiple file-systems. + echo "$0: creating high-resolution MFCC features" + mfccdir=data/${train_set}_sp_hires/data + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + utils/create_split_dir.pl /export/b1{5,6,8,9}/$USER/kaldi-data/mfcc/chime5-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage + fi + + for datadir in ${train_set}_sp; do + utils/copy_data_dir.sh data/$datadir data/${datadir}_hires + done + + # do volume-perturbation on the training data prior to extracting hires + # features; this helps make trained nnets more invariant to test data volume. + utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires || exit 1; + + for datadir in ${train_set}_sp; do + steps/make_mfcc.sh --nj 20 --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${datadir}_hires || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_hires || exit 1; + utils/fix_data_dir.sh data/${datadir}_hires || exit 1; + done +fi + +if [ $stage -le 4 ]; then + echo "$0: computing a subset of data to train the diagonal UBM." + # We'll use about a quarter of the data. + mkdir -p exp/nnet3${nnet3_affix}/diag_ubm + temp_data_root=exp/nnet3${nnet3_affix}/diag_ubm + + num_utts_total=$(wc -l &2 "$0" "$@" +if [ $# -ne 3 ] ; then + echo >&2 "$0" "$@" + echo >&2 "$0: Error: wrong number of arguments" + echo -e >&2 "Usage:\n $0 [opts] " + echo -e >&2 "eg:\n $0 /corpora/chime5/audio/train /corpora/chime5/transcriptions/train data/train" + exit 1 +fi + +set -e -o pipefail + +adir=$(utils/make_absolute.sh $1) +jdir=$2 +dir=$3 + +json_count=$(find -L $jdir -name "*.json" | wc -l) +wav_count=$(find -L $adir -name "*.wav" | wc -l) + +if [ "$json_count" -eq 0 ]; then + echo >&2 "We expect that the directory $jdir will contain json files." + echo >&2 "That implies you have supplied a wrong path to the data." + exit 1 +fi +if [ "$wav_count" -eq 0 ]; then + echo >&2 "We expect that the directory $adir will contain wav files." + echo >&2 "That implies you have supplied a wrong path to the data." 
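+  # (For reference, a correctly prepared split directory contains wavs named
+  #  like $adir/S03_U01.CH1.wav for array mics or $adir/S03_P09.wav for worn
+  #  mics, with a matching $jdir/S03.json transcription; these names are only
+  #  examples, see local/generate_chime6_data.sh.)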
+ exit 1 +fi + +echo "$0: Converting transcription to text" + +mkdir -p $dir + +for file in $jdir/*json; do + ./local/json2text.py --mictype $mictype $file +done | \ + sed -e "s/\[inaudible[- 0-9]*\]/[inaudible]/g" |\ + sed -e 's/ - / /g' |\ + sed -e 's/mm-/mm/g' > $dir/text.orig + +echo "$0: Creating datadir $dir for type=\"$mictype\"" + +if [ $mictype == "worn" ]; then + # convert the filenames to wav.scp format, use the basename of the file + # as a the wav.scp key, add .L and .R for left and right channel + # i.e. each file will have two entries (left and right channel) + find -L $adir -name "S[0-9]*_P[0-9]*.wav" | \ + perl -ne '{ + chomp; + $path = $_; + next unless $path; + @F = split "/", $path; + ($f = $F[@F-1]) =~ s/.wav//; + @F = split "_", $f; + print "${F[1]}_${F[0]}.L sox $path -t wav - remix 1 |\n"; + print "${F[1]}_${F[0]}.R sox $path -t wav - remix 2 |\n"; + }' | sort > $dir/wav.scp + + # generate the transcripts for both left and right channel + # from the original transcript in the form + # P09_S03-0006072-0006147 gimme the baker + # create left and right channel transcript + # P09_S03.L-0006072-0006147 gimme the baker + # P09_S03.R-0006072-0006147 gimme the baker + sed -n 's/ *$//; h; s/-/\.L-/p; g; s/-/\.R-/p' $dir/text.orig | sort > $dir/text +elif [ $mictype == "ref" ]; then + # fixed reference array + + # first get a text, which will be used to extract reference arrays + perl -ne 's/-/.ENH-/;print;' $dir/text.orig | sort > $dir/text + + find -L $adir | grep "\.wav" | sort > $dir/wav.flist + # following command provide the argument for grep to extract only reference arrays + grep `cut -f 1 -d"-" $dir/text | awk -F"_" '{print $2 "_" $3}' | sed -e "s/\.ENH//" | sort | uniq | sed -e "s/^/ -e /" | tr "\n" " "` $dir/wav.flist > $dir/wav.flist2 + paste -d" " \ + <(awk -F "/" '{print $NF}' $dir/wav.flist2 | sed -e "s/\.wav/.ENH/") \ + $dir/wav.flist2 | sort > $dir/wav.scp +elif [ $mictype == "gss" ]; then + find -L $adir -name "P[0-9]*_S[0-9]*.wav" | \ + perl -ne '{ + chomp; + $path = $_; + next unless $path; + @F = split "/", $path; + ($f = $F[@F-1]) =~ s/.wav//; + print "$f $path\n"; + }' | sort > $dir/wav.scp + + cat $dir/text.orig | sort > $dir/text +else + # array mic case + # convert the filenames to wav.scp format, use the basename of the file + # as a the wav.scp key + find -L $adir -name "*.wav" -ipath "*${mictype}*" |\ + perl -ne '$p=$_;chomp $_;@F=split "/";$F[$#F]=~s/\.wav//;print "$F[$#F] $p";' |\ + sort -u > $dir/wav.scp + + # convert the transcripts from + # P09_S03-0006072-0006147 gimme the baker + # to the per-channel transcripts + # P09_S03_U01_NOLOCATION.CH1-0006072-0006147 gimme the baker + # P09_S03_U01_NOLOCATION.CH2-0006072-0006147 gimme the baker + # P09_S03_U01_NOLOCATION.CH3-0006072-0006147 gimme the baker + # P09_S03_U01_NOLOCATION.CH4-0006072-0006147 gimme the baker + perl -ne '$l=$_; + for($i=1; $i<=4; $i++) { + ($x=$l)=~ s/-/.CH\Q$i\E-/; + print $x;}' $dir/text.orig | sort > $dir/text + +fi +$cleanup && rm -f $dir/text.* $dir/wav.scp.* $dir/wav.flist + +# Prepare 'segments', 'utt2spk', 'spk2utt' +if [ $mictype == "worn" ]; then + cut -d" " -f 1 $dir/text | \ + awk -F"-" '{printf("%s %s %08.2f %08.2f\n", $0, $1, $2/100.0, $3/100.0)}' |\ + sed -e "s/_[A-Z]*\././2" \ + > $dir/segments +elif [ $mictype == "ref" ]; then + cut -d" " -f 1 $dir/text | \ + awk -F"-" '{printf("%s %s %08.2f %08.2f\n", $0, $1, $2/100.0, $3/100.0)}' |\ + sed -e "s/_[A-Z]*\././2" |\ + sed -e "s/ P.._/ /" > $dir/segments +elif [ $mictype != "gss" ]; then + cut -d" " -f 1 
$dir/text | \ + awk -F"-" '{printf("%s %s %08.2f %08.2f\n", $0, $1, $2/100.0, $3/100.0)}' |\ + sed -e "s/_[A-Z]*\././2" |\ + sed -e 's/ P.._/ /' > $dir/segments +fi + +cut -f 1 -d ' ' $dir/text | \ + perl -ne 'chomp;$utt=$_;s/_.*//;print "$utt $_\n";' > $dir/utt2spk + +utils/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt + +# Check that data dirs are okay! +utils/validate_data_dir.sh --no-feats $dir || exit 1 diff --git a/egs/chime6/s5_track1/local/prepare_dict.sh b/egs/chime6/s5_track1/local/prepare_dict.sh new file mode 100755 index 00000000000..09083d0e795 --- /dev/null +++ b/egs/chime6/s5_track1/local/prepare_dict.sh @@ -0,0 +1,124 @@ +#!/bin/bash +# Copyright (c) 2018, Johns Hopkins University (Jan "Yenda" Trmal) +# License: Apache 2.0 + +# Begin configuration section. +# End configuration section +. ./utils/parse_options.sh + +. ./path.sh + +set -e -o pipefail +set -o nounset # Treat unset variables as an error + + +# The parts of the output of this that will be needed are +# [in data/local/dict/ ] +# lexicon.txt +# extra_questions.txt +# nonsilence_phones.txt +# optional_silence.txt +# silence_phones.txt + + +# check existing directories +[ $# != 0 ] && echo "Usage: $0" && exit 1; + +dir=data/local/dict + +mkdir -p $dir +echo "$0: Getting CMU dictionary" +if [ ! -f $dir/cmudict.done ]; then + [ -d $dir/cmudict ] && rm -rf $dir/cmudict + svn co https://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict $dir/cmudict + touch $dir/cmudict.done +fi + +# silence phones, one per line. +for w in sil spn inaudible laughs noise; do + echo $w; +done > $dir/silence_phones.txt +echo sil > $dir/optional_silence.txt + +# For this setup we're discarding stress. +cat $dir/cmudict/cmudict-0.7b.symbols | \ + perl -ne 's:[0-9]::g; s:\r::; print lc($_)' | \ + sort -u > $dir/nonsilence_phones.txt + +# An extra question will be added by including the silence phones in one class. 
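+# (With the phones defined above, extra_questions.txt ends up as the single line
+#    sil spn inaudible laughs noise
+#  which lets the tree building treat the silence phones as one group.)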
+paste -d ' ' -s $dir/silence_phones.txt > $dir/extra_questions.txt + +grep -v ';;;' $dir/cmudict/cmudict-0.7b |\ + uconv -f latin1 -t utf-8 -x Any-Lower |\ + perl -ne 's:(\S+)\(\d+\) :$1 :; s: : :; print;' |\ + perl -ne '@F = split " ",$_,2; $F[1] =~ s/[0-9]//g; print "$F[0] $F[1]";' \ + > $dir/lexicon1_raw_nosil.txt || exit 1; + +# Add prons for laughter, noise, oov +for w in `grep -v sil $dir/silence_phones.txt`; do + echo "[$w] $w" +done | cat - $dir/lexicon1_raw_nosil.txt > $dir/lexicon2_raw.txt || exit 1; + +# we keep all words from the cmudict in the lexicon +# might reduce OOV rate on dev and eval +cat $dir/lexicon2_raw.txt \ + <( echo "mm m" + echo " spn" + echo "cuz k aa z" + echo "cuz k ah z" + echo "cuz k ao z" + echo "mmm m"; \ + echo "hmm hh m"; \ + ) | sort -u | sed 's/[\t ]/\t/' > $dir/iv_lexicon.txt + + +cat data/train*/text | \ + awk '{for (n=2;n<=NF;n++){ count[$n]++; } } END { for(n in count) { print count[n], n; }}' | \ + sort -nr > $dir/word_counts + +cat $dir/word_counts | awk '{print $2}' > $dir/word_list + +awk '{print $1}' $dir/iv_lexicon.txt | \ + perl -e '($word_counts)=@ARGV; + open(W, "<$word_counts")||die "opening word-counts $word_counts"; + while() { chop; $seen{$_}=1; } + while() { + ($c,$w) = split; + if (!defined $seen{$w}) { print; } + } ' $dir/word_counts > $dir/oov_counts.txt + +echo "*Highest-count OOVs (including fragments) are:" +head -n 10 $dir/oov_counts.txt +echo "*Highest-count OOVs (excluding fragments) are:" +grep -v -E '^-|-$' $dir/oov_counts.txt | head -n 10 || true + +echo "*Training a G2P and generating missing pronunciations" +mkdir -p $dir/g2p/ +phonetisaurus-align --input=$dir/iv_lexicon.txt --ofile=$dir/g2p/aligned_lexicon.corpus +ngram-count -order 4 -kn-modify-counts-at-end -ukndiscount\ + -gt1min 0 -gt2min 0 -gt3min 0 -gt4min 0 \ + -text $dir/g2p/aligned_lexicon.corpus -lm $dir/g2p/aligned_lexicon.arpa +phonetisaurus-arpa2wfst --lm=$dir/g2p/aligned_lexicon.arpa --ofile=$dir/g2p/g2p.fst +awk '{print $2}' $dir/oov_counts.txt > $dir/oov_words.txt +phonetisaurus-apply --nbest 2 --model $dir/g2p/g2p.fst --thresh 5 --accumulate \ + --word_list $dir/oov_words.txt > $dir/oov_lexicon.txt + +## The next section is again just for debug purposes +## to show words for which the G2P failed +cat $dir/oov_lexicon.txt $dir/iv_lexicon.txt | sort -u > $dir/lexicon.txt +rm -f $dir/lexiconp.txt 2>/dev/null; # can confuse later script if this exists. +awk '{print $1}' $dir/lexicon.txt | \ + perl -e '($word_counts)=@ARGV; + open(W, "<$word_counts")||die "opening word-counts $word_counts"; + while() { chop; $seen{$_}=1; } + while() { + ($c,$w) = split; + if (!defined $seen{$w}) { print; } + } ' $dir/word_counts > $dir/oov_counts.g2p.txt + +echo "*Highest-count OOVs (including fragments) after G2P are:" +head -n 10 $dir/oov_counts.g2p.txt + +utils/validate_dict_dir.pl $dir +exit 0; + diff --git a/egs/chime6/s5_track1/local/replace_uttid.py b/egs/chime6/s5_track1/local/replace_uttid.py new file mode 100755 index 00000000000..96c45b58783 --- /dev/null +++ b/egs/chime6/s5_track1/local/replace_uttid.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python3 +# Copyright Ashish Arora +# Apache 2.0 +# This script is used in score_for_submit. It adds locationid to the utteranceid, +# using uttid_location file, for locationwise scoring. 
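+# Illustrative example (the utterance id and text are made up):
+#   uttid_location line:  P05_S02-0004060-0004382 KITCHEN
+#   per_utt line       :  P05_S02-0004060-0004382 ref  gimme the baker
+#   rewritten output   :  KITCHEN_P05_S02-0004060-0004382 ref  gimme the baker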
+ +import sys, io +output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') + +def load_uttid_location(f): + locations = {} + for line in f: + parts=line.strip().split(' ') + uttid, loc = parts[0], parts[1] + locations[uttid] = loc + return locations + +locations = load_uttid_location(open(sys.argv[1],'r', encoding='utf8')) + +for line in open(sys.argv[2],'r', encoding='utf8'): + uttid, res = line.split(None, 1) + try: + location = locations[uttid] + location_uttid = location +'_'+ str(uttid) + output.write(location_uttid + ' ' + res) + except KeyError as e: + raise Exception("Could not find utteranceid in " + "uttid_location file" + "({0})\n".format(str(e))) diff --git a/egs/chime6/s5_track1/local/reverberate_lat_dir.sh b/egs/chime6/s5_track1/local/reverberate_lat_dir.sh new file mode 100755 index 00000000000..f601a37c0e1 --- /dev/null +++ b/egs/chime6/s5_track1/local/reverberate_lat_dir.sh @@ -0,0 +1,93 @@ +#!/bin/bash + +# Copyright 2018 Vimal Manohar +# Apache 2.0 + +num_data_reps=1 +cmd=run.pl +nj=20 +include_clean=false + +. utils/parse_options.sh +. ./path.sh + +if [ $# -ne 4 ]; then + echo "Usage: $0 " + exit 1 +fi + +train_data_dir=$1 +noisy_latdir=$2 +clean_latdir=$3 +dir=$4 + +clean_nj=$(cat $clean_latdir/num_jobs) + +$cmd JOB=1:$clean_nj $dir/copy_clean_lattices.JOB.log \ + lattice-copy "ark:gunzip -c $clean_latdir/lat.JOB.gz |" \ + ark,scp:$dir/lats_clean.JOB.ark,$dir/lats_clean.JOB.scp || exit 1 + +for n in $(seq $clean_nj); do + cat $dir/lats_clean.$n.scp +done > $dir/lats_clean.scp + +for i in $(seq $num_data_reps); do + cat $dir/lats_clean.scp | awk -vi=$i '{print "rev"i"_"$0}' +done > $dir/lats_rvb.scp + +noisy_nj=$(cat $noisy_latdir/num_jobs) +$cmd JOB=1:$noisy_nj $dir/copy_noisy_lattices.JOB>log \ + lattice-copy "ark:gunzip -c $noisy_latdir/lat.JOB.gz |" \ + ark,scp:$dir/lats_noisy.JOB.ark,$dir/lats_noisy.JOB.scp || exit 1 + +optional_clean= +if $include_clean; then + optional_clean=$dir/lats_clean.scp +fi + +for n in $(seq $noisy_nj); do + cat $dir/lats_noisy.$n.scp +done | cat - $dir/lats_rvb.scp ${optional_clean} | sort -k1,1 > $dir/lats.scp + +utils/split_data.sh $train_data_dir $nj +$cmd JOB=1:$nj $dir/copy_lattices.JOB.log \ + lattice-copy "scp:utils/filter_scp.pl $train_data_dir/split$nj/JOB/utt2spk $dir/lats.scp |" \ + "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1 + +echo $nj > $dir/num_jobs + +if [ -f $clean_latdir/ali.1.gz ]; then + $cmd JOB=1:$clean_nj $dir/copy_clean_alignments.JOB.log \ + copy-int-vector "ark:gunzip -c $clean_latdir/ali.JOB.gz |" \ + ark,scp:$dir/ali_clean.JOB.ark,$dir/ali_clean.JOB.scp + + for n in $(seq $clean_nj); do + cat $dir/ali_clean.$n.scp + done > $dir/ali_clean.scp + + for i in $(seq $num_data_reps); do + cat $dir/ali_clean.scp | awk -vi=$i '{print "rev"i"_"$0}' + done > $dir/ali_rvb.scp + + optional_clean= + if $include_clean; then + optional_clean=$dir/ali_clean.scp + fi + + $cmd JOB=1:$noisy_nj $dir/copy_noisy_alignments.JOB.log \ + copy-int-vector "ark:gunzip -c $noisy_latdir/ali.JOB.gz |" \ + ark,scp:$dir/ali_noisy.JOB.ark,$dir/ali_noisy.JOB.scp + + for n in $(seq $noisy_nj); do + cat $dir/ali_noisy.$n.scp + done | cat - $dir/ali_rvb.scp $optional_clean | sort -k1,1 > $dir/ali.scp + + utils/split_data.sh $train_data_dir $nj || exit 1 + $cmd JOB=1:$nj $dir/copy_rvb_alignments.JOB.log \ + copy-int-vector "scp:utils/filter_scp.pl $train_data_dir/split$nj/JOB/utt2spk $dir/ali.scp |" \ + "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1 +fi + +cp $clean_latdir/{final.*,tree,*.mat,*opts,*.txt} $dir || true + +rm 
$dir/lats_{clean,noisy}.*.{ark,scp} $dir/ali_{clean,noisy}.*.{ark,scp} || true # save space diff --git a/egs/chime6/s5_track1/local/run_beamformit.sh b/egs/chime6/s5_track1/local/run_beamformit.sh new file mode 100755 index 00000000000..aa3badd90d8 --- /dev/null +++ b/egs/chime6/s5_track1/local/run_beamformit.sh @@ -0,0 +1,87 @@ +#!/bin/bash + +# Copyright 2015, Mitsubishi Electric Research Laboratories, MERL (Author: Shinji Watanabe) + +. ./cmd.sh +. ./path.sh + +# Config: +cmd=run.pl +bmf="1 2 3 4" + +. utils/parse_options.sh || exit 1; + +if [ $# != 3 ]; then + echo "Wrong #arguments ($#, expected 3)" + echo "Usage: local/run_beamformit.sh [options] " + echo "main options (for others, see top of script file)" + echo " --cmd # Command to run in parallel with" + echo " --bmf \"1 2 3 4\" # microphones used for beamforming" + exit 1; +fi + +sdir=$1 +odir=$2 +array=$3 +expdir=exp/enhan/`echo $odir | awk -F '/' '{print $NF}'`_`echo $bmf | tr ' ' '_'` + +if ! command -v BeamformIt &>/dev/null ; then + echo "Missing BeamformIt, run 'cd $KALDI_ROOT/tools/; ./extras/install_beamformit.sh; cd -;'" && exit 1 +fi + +# Set bash to 'debug' mode, it will exit on : +# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', +set -e +set -u +set -o pipefail + +mkdir -p $odir +mkdir -p $expdir/log + +echo "Will use the following channels: $bmf" +# number of channels +numch=`echo $bmf | tr ' ' '\n' | wc -l` +echo "the number of channels: $numch" + +# wavfiles.list can be used as the name of the output files +output_wavfiles=$expdir/wavfiles.list +find -L ${sdir} | grep -i ${array} | awk -F "/" '{print $NF}' | sed -e "s/\.CH.\.wav//" | sort | uniq > $expdir/wavfiles.list + +# this is an input file list of the microphones +# format: 1st_wav 2nd_wav ... nth_wav +input_arrays=$expdir/channels_$numch +for x in `cat $output_wavfiles`; do + echo -n "$x" + for ch in $bmf; do + echo -n " $x.CH$ch.wav" + done + echo "" +done > $input_arrays + +# split the list for parallel processing +# number of jobs are set by the number of WAV files +nj=`wc -l $expdir/wavfiles.list | awk '{print $1}'` +split_wavfiles="" +for n in `seq $nj`; do + split_wavfiles="$split_wavfiles $output_wavfiles.$n" +done +utils/split_scp.pl $output_wavfiles $split_wavfiles || exit 1; + +echo -e "Beamforming\n" +# making a shell script for each job +for n in `seq $nj`; do +cat << EOF > $expdir/log/beamform.$n.sh +while read line; do + $BEAMFORMIT/BeamformIt -s \$line -c $input_arrays \ + --config_file `pwd`/conf/beamformit.cfg \ + --source_dir $sdir \ + --result_dir $odir +done < $output_wavfiles.$n +EOF +done + +chmod a+x $expdir/log/beamform.*.sh +$cmd JOB=1:$nj $expdir/log/beamform.JOB.log \ + $expdir/log/beamform.JOB.sh + +echo "`basename $0` Done." diff --git a/egs/chime6/s5_track1/local/run_gss.sh b/egs/chime6/s5_track1/local/run_gss.sh new file mode 100755 index 00000000000..fbdc4af25d1 --- /dev/null +++ b/egs/chime6/s5_track1/local/run_gss.sh @@ -0,0 +1,65 @@ +#!/bin/bash + +# Copyright 2015, Mitsubishi Electric Research Laboratories, MERL (Author: Shinji Watanabe) + +. ./cmd.sh +if [ -f ./path.sh ]; then . ./path.sh; fi + +# Config: +cmd=run.pl +nj=4 +multiarray=outer_array_mics +bss_iterations=5 +context_samples=160000 +. 
utils/parse_options.sh || exit 1; + +if [ $# != 3 ]; then + echo "Wrong #arguments ($#, expected 3)" + echo "Usage: local/run_gss.sh [options] " + echo "main options (for others, see top of script file)" + echo " --cmd # Command to run in parallel with" + echo " --bss_iterations 5 # Number of EM iterations" + echo " --context_samples 160000 # Left-right context in number of samples" + echo " --multiarray # Multiarray configuration" + exit 1; +fi + +# setting multiarray as "true" uses all mics, we didn't see any performance +# gain from this we have chosen settings that makes the enhacement finish +# in around 1/3 of a day without significant change in performance. +# our result during the experiments are as follows: + +#MAF: multi array = False +#MAT: multi array = True +#Enhancement Iterations Num Microphones Context Computational time for GSS #cpus dev WER eval WER +#GSS(MAF) 10 24 17 hrs 30 62.3 57.98 +#GSS(MAT) 5 24 10s 26 hrs 50 53.15 53.77 +#GSS(MAT) 5 12 10s 9.5 hrs 50 53.09 53.75 + +session_id=$1 +log_dir=$2 +enhanced_dir=$3 +if [ ! -d pb_chime5/ ]; then + echo "Missing pb_chime5, run 'local/install_pb_chime5'" + exit 1 +fi + +miniconda_dir=$HOME/miniconda3/ +if [ ! -d $miniconda_dir/ ]; then + echo "$miniconda_dir/ does not exist. Please run '../../../tools/extras/install_miniconda.sh'" + exit 1 +fi + +enhanced_dir=$(utils/make_absolute.sh $enhanced_dir) || \ + { echo "Could not make absolute '$enhanced_dir'" && exit 1; } + +$cmd JOB=1:$nj $log_dir/log/enhance_${session_id}.JOB.log \ + cd pb_chime5/ '&&' \ + $miniconda_dir/bin/python -m pb_chime5.scripts.kaldi_run with \ + chime6=True \ + storage_dir=$enhanced_dir \ + session_id=$session_id \ + job_id=JOB number_of_jobs=$nj \ + bss_iterations=$bss_iterations \ + context_samples=$context_samples \ + multiarray=$multiarray || exit 1 diff --git a/egs/chime6/s5_track1/local/run_wpe.py b/egs/chime6/s5_track1/local/run_wpe.py new file mode 100755 index 00000000000..fbb264f2fd2 --- /dev/null +++ b/egs/chime6/s5_track1/local/run_wpe.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python +# Copyright 2018 Johns Hopkins University (Author: Aswin Shanmugam Subramanian) +# Apache 2.0 +# Works with both python2 and python3 +# This script assumes that WPE (nara_wpe) is installed locally using miniconda. +# ../../../tools/extras/install_miniconda.sh and ../../../tools/extras/install_wpe.sh +# needs to be run and this script needs to be launched run with that version of +# python. +# See local/run_wpe.sh for example. 
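+#
+# The --files flag takes all input wavs followed by all output wavs (the first
+# half of the list is read, dereverberated, and written to the second half).
+# Illustrative single-channel call, matching how run_wpe.sh drives it:
+#   python local/run_wpe.py --files in/S02_U02.CH1.wav out/S02_U02.CH1.wav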
+ +import numpy as np +import soundfile as sf +import time +import os, errno +from tqdm import tqdm +import argparse + +# to avoid huge memory consumption we decided to use `wpe_v8` instead of the original wpe by +# following the advice from Christoph Boeddeker at Paderborn University +# https://github.com/chimechallenge/kaldi_chime6/commit/2ea6ac07ef66ad98602f073b24a233cb7f61605c#r36147334 +from nara_wpe.wpe import wpe_v8 as wpe +from nara_wpe.utils import stft, istft +from nara_wpe import project_root + +parser = argparse.ArgumentParser() +parser.add_argument('--files', '-f', nargs='+') +args = parser.parse_args() + +input_files = args.files[:len(args.files)//2] +output_files = args.files[len(args.files)//2:] +out_dir = os.path.dirname(output_files[0]) +try: + os.makedirs(out_dir) +except OSError as e: + if e.errno != errno.EEXIST: + raise + +stft_options = dict( + size=512, + shift=128, + window_length=None, + fading=True, + pad=True, + symmetric_window=False +) + +sampling_rate = 16000 +delay = 3 +iterations = 5 +taps = 10 + +signal_list = [ + sf.read(f)[0] + for f in input_files +] +y = np.stack(signal_list, axis=0) +Y = stft(y, **stft_options).transpose(2, 0, 1) +Z = wpe(Y, iterations=iterations, statistics_mode='full').transpose(1, 2, 0) +z = istft(Z, size=stft_options['size'], shift=stft_options['shift']) + +for d in range(len(signal_list)): + sf.write(output_files[d], z[d,:], sampling_rate) diff --git a/egs/chime6/s5_track1/local/run_wpe.sh b/egs/chime6/s5_track1/local/run_wpe.sh new file mode 100755 index 00000000000..ed512e69aae --- /dev/null +++ b/egs/chime6/s5_track1/local/run_wpe.sh @@ -0,0 +1,86 @@ +#!/bin/bash +# Copyright 2018 Johns Hopkins University (Author: Aswin Shanmugam Subramanian) +# Apache 2.0 + +. ./cmd.sh +. ./path.sh + +# Config: +nj=4 +cmd=run.pl + +. utils/parse_options.sh || exit 1; + +if [ $# != 3 ]; then + echo "Wrong #arguments ($#, expected 3)" + echo "Usage: local/run_wpe.sh [options] " + echo "main options (for others, see top of script file)" + echo " --cmd # Command to run in parallel with" + echo " --nj 50 # number of jobs for parallel processing" + exit 1; +fi + +sdir=$1 +odir=$2 +array=$3 +task=`basename $sdir` +expdir=exp/wpe/${task}_${array} +# Set bash to 'debug' mode, it will exit on : +# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', +set -e +set -u +set -o pipefail + +miniconda_dir=$HOME/miniconda3/ +if [ ! -d $miniconda_dir ]; then + echo "$miniconda_dir does not exist. Please run '$KALDI_ROOT/tools/extras/install_miniconda.sh'." + exit 1 +fi + +# check if WPE is installed +result=`$miniconda_dir/bin/python -c "\ +try: + import nara_wpe + print('1') +except ImportError: + print('0')"` + +if [ "$result" == "1" ]; then + echo "WPE is installed" +else + echo "WPE is not installed. 
Please run ../../../tools/extras/install_wpe.sh" + exit 1 +fi + +mkdir -p $odir +mkdir -p $expdir/log + +# wavfiles.list can be used as the name of the output files +output_wavfiles=$expdir/wavfiles.list +find -L ${sdir} | grep -i ${array} > $expdir/channels_input +cat $expdir/channels_input | awk -F '/' '{print $NF}' | sed "s@S@$odir\/S@g" > $expdir/channels_output +paste -d" " $expdir/channels_input $expdir/channels_output > $output_wavfiles + +# split the list for parallel processing +split_wavfiles="" +for n in `seq $nj`; do + split_wavfiles="$split_wavfiles $output_wavfiles.$n" +done +utils/split_scp.pl $output_wavfiles $split_wavfiles || exit 1; + +echo -e "Dereverberation - $task - $array\n" +# making a shell script for each job +for n in `seq $nj`; do +cat <<-EOF > $expdir/log/wpe.$n.sh +while read line; do + $miniconda_dir/bin/python local/run_wpe.py \ + --file \$line +done < $output_wavfiles.$n +EOF +done + +chmod a+x $expdir/log/wpe.*.sh +$cmd JOB=1:$nj $expdir/log/wpe.JOB.log \ + $expdir/log/wpe.JOB.sh + +echo "`basename $0` Done." diff --git a/egs/chime6/s5_track1/local/score.sh b/egs/chime6/s5_track1/local/score.sh new file mode 120000 index 00000000000..6a200b42ed3 --- /dev/null +++ b/egs/chime6/s5_track1/local/score.sh @@ -0,0 +1 @@ +../steps/scoring/score_kaldi_wer.sh \ No newline at end of file diff --git a/egs/chime6/s5_track1/local/score_for_submit.sh b/egs/chime6/s5_track1/local/score_for_submit.sh new file mode 100755 index 00000000000..ba7d6cde574 --- /dev/null +++ b/egs/chime6/s5_track1/local/score_for_submit.sh @@ -0,0 +1,132 @@ +#!/bin/bash +# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey, Yenda Trmal) +# Copyright 2019 Johns Hopkins University (Author: Shinji Watanabe) +# Apache 2.0 +# +# This script provides official CHiME-6 challenge track 1 submission scores per room and session. +# It first calculates the best search parameter configurations by using the dev set +# and also create the transcriptions for dev and eval sets to be submitted. +# The default setup does not calculate scores of the evaluation set since +# the evaluation transcription is not distributed (July 9 2018) + +cmd=run.pl +dev=exp/chain_train_worn_u100k_cleaned/tdnn1a_sp/decode_dev_beamformit_ref +eval=exp/chain_train_worn_u100k_cleaned/tdnn1a_sp/decode_eval_beamformit_ref +do_eval=true +enhancement=gss +json= + +echo "$0 $@" # Print the command line for logging +[ -f ./path.sh ] && . ./path.sh +. parse_options.sh || exit 1; + +if [ $# -ne 0 ]; then + echo "Usage: $0 [--cmd (run.pl|queue.pl...)]" + echo "This script provides official CHiME-6 challenge submission scores" + echo " Options:" + echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." 
+ echo " --dev # dev set decoding directory" + echo " --eval # eval set decoding directory" + echo " --enhancement # enhancement type (gss or beamformit)" + echo " --json # directory containing CHiME-6 json files" + exit 1; +fi + +# get language model weight and word insertion penalty from the dev set +best_lmwt=`cat $dev/scoring_kaldi/wer_details/lmwt` +best_wip=`cat $dev/scoring_kaldi/wer_details/wip` + +echo "best LM weight: $best_lmwt" +echo "insertion penalty weight: $best_wip" + +echo "==== development set ====" +# development set +# get uttid location mapping +local/add_location_to_uttid.sh --enhancement $enhancement $json/dev \ + $dev/scoring_kaldi/wer_details/ $dev/scoring_kaldi/wer_details/uttid_location +# get the scoring result per utterance +score_result=$dev/scoring_kaldi/wer_details/per_utt_loc + +for session in S02 S09; do + for room in DINING KITCHEN LIVING; do + # get nerror + nerr=`grep "\#csid" $score_result | grep $room | grep $session | awk '{sum+=$4+$5+$6} END {print sum}'` + # get nwords from references (NF-2 means to exclude utterance id and " ref ") + nwrd=`grep "\#csid" $score_result | grep $room | grep $session | awk '{sum+=$3+$4+$6} END {print sum}'` + # compute wer with scale=2 + wer=`echo "scale=2; 100 * $nerr / $nwrd" | bc` + + # report the results + echo -n "session $session " + echo -n "room $room: " + echo -n "#words $nwrd, " + echo -n "#errors $nerr, " + echo "wer $wer %" + done +done +echo -n "overall: " +# get nerror +nerr=`grep "\#csid" $score_result | awk '{sum+=$4+$5+$6} END {print sum}'` +# get nwords from references (NF-2 means to exclude utterance id and " ref ") +nwrd=`grep "\#csid" $score_result | awk '{sum+=$3+$4+$6} END {print sum}'` +# compute wer with scale=2 +wer=`echo "scale=2; 100 * $nerr / $nwrd" | bc` +echo -n "#words $nwrd, " +echo -n "#errors $nerr, " +echo "wer $wer %" + +echo "==== evaluation set ====" +# evaluation set +# get the scoring result per utterance. 
Copied from local/score.sh +mkdir -p $eval/scoring_kaldi/wer_details_devbest +$cmd $eval/scoring_kaldi/log/stats1.log \ + cat $eval/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \| \ + align-text --special-symbol="'***'" ark:$eval/scoring_kaldi/test_filt.txt ark:- ark,t:- \| \ + utils/scoring/wer_per_utt_details.pl --special-symbol "'***'" \> $eval/scoring_kaldi/wer_details_devbest/per_utt + +local/add_location_to_uttid.sh --enhancement $enhancement $json/eval \ + $eval/scoring_kaldi/wer_details_devbest/ $eval/scoring_kaldi/wer_details_devbest/uttid_location + +score_result=$eval/scoring_kaldi/wer_details_devbest/per_utt_loc +for session in S01 S21; do + for room in DINING KITCHEN LIVING; do + if $do_eval; then + # get nerror + nerr=`grep "\#csid" $score_result | grep $room | grep $session | awk '{sum+=$4+$5+$6} END {print sum}'` + # get nwords from references (NF-2 means to exclude utterance id and " ref ") + nwrd=`grep "\#csid" $score_result | grep $room | grep $session | awk '{sum+=$3+$4+$6} END {print sum}'` + # compute wer with scale=2 + wer=`echo "scale=2; 100 * $nerr / $nwrd" | bc` + + # report the results + echo -n "session $session " + echo -n "room $room: " + echo -n "#words $nwrd, " + echo -n "#errors $nerr, " + echo "wer $wer %" + fi + done +done +if $do_eval; then + # get nerror + nerr=`grep "\#csid" $score_result | awk '{sum+=$4+$5+$6} END {print sum}'` + # get nwords from references (NF-2 means to exclude utterance id and " ref ") + nwrd=`grep "\#csid" $score_result | awk '{sum+=$3+$4+$6} END {print sum}'` + # compute wer with scale=2 + wer=`echo "scale=2; 100 * $nerr / $nwrd" | bc` + echo -n "overall: " + echo -n "#words $nwrd, " + echo -n "#errors $nerr, " + echo "wer $wer %" +else + echo "skip evaluation scoring" + echo "" + echo "==== when you submit your result to the CHiME-6 challenge track 1 ====" + echo "Please rename your recognition results of " + echo "$dev/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt" + echo "$eval/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt" + echo "with {dev,eval}__.txt, e.g., dev_watanabe_jhu.txt and eval_watanabe_jhu.txt, " + echo "and submit both of them as your final challenge result" + echo "==================================================================" +fi + diff --git a/egs/chime6/s5_track1/local/train_lms_srilm.sh b/egs/chime6/s5_track1/local/train_lms_srilm.sh new file mode 100755 index 00000000000..5a1d56d24b3 --- /dev/null +++ b/egs/chime6/s5_track1/local/train_lms_srilm.sh @@ -0,0 +1,261 @@ +#!/bin/bash +# Copyright (c) 2017 Johns Hopkins University (Author: Yenda Trmal, Shinji Watanabe) +# Apache 2.0 + +export LC_ALL=C + +# Begin configuration section. +words_file= +train_text= +dev_text= +oov_symbol="" +# End configuration section + +echo "$0 $@" + +[ -f path.sh ] && . ./path.sh +. ./utils/parse_options.sh || exit 1 + +echo "-------------------------------------" +echo "Building an SRILM language model " +echo "-------------------------------------" + +if [ $# -ne 2 ] ; then + echo "Incorrect number of parameters. 
" + echo "Script has to be called like this:" + echo " $0 [switches] " + echo "For example: " + echo " $0 data data/srilm" + echo "The allowed switches are: " + echo " words_file= word list file -- data/lang/words.txt by default" + echo " train_text= data/train/text is used in case when not specified" + echo " dev_text= last 10 % of the train text is used by default" + echo " oov_symbol=> symbol to use for oov modeling -- by default" + exit 1 +fi + +datadir=$1 +tgtdir=$2 + +##End of configuration +loc=`which ngram-count`; +if [ -z $loc ]; then + echo >&2 "You appear to not have SRILM tools installed, either on your path," + echo >&2 "Use the script \$KALDI_ROOT/tools/install_srilm.sh to install it." + exit 1 +fi + +# Prepare the destination directory +mkdir -p $tgtdir + +for f in $words_file $train_text $dev_text; do + [ ! -s $f ] && echo "No such file $f" && exit 1; +done + +[ -z $words_file ] && words_file=$datadir/lang/words.txt +if [ ! -z "$train_text" ] && [ -z "$dev_text" ] ; then + nr=`cat $train_text | wc -l` + nr_dev=$(($nr / 10 )) + nr_train=$(( $nr - $nr_dev )) + orig_train_text=$train_text + head -n $nr_train $train_text > $tgtdir/train_text + tail -n $nr_dev $train_text > $tgtdir/dev_text + + train_text=$tgtdir/train_text + dev_text=$tgtdir/dev_text + echo "Using words file: $words_file" + echo "Using train text: 9/10 of $orig_train_text" + echo "Using dev text : 1/10 of $orig_train_text" +elif [ ! -z "$train_text" ] && [ ! -z "$dev_text" ] ; then + echo "Using words file: $words_file" + echo "Using train text: $train_text" + echo "Using dev text : $dev_text" + train_text=$train_text + dev_text=$dev_text +else + train_text=$datadir/train/text + dev_text=$datadir/dev2h/text + echo "Using words file: $words_file" + echo "Using train text: $train_text" + echo "Using dev text : $dev_text" + +fi + +[ ! -f $words_file ] && echo >&2 "File $words_file must exist!" && exit 1 +[ ! -f $train_text ] && echo >&2 "File $train_text must exist!" && exit 1 +[ ! -f $dev_text ] && echo >&2 "File $dev_text must exist!" && exit 1 + + +# Extract the word list from the training dictionary; exclude special symbols +sort $words_file | awk '{print $1}' | grep -v '\#0' | grep -v '' | grep -v -F "$oov_symbol" > $tgtdir/vocab +if (($?)); then + echo "Failed to create vocab from $words_file" + exit 1 +else + # wc vocab # doesn't work due to some encoding issues + echo vocab contains `cat $tgtdir/vocab | perl -ne 'BEGIN{$l=$w=0;}{split; $w+=$#_; $w++; $l++;}END{print "$l lines, $w words\n";}'` +fi + +# Kaldi transcript files contain Utterance_ID as the first word; remove it +# We also have to avoid skewing the LM by incorporating the same sentences +# from different channels +sed -e "s/\.CH.//" -e "s/_.\-./_/" -e "s/NOLOCATION\(\.[LR]\)*-//" -e "s/U[0-9][0-9]_//" $train_text | sort -u | \ + perl -ane 'print join(" ", @F[1..$#F]) . 
"\n" if @F > 1' > $tgtdir/train.txt +if (($?)); then + echo "Failed to create $tgtdir/train.txt from $train_text" + exit 1 +else + echo "Removed first word (uid) from every line of $train_text" + # wc text.train train.txt # doesn't work due to some encoding issues + echo $train_text contains `cat $train_text | perl -ane 'BEGIN{$w=$s=0;}{$w+=@F; $w--; $s++;}END{print "$w words, $s sentences\n";}'` + echo train.txt contains `cat $tgtdir/train.txt | perl -ane 'BEGIN{$w=$s=0;}{$w+=@F; $s++;}END{print "$w words, $s sentences\n";}'` +fi + +# Kaldi transcript files contain Utterance_ID as the first word; remove it +sed -e "s/\.CH.//" -e "s/_.\-./_/" $dev_text | sort -u | \ + perl -ane 'print join(" ", @F[1..$#F]) . "\n" if @F > 1' > $tgtdir/dev.txt +if (($?)); then + echo "Failed to create $tgtdir/dev.txt from $dev_text" + exit 1 +else + echo "Removed first word (uid) from every line of $dev_text" + # wc text.train train.txt # doesn't work due to some encoding issues + echo $dev_text contains `cat $dev_text | perl -ane 'BEGIN{$w=$s=0;}{$w+=@F; $w--; $s++;}END{print "$w words, $s sentences\n";}'` + echo $tgtdir/dev.txt contains `cat $tgtdir/dev.txt | perl -ane 'BEGIN{$w=$s=0;}{$w+=@F; $s++;}END{print "$w words, $s sentences\n";}'` +fi + + +echo "-------------------" +echo "Good-Turing 3grams" +echo "-------------------" +ngram-count -lm $tgtdir/3gram.gt011.gz -gt1min 0 -gt2min 1 -gt3min 1 -order 3 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.gt012.gz -gt1min 0 -gt2min 1 -gt3min 2 -order 3 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.gt022.gz -gt1min 0 -gt2min 2 -gt3min 2 -order 3 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.gt023.gz -gt1min 0 -gt2min 2 -gt3min 3 -order 3 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" + +echo "-------------------" +echo "Kneser-Ney 3grams" +echo "-------------------" +ngram-count -lm $tgtdir/3gram.kn011.gz -kndiscount1 -gt1min 0 \ + -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -order 3 -interpolate \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.kn012.gz -kndiscount1 -gt1min 0 \ + -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 2 -order 3 -interpolate \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.kn022.gz -kndiscount1 -gt1min 0 \ + -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 2 -order 3 -interpolate \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.kn023.gz -kndiscount1 -gt1min 0 \ + -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 3 -order 3 -interpolate \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.kn111.gz -kndiscount1 -gt1min 1 \ + -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -order 3 -interpolate \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.kn112.gz -kndiscount1 -gt1min 1 \ + -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 2 -order 3 -interpolate \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.kn122.gz -kndiscount1 -gt1min 1 \ + -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 2 -order 3 -interpolate \ + -text $tgtdir/train.txt -vocab 
$tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.kn123.gz -kndiscount1 -gt1min 1 \ + -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 3 -order 3 -interpolate \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" + + +echo "-------------------" +echo "Good-Turing 4grams" +echo "-------------------" +ngram-count -lm $tgtdir/4gram.gt0111.gz \ + -gt1min 0 -gt2min 1 -gt3min 1 -gt4min 1 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0112.gz \ + -gt1min 0 -gt2min 1 -gt3min 1 -gt4min 2 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0122.gz \ + -gt1min 0 -gt2min 1 -gt3min 2 -gt4min 2 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0123.gz \ + -gt1min 0 -gt2min 1 -gt3min 2 -gt4min 3 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0113.gz \ + -gt1min 0 -gt2min 1 -gt3min 1 -gt4min 3 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0222.gz \ + -gt1min 0 -gt2min 2 -gt3min 2 -gt4min 2 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0223.gz \ + -gt1min 0 -gt2min 2 -gt3min 2 -gt4min 3 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" + +echo "-------------------" +echo "Kneser-Ney 4grams" +echo "-------------------" +ngram-count -lm $tgtdir/4gram.kn0111.gz \ + -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -kndiscount4 -gt4min 1 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0112.gz \ + -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -kndiscount4 -gt4min 2 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0113.gz \ + -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -kndiscount4 -gt4min 3 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0122.gz \ + -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 2 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0123.gz \ + -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 3 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0222.gz \ + -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 2 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0223.gz \ + -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 3 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" + +if [ ! 
-z ${LIBLBFGS} ]; then + #please note that if the switch -map-unk "$oov_symbol" is used with -maxent-convert-to-arpa, ngram-count will segfault + #instead of that, we simply output the model in the maxent format and convert it using the "ngram" + echo "-------------------" + echo "Maxent 3grams" + echo "-------------------" + sed 's/'${oov_symbol}'//g' $tgtdir/train.txt | \ + ngram-count -lm - -order 3 -text - -vocab $tgtdir/vocab -unk -sort -maxent -maxent-convert-to-arpa|\ + ngram -lm - -order 3 -unk -map-unk "$oov_symbol" -prune-lowprobs -write-lm - |\ + sed 's//'${oov_symbol}'/g' | gzip -c > $tgtdir/3gram.me.gz || exit 1 + + echo "-------------------" + echo "Maxent 4grams" + echo "-------------------" + sed 's/'${oov_symbol}'//g' $tgtdir/train.txt | \ + ngram-count -lm - -order 4 -text - -vocab $tgtdir/vocab -unk -sort -maxent -maxent-convert-to-arpa|\ + ngram -lm - -order 4 -unk -map-unk "$oov_symbol" -prune-lowprobs -write-lm - |\ + sed 's//'${oov_symbol}'/g' | gzip -c > $tgtdir/4gram.me.gz || exit 1 +else + echo >&2 "SRILM is not compiled with the support of MaxEnt models." + echo >&2 "You should use the script in \$KALDI_ROOT/tools/install_srilm.sh" + echo >&2 "which will take care of compiling the SRILM with MaxEnt support" + exit 1; +fi + + +echo "--------------------" +echo "Computing perplexity" +echo "--------------------" +( + for f in $tgtdir/3gram* ; do ( echo $f; ngram -order 3 -lm $f -unk -map-unk "$oov_symbol" -prune-lowprobs -ppl $tgtdir/dev.txt ) | paste -s -d ' ' ; done + for f in $tgtdir/4gram* ; do ( echo $f; ngram -order 4 -lm $f -unk -map-unk "$oov_symbol" -prune-lowprobs -ppl $tgtdir/dev.txt ) | paste -s -d ' ' ; done +) | sort -r -n -k 15,15g | column -t | tee $tgtdir/perplexities.txt + +echo "The perlexity scores report is stored in $tgtdir/perplexities.txt " +echo "" + +for best_ngram in {3,4}gram ; do + outlm=best_${best_ngram}.gz + lmfilename=$(grep "${best_ngram}" $tgtdir/perplexities.txt | head -n 1 | cut -f 1 -d ' ') + echo "$outlm -> $lmfilename" + (cd $tgtdir; rm -f $outlm; ln -sf $(basename $lmfilename) $outlm ) +done diff --git a/egs/chime6/s5_track1/local/wer_output_filter b/egs/chime6/s5_track1/local/wer_output_filter new file mode 100755 index 00000000000..6f4b6400716 --- /dev/null +++ b/egs/chime6/s5_track1/local/wer_output_filter @@ -0,0 +1,25 @@ +#!/bin/bash +# Copyright (c) 2017 Johns Hopkins University (Author: Yenda Trmal ) +# Apache 2.0 + + +## Filter for scoring of the STT results. Convert everything to lowercase +## and add some ad-hoc fixes for the hesitations + +perl -e ' + while() { + @A = split(" ", $_); + $id = shift @A; print "$id "; + foreach $a (@A) { + print lc($a) . " " unless $a =~ /\[.*\]/; + } + print "\n"; + }' | \ +sed -e ' + s/\/hmm/g; + s/\/hmm/g; + s/\/hmm/g; +' + +#| uconv -f utf-8 -t utf-8 -x Latin-ASCII + diff --git a/egs/chime6/s5_track1/path.sh b/egs/chime6/s5_track1/path.sh new file mode 100644 index 00000000000..fb1c0489386 --- /dev/null +++ b/egs/chime6/s5_track1/path.sh @@ -0,0 +1,7 @@ +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. 
$KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C + diff --git a/egs/chime6/s5_track1/run.sh b/egs/chime6/s5_track1/run.sh new file mode 100755 index 00000000000..0890a939faf --- /dev/null +++ b/egs/chime6/s5_track1/run.sh @@ -0,0 +1,280 @@ +#!/bin/bash +# +# Based mostly on the TED-LIUM and Switchboard recipe +# +# Copyright 2017 Johns Hopkins University (Author: Shinji Watanabe and Yenda Trmal) +# Apache 2.0 +# + +# Begin configuration section. +nj=96 +decode_nj=20 +stage=0 +nnet_stage=-10 +decode_stage=1 +decode_only=false +num_data_reps=4 +foreground_snrs="20:10:15:5:0" +background_snrs="20:10:15:5:0" +enhancement=beamformit # gss or beamformit + +# End configuration section +. ./utils/parse_options.sh + +. ./cmd.sh +. ./path.sh + +if [ $decode_only == "true" ]; then + stage=16 +fi + +set -e # exit on error + +# chime5 main directory path +# please change the path accordingly +chime5_corpus=/export/corpora4/CHiME5 +# chime6 data directories, which are generated from ${chime5_corpus}, +# to synchronize audio files across arrays and modify the annotation (JSON) file accordingly +chime6_corpus=${PWD}/CHiME6 +json_dir=${chime6_corpus}/transcriptions +audio_dir=${chime6_corpus}/audio + +if [[ ${enhancement} == *gss* ]]; then + enhanced_dir=${enhanced_dir}_multiarray + enhancement=${enhancement}_multiarray +fi + +if [[ ${enhancement} == *beamformit* ]]; then + enhanced_dir=${enhanced_dir} + enhancement=${enhancement} +fi + +test_sets="dev_${enhancement} eval_${enhancement}" +train_set=train_worn_simu_u400k + +# This script also needs the phonetisaurus g2p, srilm, beamformit +./local/check_tools.sh || exit 1 + +########################################################################### +# We first generate the synchronized audio files across arrays and +# corresponding JSON files. Note that this requires sox v14.4.2, +# which is installed via miniconda in ./local/check_tools.sh +########################################################################### + +if [ $stage -le 0 ]; then + local/generate_chime6_data.sh \ + --cmd "$train_cmd" \ + ${chime5_corpus} \ + ${chime6_corpus} +fi + +########################################################################### +# We prepare dict and lang in stages 1 to 3. +########################################################################### + +if [ $stage -le 1 ]; then + echo "$0: prepare data..." + # skip u03 and u04 as they are missing + for mictype in worn u01 u02 u05 u06; do + local/prepare_data.sh --mictype ${mictype} \ + ${audio_dir}/train ${json_dir}/train data/train_${mictype} + done + for dataset in dev; do + for mictype in worn; do + local/prepare_data.sh --mictype ${mictype} \ + ${audio_dir}/${dataset} ${json_dir}/${dataset} \ + data/${dataset}_${mictype} + done + done +fi + +if [ $stage -le 2 ]; then + echo "$0: train lm ..." + local/prepare_dict.sh + + utils/prepare_lang.sh \ + data/local/dict "" data/local/lang data/lang + + local/train_lms_srilm.sh \ + --train-text data/train_worn/text --dev-text data/dev_worn/text \ + --oov-symbol "" --words-file data/lang/words.txt \ + data/ data/srilm +fi + +LM=data/srilm/best_3gram.gz +if [ $stage -le 3 ]; then + # Compiles G for chime5 trigram LM + echo "$0: prepare lang..." + utils/format_lm.sh \ + data/lang $LM data/local/dict/lexicon.txt data/lang + +fi + +######################################################################################### +# In stages 4 to 7, we augment and fix train data for our training purpose. point source +# noises are extracted from chime corpus. 
Here we use 400k utterances from array microphones, +# its augmentation and all the worn set utterances in train. +######################################################################################### + +if [ $stage -le 4 ]; then + # remove possibly bad sessions (P11_S03, P52_S19, P53_S24, P54_S24) + # see http://spandh.dcs.shef.ac.uk/chime_challenge/data.html for more details + utils/copy_data_dir.sh data/train_worn data/train_worn_org # back up + grep -v -e "^P11_S03" -e "^P52_S19" -e "^P53_S24" -e "^P54_S24" data/train_worn_org/text > data/train_worn/text + utils/fix_data_dir.sh data/train_worn +fi + +if [ $stage -le 5 ]; then + local/extract_noises.py $chime6_corpus/audio/train $chime6_corpus/transcriptions/train \ + local/distant_audio_list distant_noises + local/make_noise_list.py distant_noises > distant_noise_list + + noise_list=distant_noise_list + + if [ ! -d RIRS_NOISES/ ]; then + # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises + wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip + unzip rirs_noises.zip + fi + + # This is the config for the system using simulated RIRs and point-source noises + rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/smallroom/rir_list") + rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/mediumroom/rir_list") + rvb_opts+=(--noise-set-parameters $noise_list) + + steps/data/reverberate_data_dir.py \ + "${rvb_opts[@]}" \ + --prefix "rev" \ + --foreground-snrs $foreground_snrs \ + --background-snrs $background_snrs \ + --speech-rvb-probability 1 \ + --pointsource-noise-addition-probability 1 \ + --isotropic-noise-addition-probability 1 \ + --num-replications $num_data_reps \ + --max-noises-per-minute 1 \ + --source-sampling-rate 16000 \ + data/train_worn data/train_worn_rvb +fi + +if [ $stage -le 6 ]; then + # combine mix array and worn mics + # randomly extract first 400k utterances from all mics + # if you want to include more training data, you can increase the number of array mic utterances + utils/combine_data.sh data/train_uall data/train_u01 data/train_u02 data/train_u05 data/train_u06 + utils/subset_data_dir.sh data/train_uall 400000 data/train_u400k + utils/combine_data.sh data/${train_set} data/train_worn data/train_worn_rvb data/train_u400k + + # only use left channel for worn mic recognition + # you can use both left and right channels for training + for dset in train dev; do + utils/copy_data_dir.sh data/${dset}_worn data/${dset}_worn_stereo + grep "\.L-" data/${dset}_worn_stereo/text > data/${dset}_worn/text + utils/fix_data_dir.sh data/${dset}_worn + done +fi + +if [ $stage -le 7 ]; then + # Split speakers up into 3-minute chunks. This doesn't hurt adaptation, and + # lets us use more jobs for decoding etc. + for dset in ${train_set}; do + utils/copy_data_dir.sh data/${dset} data/${dset}_nosplit + utils/data/modify_speaker_info.sh --seconds-per-spk-max 180 data/${dset}_nosplit data/${dset} + done +fi + +################################################################################## +# Now make 13-dim MFCC features. We use 13-dim fetures for GMM-HMM systems. +################################################################################## + +if [ $stage -le 8 ]; then + # Now make MFCC features. + # mfccdir should be some place with a largish disk where you + # want to store MFCC features. + echo "$0: make features..." 
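+  # What follows is the standard Kaldi feature pipeline: steps/make_mfcc.sh
+  # extracts 13-dimensional MFCCs (using conf/mfcc.conf), compute_cmvn_stats.sh
+  # accumulates per-speaker cepstral mean/variance statistics, and
+  # fix_data_dir.sh keeps the data directory self-consistent.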
+ mfccdir=mfcc + for x in ${train_set}; do + steps/make_mfcc.sh --nj 20 --cmd "$train_cmd" \ + data/$x exp/make_mfcc/$x $mfccdir + steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir + utils/fix_data_dir.sh data/$x + done +fi + +################################################################################### +# Stages 9 to 13 train monophone and triphone models. They will be used for +# generating lattices for training the chain model +################################################################################### + +if [ $stage -le 9 ]; then + # make a subset for monophone training + utils/subset_data_dir.sh --shortest data/${train_set} 100000 data/${train_set}_100kshort + utils/subset_data_dir.sh data/${train_set}_100kshort 30000 data/${train_set}_30kshort +fi + +if [ $stage -le 10 ]; then + # Starting basic training on MFCC features + steps/train_mono.sh --nj $nj --cmd "$train_cmd" \ + data/${train_set}_30kshort data/lang exp/mono +fi + +if [ $stage -le 11 ]; then + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/${train_set} data/lang exp/mono exp/mono_ali + + steps/train_deltas.sh --cmd "$train_cmd" \ + 2500 30000 data/${train_set} data/lang exp/mono_ali exp/tri1 +fi + +if [ $stage -le 12 ]; then + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/${train_set} data/lang exp/tri1 exp/tri1_ali + + steps/train_lda_mllt.sh --cmd "$train_cmd" \ + 4000 50000 data/${train_set} data/lang exp/tri1_ali exp/tri2 +fi + +if [ $stage -le 13 ]; then + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/${train_set} data/lang exp/tri2 exp/tri2_ali + + steps/train_sat.sh --cmd "$train_cmd" \ + 5000 100000 data/${train_set} data/lang exp/tri2_ali exp/tri3 +fi + +####################################################################### +# Perform data cleanup for training data. +####################################################################### + +if [ $stage -le 14 ]; then + # The following script cleans the data and produces cleaned data + steps/cleanup/clean_and_segment_data.sh --nj ${nj} --cmd "$train_cmd" \ + --segmentation-opts "--min-segment-length 0.3 --min-new-segment-length 0.6" \ + data/${train_set} data/lang exp/tri3 exp/tri3_cleaned data/${train_set}_cleaned +fi + +########################################################################## +# CHAIN MODEL TRAINING +# skipping decoding here and performing it in step 16 +########################################################################## + +if [ $stage -le 15 ]; then + # chain TDNN + local/chain/run_tdnn.sh --nj ${nj} \ + --stage $nnet_stage \ + --train-set ${train_set}_cleaned \ + --test-sets "$test_sets" \ + --gmm tri3_cleaned --nnet3-affix _${train_set}_cleaned_rvb +fi + +########################################################################## +# DECODING is done in the local/decode.sh script. 
This script performs
+# enhancement, fixes the test sets, performs feature extraction, and two-stage decoding.
+##########################################################################
+
+if [ $stage -le 16 ]; then
+  local/decode.sh --stage $decode_stage \
+    --enhancement $enhancement \
+    --train_set "$train_set"
+fi
+
+exit 0;
diff --git a/egs/chime6/s5_track1/steps b/egs/chime6/s5_track1/steps
new file mode 120000
index 00000000000..1b186770dd1
--- /dev/null
+++ b/egs/chime6/s5_track1/steps
@@ -0,0 +1 @@
+../../wsj/s5/steps/
\ No newline at end of file
diff --git a/egs/chime6/s5_track1/utils b/egs/chime6/s5_track1/utils
new file mode 120000
index 00000000000..a3279dc8679
--- /dev/null
+++ b/egs/chime6/s5_track1/utils
@@ -0,0 +1 @@
+../../wsj/s5/utils/
\ No newline at end of file
diff --git a/egs/chime6/s5_track2/RESULTS b/egs/chime6/s5_track2/RESULTS
new file mode 100644
index 00000000000..cf87e7cc109
--- /dev/null
+++ b/egs/chime6/s5_track2/RESULTS
@@ -0,0 +1,18 @@
+# Results for CHiME-6 track 2 for dev and eval, using pretrained models
+# available at http://kaldi-asr.org/models/m12.
+
+# Speech Activity Detection (SAD)
+       Missed speech   False alarm   Total error
+Dev    4.3             2.1           6.4
+Eval   5.6             5.9           11.5
+
+# The results for the remaining pipeline are only for array U06.
+
+# Diarization
+       DER     JER
+Dev    57.15   83.96
+Eval   54.12   80.33
+
+# ASR nnet3 tdnn+chain
+Dev:  U06 %WER 81.18 [ 47798 / 58881, 1638 ins, 30528 del, 15632 sub ]
+Eval: U06 %WER 85.39 [ 47076 / 55132, 1107 ins, 27768 del, 18201 sub ]
diff --git a/egs/chime6/s5_track2/cmd.sh b/egs/chime6/s5_track2/cmd.sh
new file mode 100644
index 00000000000..86514d94d4d
--- /dev/null
+++ b/egs/chime6/s5_track2/cmd.sh
@@ -0,0 +1,14 @@
+# You can change cmd.sh depending on what type of queue you are using.
+# If you have no queueing system and want to run on a local machine, you
+# can change all instances of 'queue.pl' to 'run.pl' (but be careful and run
+# commands one by one: most recipes will exhaust the memory on your
+# machine). queue.pl works with GridEngine (qsub). slurm.pl works
+# with slurm. Different queues are configured differently, with different
+# queue names and different ways of specifying things like memory;
+# to account for these differences you can create and edit the file
+# conf/queue.conf to match your queue's configuration. Search for
+# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
+# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.
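+#
+# For example, if you have no grid and want to run everything locally
+# (an illustration only, not part of this recipe's default setup), you could use:
+#   export train_cmd=run.pl
+#   export decode_cmd=run.pl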
+
+export train_cmd="retry.pl queue.pl --mem 2G"
+export decode_cmd="queue.pl --mem 4G"
diff --git a/egs/chime6/s5_track2/conf/beamformit.cfg b/egs/chime6/s5_track2/conf/beamformit.cfg
new file mode 100755
index 00000000000..70fdd858651
--- /dev/null
+++ b/egs/chime6/s5_track2/conf/beamformit.cfg
@@ -0,0 +1,50 @@
+# BeamformIt sample configuration file for AMI data (http://groups.inf.ed.ac.uk/ami/download/)
+
+# scrolling size to compute the delays
+scroll_size = 250
+
+# cross correlation computation window size
+window_size = 500
+
+# maximum number of cross-correlation peaks taken into account
+nbest_amount = 4
+
+# flag whether to apply an automatic noise thresholding
+do_noise_threshold = 1
+
+# percentage of frames with lower xcorr taken as noisy
+noise_percent = 10
+
+######## acoustic modelling parameters
+
+# transition probabilities weight for multichannel decoding
+trans_weight_multi = 25
+trans_weight_nbest = 25
+
+###
+
+# flag whether to print the features after setting them, or not
+print_features = 1
+
+# flag whether to use the bad frames in the sum process
+do_avoid_bad_frames = 1
+
+# flag to use the best channel (SNR) as a reference
+# defined from command line
+do_compute_reference = 1
+
+# flag whether to use a uem file or not (process the whole file)
+do_use_uem_file = 0
+
+# flag whether to use an adaptive weights scheme or fixed weights
+do_adapt_weights = 1
+
+# flag whether to output the sph files or just run the system to create the auxiliary files
+do_write_sph_files = 1
+
+#### directories where to store/retrieve info ####
+#channels_file = ./cfg-files/channels
+
+# show needs to be passed as argument normally, here a default one is given just in case
+#show_id = Ttmp
+
diff --git a/egs/chime6/s5_track2/conf/mfcc.conf b/egs/chime6/s5_track2/conf/mfcc.conf
new file mode 100644
index 00000000000..32988403b00
--- /dev/null
+++ b/egs/chime6/s5_track2/conf/mfcc.conf
@@ -0,0 +1,2 @@
+--use-energy=false
+--sample-frequency=16000
diff --git a/egs/chime6/s5_track2/conf/mfcc_hires.conf b/egs/chime6/s5_track2/conf/mfcc_hires.conf
new file mode 100644
index 00000000000..fd64b62eb16
--- /dev/null
+++ b/egs/chime6/s5_track2/conf/mfcc_hires.conf
@@ -0,0 +1,10 @@
+# config for high-resolution MFCC features, intended for neural network training.
+# Note: we keep all cepstra, so it has the same info as filterbank features,
+# but MFCC is more easily compressible (because less correlated), which is why
+# we prefer this method.
+--use-energy=false   # use average of log energy, not energy.
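+# The options below keep 40 mel bins and 40 cepstra (40-dimensional features)
+# over a slightly trimmed frequency range.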
+--sample-frequency=16000 +--num-mel-bins=40 +--num-ceps=40 +--low-freq=40 +--high-freq=-400 diff --git a/egs/chime6/s5_track2/conf/online_cmvn.conf b/egs/chime6/s5_track2/conf/online_cmvn.conf new file mode 100644 index 00000000000..7748a4a4dd3 --- /dev/null +++ b/egs/chime6/s5_track2/conf/online_cmvn.conf @@ -0,0 +1 @@ +# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh diff --git a/egs/chime6/s5_track2/conf/sad.conf b/egs/chime6/s5_track2/conf/sad.conf new file mode 100644 index 00000000000..752bb1cf6c5 --- /dev/null +++ b/egs/chime6/s5_track2/conf/sad.conf @@ -0,0 +1,2 @@ +affix=_1a +nnet_type=stats diff --git a/egs/chime6/s5_track2/diarization b/egs/chime6/s5_track2/diarization new file mode 120000 index 00000000000..bad937c1444 --- /dev/null +++ b/egs/chime6/s5_track2/diarization @@ -0,0 +1 @@ +../../callhome_diarization/v1/diarization \ No newline at end of file diff --git a/egs/chime6/s5_track2/local/chain b/egs/chime6/s5_track2/local/chain new file mode 120000 index 00000000000..dd7910711d1 --- /dev/null +++ b/egs/chime6/s5_track2/local/chain @@ -0,0 +1 @@ +../../s5_track1/local/chain/ \ No newline at end of file diff --git a/egs/chime6/s5_track2/local/check_dset_error.py b/egs/chime6/s5_track2/local/check_dset_error.py new file mode 100755 index 00000000000..0ed7f59ae83 --- /dev/null +++ b/egs/chime6/s5_track2/local/check_dset_error.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python3 +# Copyright 2019 Ashish Arora +# Apache 2.0. + +import argparse +import sys, os +import string + +def get_args(): + parser = argparse.ArgumentParser( + description="""This script splits a kaldi text file + into per_speaker per_session text files""") + parser.add_argument("wer_dir_path", type=str, + help="path of directory containing wer files") + parser.add_argument("output_dir_path", type=str, + help="path of the directory containing per speaker output files") + args = parser.parse_args() + return args + +def get_results(filename): + with open(filename) as f: + first_line = f.readline() + parts = first_line.strip().split(',') + total_words = parts[0].split()[-1] + ins = parts[1].split()[0] + deletion = parts[2].split()[0] + sub = parts[3].split()[0] + return int(total_words), int(ins), int(deletion), int(sub) + +def main(): + args = get_args() + recodingid_error_dict={} + min_wer_per_recording = os.path.join(args.wer_dir_path, 'all.txt') + for line in open(min_wer_per_recording, 'r', encoding='utf8'): + toks = line.strip().split() + recordingid = toks[1] + total_words = toks[-5][:-1] + total_errors = toks[-4][:-1] + total_ins = toks[-3][:-1] + total_del = toks[-2][:-1] + total_sub = toks[-1] + recodingid_error_dict[recordingid]=(total_words, total_errors, total_ins, total_del, total_sub) + + recording_spkorder_file = os.path.join(args.output_dir_path, 'recordinid_spkorder') + for line in open(recording_spkorder_file, 'r', encoding='utf8'): + parts = line.strip().split(':') + recordingid = parts[0] + spkorder = parts[1] + spkorder_list=spkorder.split('_') + num_speakers=len(spkorder_list) + total_errors=total_words=total_ins=total_del=total_sub=0 + for i in range(1, num_speakers+1): + filename = 'wer_' + recordingid + '_' + 'r' + str(i)+ 'h' + str(spkorder_list[i-1]) + wer_filename = os.path.join(args.wer_dir_path, filename) + words, ins, deletion, sub = get_results(wer_filename) + total_words += words + total_ins += ins + total_del += deletion + total_sub += sub + total_errors += ins + deletion + sub + assert int(total_words) == 
int(recodingid_error_dict[recordingid][0]), "Total words mismatch" + assert int(total_errors) == int(recodingid_error_dict[recordingid][1]), "Total errors mismatch" + assert int(total_ins) == int(recodingid_error_dict[recordingid][2]), "Total insertions mismatch" + assert int(total_del) == int(recodingid_error_dict[recordingid][3]), "Total deletions mismatch" + assert int(total_sub) == int(recodingid_error_dict[recordingid][4]), "Total substitutions mismatch" + + +if __name__ == '__main__': + main() diff --git a/egs/chime6/s5_track2/local/check_tools.sh b/egs/chime6/s5_track2/local/check_tools.sh new file mode 120000 index 00000000000..4e835e887f2 --- /dev/null +++ b/egs/chime6/s5_track2/local/check_tools.sh @@ -0,0 +1 @@ +../../s5_track1/local/check_tools.sh \ No newline at end of file diff --git a/egs/chime6/s5_track2/local/convert_rttm_to_utt2spk_and_segments.py b/egs/chime6/s5_track2/local/convert_rttm_to_utt2spk_and_segments.py new file mode 100755 index 00000000000..410dced190c --- /dev/null +++ b/egs/chime6/s5_track2/local/convert_rttm_to_utt2spk_and_segments.py @@ -0,0 +1,98 @@ +#! /usr/bin/env python +# Copyright 2019 Vimal Manohar +# Apache 2.0. + +"""This script converts an RTTM with +speaker info into kaldi utt2spk and segments""" + +import argparse + +def get_args(): + parser = argparse.ArgumentParser( + description="""This script converts an RTTM with + speaker info into kaldi utt2spk and segments""") + parser.add_argument("--use-reco-id-as-spkr", type=str, + choices=["true", "false"], default="false", + help="Use the recording ID based on RTTM and " + "reco2file_and_channel as the speaker") + parser.add_argument("--append-reco-id-to-spkr", type=str, + choices=["true", "false"], default="false", + help="Append recording ID to the speaker ID") + + parser.add_argument("rttm_file", type=str, + help="""Input RTTM file. + The format of the RTTM file is + """ + """ """) + parser.add_argument("reco2file_and_channel", type=str, + help="""Input reco2file_and_channel. 
+ The format is .""") + parser.add_argument("utt2spk", type=str, + help="Output utt2spk file") + parser.add_argument("segments", type=str, + help="Output segments file") + + args = parser.parse_args() + + args.use_reco_id_as_spkr = bool(args.use_reco_id_as_spkr == "true") + args.append_reco_id_to_spkr = bool(args.append_reco_id_to_spkr == "true") + + if args.use_reco_id_as_spkr: + if args.append_reco_id_to_spkr: + raise Exception("Appending recording ID to speaker does not make sense when using --use-reco-id-as-spkr=true") + + return args + +def main(): + args = get_args() + + file_and_channel2reco = {} + utt2spk={} + segments={} + for line in open(args.reco2file_and_channel): + parts = line.strip().split() + file_and_channel2reco[(parts[1], parts[2])] = parts[0] + + utt2spk_writer = open(args.utt2spk, 'w') + segments_writer = open(args.segments, 'w') + for line in open(args.rttm_file): + parts = line.strip().split() + if parts[0] != "SPEAKER": + continue + + file_id = parts[1] + channel = parts[2] + + try: + reco = file_and_channel2reco[(file_id, channel)] + except KeyError as e: + raise Exception("Could not find recording with " + "(file_id, channel) " + "= ({0},{1}) in {2}: {3}\n".format( + file_id, channel, + args.reco2file_and_channel, str(e))) + + start_time = float(parts[3]) + end_time = start_time + float(parts[4]) + + if args.use_reco_id_as_spkr: + spkr = reco + else: + if args.append_reco_id_to_spkr: + spkr = reco + "-" + parts[7] + else: + spkr = parts[7] + + st = int(start_time * 100) + end = int(end_time * 100) + utt = "{0}-{1:06d}-{2:06d}".format(spkr, st, end) + utt2spk[utt]=spkr + segments[utt]=(reco, start_time, end_time) + + for uttid_id in sorted(utt2spk): + utt2spk_writer.write("{0} {1}\n".format(uttid_id, utt2spk[uttid_id])) + segments_writer.write("{0} {1} {2:7.2f} {3:7.2f}\n".format( + uttid_id, segments[uttid_id][0], segments[uttid_id][1], segments[uttid_id][2])) + +if __name__ == '__main__': + main() diff --git a/egs/chime6/s5_track2/local/copy_lat_dir_parallel.sh b/egs/chime6/s5_track2/local/copy_lat_dir_parallel.sh new file mode 120000 index 00000000000..a168a917d92 --- /dev/null +++ b/egs/chime6/s5_track2/local/copy_lat_dir_parallel.sh @@ -0,0 +1 @@ +../../s5_track1/local/copy_lat_dir_parallel.sh \ No newline at end of file diff --git a/egs/chime6/s5_track2/local/decode.sh b/egs/chime6/s5_track2/local/decode.sh new file mode 100755 index 00000000000..876cc0be126 --- /dev/null +++ b/egs/chime6/s5_track2/local/decode.sh @@ -0,0 +1,176 @@ +#!/bin/bash +# +# This script decodes raw utterances through the entire pipeline: +# Feature extraction -> SAD -> Diarization -> ASR +# +# Copyright 2017 Johns Hopkins University (Author: Shinji Watanabe and Yenda Trmal) +# 2019 Desh Raj, David Snyder, Ashish Arora +# Apache 2.0 + +# Begin configuration section. +nj=8 +decode_nj=10 +stage=0 +sad_stage=0 +diarizer_stage=0 +decode_diarize_stage=0 +score_stage=0 +enhancement=beamformit + +# chime5 main directory path +# please change the path accordingly +chime5_corpus=/export/corpora4/CHiME5 +# chime6 data directories, which are generated from ${chime5_corpus}, +# to synchronize audio files across arrays and modify the annotation (JSON) file accordingly +chime6_corpus=${PWD}/CHiME6 +json_dir=${chime6_corpus}/transcriptions +audio_dir=${chime6_corpus}/audio + +enhanced_dir=enhanced +enhanced_dir=$(utils/make_absolute.sh $enhanced_dir) || exit 1 + +# training data +train_set=train_worn_simu_u400k +test_sets="dev_${enhancement}_dereverb eval_${enhancement}_dereverb" + +. 
./utils/parse_options.sh + +. ./cmd.sh +. ./path.sh +. ./conf/sad.conf + +# This script also needs the phonetisaurus g2p, srilm, beamformit +./local/check_tools.sh || exit 1 + +########################################################################### +# We first generate the synchronized audio files across arrays and +# corresponding JSON files. Note that this requires sox v14.4.2, +# which is installed via miniconda in ./local/check_tools.sh +########################################################################### + +if [ $stage -le 0 ]; then + local/generate_chime6_data.sh \ + --cmd "$train_cmd" \ + ${chime5_corpus} \ + ${chime6_corpus} +fi + +####################################################################### +# Prepare the dev and eval data with dereverberation (WPE) and +# beamforming. +####################################################################### +if [ $stage -le 1 ]; then + # Beamforming using reference arrays + # enhanced WAV directory + enhandir=enhan + dereverb_dir=${PWD}/wav/wpe/ + + for dset in dev eval; do + for mictype in u01 u02 u03 u04 u06; do + local/run_wpe.sh --nj 4 --cmd "$train_cmd --mem 20G" \ + ${audio_dir}/${dset} \ + ${dereverb_dir}/${dset} \ + ${mictype} + done + done + + for dset in dev eval; do + for mictype in u01 u02 u03 u04 u06; do + local/run_beamformit.sh --cmd "$train_cmd" \ + ${dereverb_dir}/${dset} \ + ${enhandir}/${dset}_${enhancement}_${mictype} \ + ${mictype} + done + done + + # Note that for the evaluation sets, we use the flag + # "--train false". This keeps the files segments, text, + # and utt2spk with .bak extensions, so that they can + # be used later for scoring if needed but are not used + # in the intermediate stages. + for dset in dev eval; do + local/prepare_data.sh --mictype ref --train false \ + "$PWD/${enhandir}/${dset}_${enhancement}_u0*" \ + ${json_dir}/${dset} data/${dset}_${enhancement}_dereverb + done +fi + +if [ $stage -le 2 ]; then + # mfccdir should be some place with a largish disk where you + # want to store MFCC features. + mfccdir=mfcc + for x in ${test_sets}; do + steps/make_mfcc.sh --nj $decode_nj --cmd "$train_cmd" \ + --mfcc-config conf/mfcc_hires.conf \ + data/$x exp/make_mfcc/$x $mfccdir + done +fi + +####################################################################### +# Perform SAD on the dev/eval data +####################################################################### +dir=exp/segmentation${affix} +sad_work_dir=exp/sad${affix}_${nnet_type}/ +sad_nnet_dir=$dir/tdnn_${nnet_type}_sad_1a + +if [ $stage -le 3 ]; then + for datadir in ${test_sets}; do + test_set=data/${datadir} + if [ ! -f ${test_set}/wav.scp ]; then + echo "$0: Not performing SAD on ${test_set}" + exit 0 + fi + # Perform segmentation + local/segmentation/detect_speech_activity.sh --nj $decode_nj --stage $sad_stage \ + $test_set $sad_nnet_dir mfcc $sad_work_dir \ + data/${datadir} || exit 1 + + mv data/${datadir}_seg data/${datadir}_${nnet_type}_seg + mv data/${datadir}/{segments.bak,utt2spk.bak} data/${datadir}_${nnet_type}_seg + # Generate RTTM file from segmentation performed by SAD. This can + # be used to evaluate the performance of the SAD as an intermediate + # step. 
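+    # An RTTM file contains one "SPEAKER" line per segment, e.g. (hypothetical
+    # values, shown only for illustration):
+    #   SPEAKER <file-id> <channel> <onset> <duration> <NA> <NA> <speaker> <NA> <NA>
+    # The same format is consumed later by
+    # local/convert_rttm_to_utt2spk_and_segments.py after diarization.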
+ steps/segmentation/convert_utt2spk_and_segments_to_rttm.py \ + data/${datadir}_${nnet_type}_seg/utt2spk data/${datadir}_${nnet_type}_seg/segments \ + data/${datadir}_${nnet_type}_seg/rttm + done +fi + +####################################################################### +# Perform diarization on the dev/eval data +####################################################################### +if [ $stage -le 4 ]; then + for datadir in ${test_sets}; do + local/diarize.sh --nj 10 --cmd "$train_cmd" --stage $diarizer_stage \ + exp/xvector_nnet_1a \ + data/${datadir}_${nnet_type}_seg \ + exp/${datadir}_${nnet_type}_seg_diarization + done +fi + +####################################################################### +# Decode diarized output using trained chain model +####################################################################### +if [ $stage -le 5 ]; then + for datadir in ${test_sets}; do + local/decode_diarized.sh --nj $nj --cmd "$decode_cmd" --stage $decode_diarize_stage \ + exp/${datadir}_${nnet_type}_seg_diarization data/$datadir data/lang \ + exp/chain_${train_set}_cleaned_rvb exp/nnet3_${train_set}_cleaned_rvb \ + data/${datadir}_diarized + done +fi + +####################################################################### +# Score decoded dev/eval sets +####################################################################### +if [ $stage -le 6 ]; then + # final scoring to get the challenge result + # please specify both dev and eval set directories so that the search parameters + # (insertion penalty and language model weight) will be tuned using the dev set + local/score_for_submit.sh --stage $score_stage \ + --dev_decodedir exp/chain_${train_set}_cleaned_rvb/tdnn1b_sp/decode_dev_beamformit_dereverb_diarized_2stage \ + --dev_datadir dev_beamformit_dereverb_diarized_hires \ + --eval_decodedir exp/chain_${train_set}_cleaned_rvb/tdnn1b_sp/decode_eval_beamformit_dereverb_diarized_2stage \ + --eval_datadir eval_beamformit_dereverb_diarized_hires +fi +exit 0; diff --git a/egs/chime6/s5_track2/local/decode_diarized.sh b/egs/chime6/s5_track2/local/decode_diarized.sh new file mode 100755 index 00000000000..2d0ad6a3b95 --- /dev/null +++ b/egs/chime6/s5_track2/local/decode_diarized.sh @@ -0,0 +1,71 @@ +#!/bin/bash +# Copyright 2019 Ashish Arora, Vimal Manohar +# Apache 2.0. +# This script takes an rttm file, and performs decoding on on a test directory. +# The output directory contains a text file which can be used for scoring. + + +stage=0 +nj=8 +cmd=queue.pl +echo "$0 $@" # Print the command line for logging +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; +if [ $# != 6 ]; then + echo "Usage: $0 " + echo "e.g.: $0 data/rttm data/dev data/lang_chain exp/chain_train_worn_simu_u400k_cleaned_rvb \ + exp/nnet3_train_worn_simu_u400k_cleaned_rvb data/dev_diarized" + echo "Options: " + echo " --nj # number of parallel jobs." + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + exit 1; +fi + +rttm_dir=$1 +data_in=$2 +lang_dir=$3 +asr_model_dir=$4 +ivector_extractor=$5 +out_dir=$6 + +for f in $rttm_dir/rttm $data_in/wav.scp $data_in/text.bak \ + $lang_dir/L.fst $asr_model_dir/tree_sp/graph/HCLG.fst \ + $asr_model_dir/tdnn1b_sp/final.mdl; do + [ ! 
-f $f ] && echo "$0: No such file $f" && exit 1; +done + +if [ $stage -le 0 ]; then + echo "$0 copying data files in output directory" + cp $rttm_dir/rttm $rttm_dir/rttm_1 + sed -i 's/'.ENH'/''/g' $rttm_dir/rttm_1 + mkdir -p ${out_dir}_hires + cp ${data_in}/{wav.scp,utt2spk} ${out_dir}_hires + utils/data/get_reco2dur.sh ${out_dir}_hires +fi + +if [ $stage -le 1 ]; then + echo "$0 creating segments file from rttm and utt2spk, reco2file_and_channel " + local/convert_rttm_to_utt2spk_and_segments.py --append-reco-id-to-spkr=true $rttm_dir/rttm_1 \ + <(awk '{print $2".ENH "$2" "$3}' $rttm_dir/rttm_1 |sort -u) \ + ${out_dir}_hires/utt2spk ${out_dir}_hires/segments + + utils/utt2spk_to_spk2utt.pl ${out_dir}_hires/utt2spk > ${out_dir}_hires/spk2utt + + awk '{print $1" "$1" 1"}' ${out_dir}_hires/wav.scp > ${out_dir}_hires/reco2file_and_channel + utils/fix_data_dir.sh ${out_dir}_hires || exit 1; +fi + +if [ $stage -le 2 ]; then + echo "$0 extracting mfcc freatures using segments file" + steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj $nj --cmd queue.pl ${out_dir}_hires + steps/compute_cmvn_stats.sh ${out_dir}_hires + cp $data_in/text.bak ${out_dir}_hires/text +fi + +if [ $stage -le 3 ]; then + echo "$0 performing decoding on the extracted features" + local/nnet3/decode.sh --affix 2stage --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk 150 --nj $nj --ivector-dir $ivector_extractor \ + $out_dir $lang_dir $asr_model_dir/tree_sp/graph $asr_model_dir/tdnn1b_sp/ +fi + diff --git a/egs/chime6/s5_track2/local/diarize.sh b/egs/chime6/s5_track2/local/diarize.sh new file mode 100755 index 00000000000..561d5fe7755 --- /dev/null +++ b/egs/chime6/s5_track2/local/diarize.sh @@ -0,0 +1,113 @@ +#!/bin/bash +# Copyright 2019 David Snder +# Apache 2.0. +# +# This script takes an input directory that has a segments file (and +# a feats.scp file), and performs diarization on it. The output directory +# contains an RTTM file which can be used to resegment the input data. + +stage=0 +nj=10 +cmd="run.pl" +ref_rttm= + +echo "$0 $@" # Print the command line for logging +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; +if [ $# != 3 ]; then + echo "Usage: $0 " + echo "e.g.: $0 exp/xvector_nnet_1a data/dev exp/dev_diarization" + echo "Options: " + echo " --nj # number of parallel jobs." + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --ref-rttm # if present, used to score output RTTM." + exit 1; +fi + +model_dir=$1 +data_in=$2 +out_dir=$3 + +name=`basename $data_in` + +for f in $data_in/feats.scp $data_in/segments $model_dir/plda \ + $model_dir/final.raw $model_dir/extract.config; do + [ ! 
-f $f ] && echo "$0: No such file $f" && exit 1; +done + +if [ $stage -le 0 ]; then + echo "$0: keeping only data corresponding to array U06 " + echo "$0: we can skip this stage, to perform diarization on all arrays " + # to perform diarization ond scoring on all array please skip this step and + # pass all_array = true in local/multispeaker_score.sh + cp -r data/$name data/${name}.bak + mv data/$name/wav.scp data/$name/wav.scp.bak + grep 'U06' data/$name/wav.scp.bak > data/$name/wav.scp + utils/fix_data_dir.sh data/$name + nj=2 # since we have reduced number of "speakers" now +fi + +if [ $stage -le 1 ]; then + echo "$0: computing features for x-vector extractor" + utils/fix_data_dir.sh data/${name} + rm -rf data/${name}_cmn + local/nnet3/xvector/prepare_feats.sh --nj $nj --cmd "$cmd" \ + data/$name data/${name}_cmn exp/${name}_cmn + cp data/$name/segments exp/${name}_cmn/ + utils/fix_data_dir.sh data/${name}_cmn +fi + +if [ $stage -le 2 ]; then + echo "$0: extracting x-vectors for all segments" + diarization/nnet3/xvector/extract_xvectors.sh --cmd "$cmd" \ + --nj $nj --window 1.5 --period 0.75 --apply-cmn false \ + --min-segment 0.5 $model_dir \ + data/${name}_cmn $out_dir/xvectors_${name} +fi + +# Perform PLDA scoring +if [ $stage -le 3 ]; then + # Perform PLDA scoring on all pairs of segments for each recording. + echo "$0: performing PLDA scoring between all pairs of x-vectors" + diarization/nnet3/xvector/score_plda.sh --cmd "$cmd" \ + --target-energy 0.5 \ + --nj $nj $model_dir/ $out_dir/xvectors_${name} \ + $out_dir/xvectors_${name}/plda_scores +fi + +if [ $stage -le 4 ]; then + echo "$0: performing clustering using PLDA scores (we assume 4 speakers per recording)" + awk '{print $1, "4"}' data/$name/wav.scp > data/$name/reco2num_spk + diarization/cluster.sh --cmd "$cmd" --nj $nj \ + --reco2num-spk data/$name/reco2num_spk \ + --rttm-channel 1 \ + $out_dir/xvectors_${name}/plda_scores $out_dir + echo "$0: wrote RTTM to output directory ${out_dir}" +fi + +# For scoring the diarization system, we use the same tool that was +# used in the DIHARD II challenge. This is available at: +# https://github.com/nryant/dscore +if [ $stage -le 5 ]; then + # If a reference RTTM file is not provided, we create one using the backed up + # segments and utt2spk files in the original data directory. + if [ -z $ref_rttm ]; then + ref_rttm=data/$name/rttm + echo "$0: preparing ref RTTM file from segments and utt2spk" + steps/segmentation/convert_utt2spk_and_segments_to_rttm.py data/$name/utt2spk.bak \ + data/$name/segments.bak $ref_rttm + fi + grep 'U06' $ref_rttm > ${ref_rttm}.U06 + ref_rttm_path=$(readlink -f ${ref_rttm}.U06) + out_rttm_path=$(readlink -f $out_dir/rttm) + if ! [ -d dscore ]; then + git clone https://github.com/nryant/dscore.git || exit 1; + cd dscore + python -m pip install --user -r requirements.txt + cd .. + fi + cd dscore + python score.py -r $ref_rttm_path -s $out_rttm_path + cd .. 
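+  # score.py reports the diarization error rate (DER) and Jaccard error rate
+  # (JER) of $out_dir/rttm against the U06 reference RTTM; these are the
+  # metrics quoted in the RESULTS file.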
+fi + diff --git a/egs/chime6/s5_track2/local/distant_audio_list b/egs/chime6/s5_track2/local/distant_audio_list new file mode 120000 index 00000000000..0455876cf4d --- /dev/null +++ b/egs/chime6/s5_track2/local/distant_audio_list @@ -0,0 +1 @@ +../../s5_track1/local/distant_audio_list \ No newline at end of file diff --git a/egs/chime6/s5_track2/local/extract_noises.py b/egs/chime6/s5_track2/local/extract_noises.py new file mode 120000 index 00000000000..04a6389916d --- /dev/null +++ b/egs/chime6/s5_track2/local/extract_noises.py @@ -0,0 +1 @@ +../../s5_track1/local/extract_noises.py \ No newline at end of file diff --git a/egs/chime6/s5_track2/local/extract_vad_weights.sh b/egs/chime6/s5_track2/local/extract_vad_weights.sh new file mode 120000 index 00000000000..0db29cded5d --- /dev/null +++ b/egs/chime6/s5_track2/local/extract_vad_weights.sh @@ -0,0 +1 @@ +../../s5_track1/local/extract_vad_weights.sh \ No newline at end of file diff --git a/egs/chime6/s5_track2/local/gen_aligned_hyp.py b/egs/chime6/s5_track2/local/gen_aligned_hyp.py new file mode 100755 index 00000000000..acaa3a13ad5 --- /dev/null +++ b/egs/chime6/s5_track2/local/gen_aligned_hyp.py @@ -0,0 +1,43 @@ +#!/usr/bin/env python3 +# Copyright 2019 Yusuke Fujita +# Apache 2.0. + +"""This script generates hypothesis utterances aligned with reference segments. + Usage: gen_align_hyp.py alignment.txt wc.txt > hyp.txt + alignment.txt is a session-level word alignment generated by align-text command. + wc.txt is a sequence of utt-id:reference_word_count generated by 'local/get_ref_perspeaker_persession_file.py'. +""" + +import sys, io +import string +output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') + +def load_align_text(f): + alignments = {} + for line in f: + recoid, res = line.split(None, 1) + alignments[recoid] = [] + toks = res.split(';') + for tok in toks: + ref, hyp = tok.split() + alignments[recoid].append((ref, hyp)) + return alignments + +alignments = load_align_text(open(sys.argv[1],'r', encoding='utf8')) + +for line in open(sys.argv[2],'r', encoding='utf8'): + recoid, res = line.split(None, 1) + ali = iter(alignments[recoid]) + toks = res.split() + for tok in toks: + uttid, count = tok.split(':') + count = int(count) + text = '' + for i in range(count): + while True: + ref, hyp = ali.__next__() + if hyp != '': + text += ' ' + hyp + if ref != '': + break + output.write(uttid + ' ' + text.strip() + '\n') diff --git a/egs/chime6/s5_track2/local/generate_chime6_data.sh b/egs/chime6/s5_track2/local/generate_chime6_data.sh new file mode 120000 index 00000000000..62882cd6279 --- /dev/null +++ b/egs/chime6/s5_track2/local/generate_chime6_data.sh @@ -0,0 +1 @@ +../../s5_track1/local/generate_chime6_data.sh \ No newline at end of file diff --git a/egs/chime6/s5_track2/local/get_best_error.py b/egs/chime6/s5_track2/local/get_best_error.py new file mode 100755 index 00000000000..b9d8b0d43e7 --- /dev/null +++ b/egs/chime6/s5_track2/local/get_best_error.py @@ -0,0 +1,84 @@ +#! /usr/bin/env python3 +# Copyright 2019 Ashish Arora +# Apache 2.0. +"""This script finds best matching of reference and hypothesis speakers. + For the best matching speakers,it provides the WER for the reference session + (eg:S02) and hypothesis recording (eg: S02_U02)""" + +import itertools +import numpy as np +import argparse +from munkres import Munkres + +def get_args(): + parser = argparse.ArgumentParser( + description="""This script finds best matching of reference and hypothesis speakers. 
+ For the best matching it provides the WER""") + parser.add_argument("WER_dir", type=str, + help="path of WER files") + parser.add_argument("recording_id", type=str, + help="recording_id name") + parser.add_argument("num_speakers", type=str, + help="number of speakers in ref") + args = parser.parse_args() + return args + + +def get_results(filename): + with open(filename) as f: + first_line = f.readline() + parts = first_line.strip().split(',') + total_words = parts[0].split()[-1] + ins = parts[1].split()[0] + deletions = parts[2].split()[0] + sub = parts[3].split()[0] + return total_words, ins, deletions, sub + + +def get_min_wer(recording_id, num_speakers, WER_dir): + best_wer_file = WER_dir + '/' + 'best_wer' + '_' + recording_id + best_wer_writer = open(best_wer_file, 'w') + m = Munkres() + total_error_mat = [0] * num_speakers + all_errors_mat = [0] * num_speakers + for i in range(num_speakers): + total_error_mat[i] = [0] * num_speakers + all_errors_mat[i] = [0] * num_speakers + for i in range(1, num_speakers+1): + for j in range(1, num_speakers+1): + filename = '/wer_' + recording_id + '_' + 'r' + str(i)+ 'h' + str(j) + filename = WER_dir + filename + total_words, ins, deletions, sub = get_results(filename) + ins = int(ins) + deletions = int(deletions) + sub = int(sub) + total_error = ins + deletions + sub + total_error_mat[i-1][j-1]=total_error + all_errors_mat[i-1][j-1]= (total_words, total_error, ins, deletions, sub) + + indexes = m.compute(total_error_mat) + total_errors=total_words=total_ins=total_del=total_sub=0 + spk_order = '(' + for row, column in indexes: + words, errs, ins, deletions, sub = all_errors_mat[row][column] + total_errors += int(errs) + total_words += int(words) + total_ins += int(ins) + total_del += int(deletions) + total_sub += int(sub) + spk_order = spk_order + str(column+1) + ', ' + spk_order = spk_order + ')' + text = "Best error: (#T #E #I #D #S) " + str(total_words)+ ', '+str(total_errors)+ ', '+str(total_ins)+ ', '+str(total_del)+ ', '+str(total_sub) + best_wer_writer.write(" recording_id: "+ recording_id + ' ') + best_wer_writer.write(' best hypothesis speaker order: ' + spk_order + ' ') + best_wer_writer.write(text+ '\n') + best_wer_writer.close() + + +def main(): + args = get_args() + get_min_wer(args.recording_id, int(args.num_speakers), args.WER_dir) + + +if __name__ == '__main__': + main() diff --git a/egs/chime6/s5_track2/local/get_hyp_perspeaker_perarray_file.py b/egs/chime6/s5_track2/local/get_hyp_perspeaker_perarray_file.py new file mode 100755 index 00000000000..7b3e14aaa49 --- /dev/null +++ b/egs/chime6/s5_track2/local/get_hyp_perspeaker_perarray_file.py @@ -0,0 +1,56 @@ +#! /usr/bin/env python +# Copyright 2019 Ashish Arora +# Apache 2.0. 
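Returning to get_best_error.py above: its core step is a Hungarian-algorithm assignment over the num_spkrs x num_spkrs matrix of total errors, which is exactly what Munkres().compute() provides. A toy, self-contained illustration follows (not part of the patch; the matrix values are made up).

from munkres import Munkres

# error_mat[r][h] = ins + del + sub when reference speaker r+1 is scored
# against hypothesis speaker h+1 (toy numbers).
error_mat = [[10, 50, 40],
             [55,  5, 60],
             [45, 70, 15]]
assignment = Munkres().compute([row[:] for row in error_mat])  # pass a copy
total = sum(error_mat[r][h] for r, h in assignment)
print(assignment, total)  # [(0, 0), (1, 1), (2, 2)] 30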
+"""This script splits a kaldi (text) file + into per_array per_session per_speaker hypothesis (text) files""" + +import argparse +def get_args(): + parser = argparse.ArgumentParser( + description="""This script splits a kaldi text file + into per_array per_session per_speaker text files""") + parser.add_argument("input_text_path", type=str, + help="path of text files") + parser.add_argument("output_dir_path", type=str, + help="Output path for per_array per_session per_speaker reference files") + args = parser.parse_args() + return args + + +def main(): + # S09_U06.ENH-4-704588-704738 + args = get_args() + sessionid_micid_speakerid_dict= {} + for line in open(args.input_text_path): + parts = line.strip().split() + uttid_id = parts[0] + temp = uttid_id.strip().split('.')[0] + micid = temp.strip().split('_')[1] + speakerid = uttid_id.strip().split('-')[1] + sessionid = uttid_id.strip().split('_')[0] + sessionid_micid_speakerid = sessionid + '_' + micid + '_' + speakerid + if sessionid_micid_speakerid not in sessionid_micid_speakerid_dict: + sessionid_micid_speakerid_dict[sessionid_micid_speakerid]=list() + sessionid_micid_speakerid_dict[sessionid_micid_speakerid].append(line) + + for sessionid_micid_speakerid in sorted(sessionid_micid_speakerid_dict): + hyp_file = args.output_dir_path + '/' + 'hyp' + '_' + sessionid_micid_speakerid + hyp_writer = open(hyp_file, 'w') + combined_hyp_file = args.output_dir_path + '/' + 'hyp' + '_' + sessionid_micid_speakerid + '_comb' + combined_hyp_writer = open(combined_hyp_file, 'w') + utterances = sessionid_micid_speakerid_dict[sessionid_micid_speakerid] + text = '' + for line in utterances: + parts = line.strip().split() + text = text + ' ' + ' '.join(parts[1:]) + hyp_writer.write(line) + combined_utterance = 'utt' + " " + text + combined_hyp_writer.write(combined_utterance) + combined_hyp_writer.write('\n') + combined_hyp_writer.close() + hyp_writer.close() + + +if __name__ == '__main__': + main() + diff --git a/egs/chime6/s5_track2/local/get_ref_perspeaker_persession_file.py b/egs/chime6/s5_track2/local/get_ref_perspeaker_persession_file.py new file mode 100755 index 00000000000..6b00e29e6b1 --- /dev/null +++ b/egs/chime6/s5_track2/local/get_ref_perspeaker_persession_file.py @@ -0,0 +1,79 @@ +#! /usr/bin/env python +# Copyright 2019 Ashish Arora +# Apache 2.0. 
+"""This script splits a kaldi (text) file + into per_speaker per_session reference (text) file""" + +import argparse + +def get_args(): + parser = argparse.ArgumentParser( + description="""This script splits a kaldi text file + into per_speaker per_session text files""") + parser.add_argument("input_text_path", type=str, + help="path of text file") + parser.add_argument("output_dir_path", type=str, + help="Output path for per_session per_speaker reference files") + args = parser.parse_args() + return args + + +def main(): + args = get_args() + sessionid_speakerid_dict= {} + spkrid_mapping = {} + for line in open(args.input_text_path): + parts = line.strip().split() + uttid_id = parts[0] + speakerid = uttid_id.strip().split('_')[0] + sessionid = uttid_id.strip().split('_')[1] + sessionid_speakerid = sessionid + '_' + speakerid + if sessionid_speakerid not in sessionid_speakerid_dict: + sessionid_speakerid_dict[sessionid_speakerid]=list() + sessionid_speakerid_dict[sessionid_speakerid].append(line) + + spkr_num = 1 + prev_sessionid = '' + for sessionid_speakerid in sorted(sessionid_speakerid_dict): + spkr_id = sessionid_speakerid.strip().split('_')[1] + curr_sessionid = sessionid_speakerid.strip().split('_')[0] + if prev_sessionid != curr_sessionid: + prev_sessionid = curr_sessionid + spkr_num = 1 + if spkr_id not in spkrid_mapping: + spkrid_mapping[spkr_id] = spkr_num + spkr_num += 1 + + for sessionid_speakerid in sorted(sessionid_speakerid_dict): + ref_file = args.output_dir_path + '/ref_' + sessionid_speakerid.split('_')[0] + '_' + str( + spkrid_mapping[sessionid_speakerid.split('_')[1]]) + ref_writer = open(ref_file, 'w') + wc_file = args.output_dir_path + '/ref_wc_' + sessionid_speakerid.split('_')[0] + '_' + str( + spkrid_mapping[sessionid_speakerid.split('_')[1]]) + wc_writer = open(wc_file, 'w') + combined_ref_file = args.output_dir_path + '/ref_' + sessionid_speakerid.split('_')[0] + '_' + str( + spkrid_mapping[sessionid_speakerid.split('_')[1]]) + '_comb' + combined_ref_writer = open(combined_ref_file, 'w') + utterances = sessionid_speakerid_dict[sessionid_speakerid] + text = '' + uttid_wc = 'utt' + for line in utterances: + parts = line.strip().split() + uttid_id = parts[0] + utt_text = ' '.join(parts[1:]) + text = text + ' ' + ' '.join(parts[1:]) + ref_writer.write(line) + length = str(len(utt_text.split())) + uttid_id_len = uttid_id + ":" + length + uttid_wc = uttid_wc + ' ' + uttid_id_len + combined_utterance = 'utt' + " " + text + combined_ref_writer.write(combined_utterance) + combined_ref_writer.write('\n') + combined_ref_writer.close() + wc_writer.write(uttid_wc) + wc_writer.write('\n') + wc_writer.close() + ref_writer.close() + +if __name__ == '__main__': + main() diff --git a/egs/chime6/s5_track2/local/install_pb_chime5.sh b/egs/chime6/s5_track2/local/install_pb_chime5.sh new file mode 120000 index 00000000000..ce5ea5f9f08 --- /dev/null +++ b/egs/chime6/s5_track2/local/install_pb_chime5.sh @@ -0,0 +1 @@ +../../s5_track1/local/install_pb_chime5.sh \ No newline at end of file diff --git a/egs/chime6/s5_track2/local/json2text.py b/egs/chime6/s5_track2/local/json2text.py new file mode 120000 index 00000000000..2aa0a8dd1f9 --- /dev/null +++ b/egs/chime6/s5_track2/local/json2text.py @@ -0,0 +1 @@ +../../s5_track1/local/json2text.py \ No newline at end of file diff --git a/egs/chime6/s5_track2/local/make_noise_list.py b/egs/chime6/s5_track2/local/make_noise_list.py new file mode 120000 index 00000000000..d8dcc7822fc --- /dev/null +++ 
b/egs/chime6/s5_track2/local/make_noise_list.py @@ -0,0 +1 @@ +../../s5_track1/local/make_noise_list.py \ No newline at end of file diff --git a/egs/chime6/s5_track2/local/make_voxceleb1.pl b/egs/chime6/s5_track2/local/make_voxceleb1.pl new file mode 100755 index 00000000000..2268c20ab52 --- /dev/null +++ b/egs/chime6/s5_track2/local/make_voxceleb1.pl @@ -0,0 +1,130 @@ +#!/usr/bin/perl +# +# Copyright 2018 Ewald Enzinger +# 2018 David Snyder +# +# Usage: make_voxceleb1.pl /export/voxceleb1 data/ + +if (@ARGV != 2) { + print STDERR "Usage: $0 \n"; + print STDERR "e.g. $0 /export/voxceleb1 data/\n"; + exit(1); +} + +($data_base, $out_dir) = @ARGV; +my $out_test_dir = "$out_dir/voxceleb1_test"; +my $out_train_dir = "$out_dir/voxceleb1_train"; + +if (system("mkdir -p $out_test_dir") != 0) { + die "Error making directory $out_test_dir"; +} + +if (system("mkdir -p $out_train_dir") != 0) { + die "Error making directory $out_train_dir"; +} + +opendir my $dh, "$data_base/voxceleb1_wav" or die "Cannot open directory: $!"; +my @spkr_dirs = grep {-d "$data_base/voxceleb1_wav/$_" && ! /^\.{1,2}$/} readdir($dh); +closedir $dh; + +if (! -e "$data_base/voxceleb1_test.txt") { + system("wget -O $data_base/voxceleb1_test.txt http://www.openslr.org/resources/49/voxceleb1_test.txt"); +} + +if (! -e "$data_base/vox1_meta.csv") { + system("wget -O $data_base/vox1_meta.csv http://www.openslr.org/resources/49/vox1_meta.csv"); +} + +open(TRIAL_IN, "<", "$data_base/voxceleb1_test.txt") or die "Could not open the verification trials file $data_base/voxceleb1_test.txt"; +open(META_IN, "<", "$data_base/vox1_meta.csv") or die "Could not open the meta data file $data_base/vox1_meta.csv"; +open(SPKR_TEST, ">", "$out_test_dir/utt2spk") or die "Could not open the output file $out_test_dir/utt2spk"; +open(WAV_TEST, ">", "$out_test_dir/wav.scp") or die "Could not open the output file $out_test_dir/wav.scp"; +open(SPKR_TRAIN, ">", "$out_train_dir/utt2spk") or die "Could not open the output file $out_train_dir/utt2spk"; +open(WAV_TRAIN, ">", "$out_train_dir/wav.scp") or die "Could not open the output file $out_train_dir/wav.scp"; +open(TRIAL_OUT, ">", "$out_test_dir/trials") or die "Could not open the output file $out_test_dir/trials"; + +my %id2spkr = (); +while () { + chomp; + my ($vox_id, $spkr_id, $gender, $nation, $set) = split; + $id2spkr{$vox_id} = $spkr_id; +} + +my $test_spkrs = (); +while () { + chomp; + my ($tar_or_non, $path1, $path2) = split; + + # Create entry for left-hand side of trial + my ($spkr_id, $filename) = split('/', $path1); + my $rec_id = substr($filename, 0, 11); + my $segment = substr($filename, 12, 7); + my $utt_id1 = "$spkr_id-$rec_id-$segment"; + $test_spkrs{$spkr_id} = (); + + # Create entry for right-hand side of trial + my ($spkr_id, $filename) = split('/', $path2); + my $rec_id = substr($filename, 0, 11); + my $segment = substr($filename, 12, 7); + my $utt_id2 = "$spkr_id-$rec_id-$segment"; + $test_spkrs{$spkr_id} = (); + + my $target = "nontarget"; + if ($tar_or_non eq "1") { + $target = "target"; + } + print TRIAL_OUT "$utt_id1 $utt_id2 $target\n"; +} + +foreach (@spkr_dirs) { + my $spkr_id = $_; + my $new_spkr_id = $spkr_id; + # If we're using a newer version of VoxCeleb1, we need to "deanonymize" + # the speaker labels. 
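The utterance ids built in make_voxceleb1.pl above follow the usual VoxCeleb1 convention <speaker>-<video-id>-<segment>, where the video id is the first 11 characters of the wav basename and the segment id is the 7 characters after the underscore. A small illustration (not part of the patch; the file and speaker names below are hypothetical):

filename = "Zt6aCn0YZTw_0000012"   # hypothetical VoxCeleb1 wav basename
spkr_id = "id10001"                # hypothetical id after deanonymizing via vox1_meta.csv
rec_id = filename[0:11]            # "Zt6aCn0YZTw"
segment = filename[12:12 + 7]      # "0000012"
utt_id = "%s-%s-%s" % (spkr_id, rec_id, segment)
print(utt_id)                      # "id10001-Zt6aCn0YZTw-0000012"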
+ if (exists $id2spkr{$spkr_id}) { + $new_spkr_id = $id2spkr{$spkr_id}; + } + opendir my $dh, "$data_base/voxceleb1_wav/$spkr_id/" or die "Cannot open directory: $!"; + my @files = map{s/\.[^.]+$//;$_}grep {/\.wav$/} readdir($dh); + closedir $dh; + foreach (@files) { + my $filename = $_; + my $rec_id = substr($filename, 0, 11); + my $segment = substr($filename, 12, 7); + my $wav = "$data_base/voxceleb1_wav/$spkr_id/$filename.wav"; + my $utt_id = "$new_spkr_id-$rec_id-$segment"; + if (exists $test_spkrs{$new_spkr_id}) { + print WAV_TEST "$utt_id", " $wav", "\n"; + print SPKR_TEST "$utt_id", " $new_spkr_id", "\n"; + } else { + print WAV_TRAIN "$utt_id", " $wav", "\n"; + print SPKR_TRAIN "$utt_id", " $new_spkr_id", "\n"; + } + } +} + +close(SPKR_TEST) or die; +close(WAV_TEST) or die; +close(SPKR_TRAIN) or die; +close(WAV_TRAIN) or die; +close(TRIAL_OUT) or die; +close(TRIAL_IN) or die; +close(META_IN) or die; + +if (system( + "utils/utt2spk_to_spk2utt.pl $out_test_dir/utt2spk >$out_test_dir/spk2utt") != 0) { + die "Error creating spk2utt file in directory $out_test_dir"; +} +system("env LC_COLLATE=C utils/fix_data_dir.sh $out_test_dir"); +if (system("env LC_COLLATE=C utils/validate_data_dir.sh --no-text --no-feats $out_test_dir") != 0) { + die "Error validating directory $out_test_dir"; +} + +if (system( + "utils/utt2spk_to_spk2utt.pl $out_train_dir/utt2spk >$out_train_dir/spk2utt") != 0) { + die "Error creating spk2utt file in directory $out_train_dir"; +} +system("env LC_COLLATE=C utils/fix_data_dir.sh $out_train_dir"); +if (system("env LC_COLLATE=C utils/validate_data_dir.sh --no-text --no-feats $out_train_dir") != 0) { + die "Error validating directory $out_train_dir"; +} diff --git a/egs/chime6/s5_track2/local/make_voxceleb2.pl b/egs/chime6/s5_track2/local/make_voxceleb2.pl new file mode 100755 index 00000000000..34c1591eba3 --- /dev/null +++ b/egs/chime6/s5_track2/local/make_voxceleb2.pl @@ -0,0 +1,70 @@ +#!/usr/bin/perl +# +# Copyright 2018 Ewald Enzinger +# +# Usage: make_voxceleb2.pl /export/voxceleb2 dev data/dev +# +# Note: This script requires ffmpeg to be installed and its location included in $PATH. + +if (@ARGV != 3) { + print STDERR "Usage: $0 \n"; + print STDERR "e.g. $0 /export/voxceleb2 dev data/dev\n"; + exit(1); +} + +# Check that ffmpeg is installed. +if (`which ffmpeg` eq "") { + die "Error: this script requires that ffmpeg is installed."; +} + +($data_base, $dataset, $out_dir) = @ARGV; + +if ("$dataset" ne "dev" && "$dataset" ne "test") { + die "dataset parameter must be 'dev' or 'test'!"; +} + +opendir my $dh, "$data_base/$dataset/aac" or die "Cannot open directory: $!"; +my @spkr_dirs = grep {-d "$data_base/$dataset/aac/$_" && ! /^\.{1,2}$/} readdir($dh); +closedir $dh; + +if (system("mkdir -p $out_dir") != 0) { + die "Error making directory $out_dir"; +} + +open(SPKR, ">", "$out_dir/utt2spk") or die "Could not open the output file $out_dir/utt2spk"; +open(WAV, ">", "$out_dir/wav.scp") or die "Could not open the output file $out_dir/wav.scp"; + +foreach (@spkr_dirs) { + my $spkr_id = $_; + + opendir my $dh, "$data_base/$dataset/aac/$spkr_id/" or die "Cannot open directory: $!"; + my @rec_dirs = grep {-d "$data_base/$dataset/aac/$spkr_id/$_" && ! 
/^\.{1,2}$/} readdir($dh); + closedir $dh; + + foreach (@rec_dirs) { + my $rec_id = $_; + + opendir my $dh, "$data_base/$dataset/aac/$spkr_id/$rec_id/" or die "Cannot open directory: $!"; + my @files = map{s/\.[^.]+$//;$_}grep {/\.m4a$/} readdir($dh); + closedir $dh; + + foreach (@files) { + my $name = $_; + my $wav = "ffmpeg -v 8 -i $data_base/$dataset/aac/$spkr_id/$rec_id/$name.m4a -f wav -acodec pcm_s16le -|"; + my $utt_id = "$spkr_id-$rec_id-$name"; + print WAV "$utt_id", " $wav", "\n"; + print SPKR "$utt_id", " $spkr_id", "\n"; + } + } +} +close(SPKR) or die; +close(WAV) or die; + +if (system( + "utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) { + die "Error creating spk2utt file in directory $out_dir"; +} +system("env LC_COLLATE=C utils/fix_data_dir.sh $out_dir"); +if (system("env LC_COLLATE=C utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) { + die "Error validating directory $out_dir"; +} diff --git a/egs/chime6/s5_track2/local/multispeaker_score.sh b/egs/chime6/s5_track2/local/multispeaker_score.sh new file mode 100755 index 00000000000..74e089c4052 --- /dev/null +++ b/egs/chime6/s5_track2/local/multispeaker_score.sh @@ -0,0 +1,144 @@ +#!/bin/bash +# Copyright 2019 Ashish Arora, Yusuke Fujita +# Apache 2.0. +# This script takes a reference and hypothesis text file, and performs +# multispeaker scoring. + +stage=0 +cmd=queue.pl +num_spkrs=4 +num_hyp_spk=4 +datadir=dev_beamformit_dereverb +get_stats=true +all_array=false +declare -a recording_id_array=("S02_U06" "S09_U06") +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [ $# != 3 ]; then + echo "Usage: $0 " + echo "e.g.: $0 data/diarized/text data/dev \ + exp/chain_train_worn_simu_u400k_cleaned_rvb/tdnn1b_sp/decode_dev_xvector_sad/scoring_kaldi/penalty_1.0/10.txt \ + exp/chain_train_worn_simu_u400k_cleaned_rvb/tdnn1b_sp/decode_dev_xvector_sad/scoring_kaldi_multispeaker" + echo "Options: " + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + exit 1; +fi + +ref_file=$1 +hyp_file=$2 +out_dir=$3 + +output_dir=$out_dir/per_speaker_output +wer_dir=$out_dir/per_speaker_wer + +# For dev and evaluation set, we take corresopnding arrays +if [[ ${datadir} == *dev* ]]; then + recording_id_array=("S02_U06" "S09_U06") +fi + +if [[ ${datadir} == *eval* ]]; then + recording_id_array=("S01_U06" "S21_U06") +fi + +if [[ ${datadir} == *dev* ]] && [[ $all_array == "true" ]]; then + recording_id_array=("S02_U01" "S02_U02" "S02_U03" "S02_U04" "S02_U06" "S09_U01" "S09_U02" "S09_U03" "S09_U04" "S09_U06") +fi + +if [[ ${datadir} == *eval* ]] && [[ $all_array == "true" ]]; then + recording_id_array=("S01_U01" "S01_U02" "S01_U03" "S01_U04" "S01_U06" "S21_U01" "S21_U02" "S21_U03" "S21_U04" "S21_U06") +fi + +for f in $ref_file $hyp_file; do + [ ! 
-f $f ] && echo "$0: No such file $f" && exit 1; +done + +if [ $stage -le 0 ]; then + # generate per speaker per session file at paragraph level for the reference" + # and per speaker per array file at paraghaph level for the hypothesis" + mkdir -p $output_dir $wer_dir + local/wer_output_filter < $ref_file > $output_dir/ref_filt.txt + local/wer_output_filter < $hyp_file > $output_dir/hyp_filt.txt + local/get_ref_perspeaker_persession_file.py $output_dir/ref_filt.txt $output_dir + local/get_hyp_perspeaker_perarray_file.py $output_dir/hyp_filt.txt $output_dir +fi + +if [ $stage -le 1 ]; then + if [ $num_hyp_spk -le 3 ]; then + # create dummy per speaker per array hypothesis files for if the" + # perdicted number of speakers by diarization is less than 4 " + for recording_id in "${recording_id_array[@]}"; do + for (( i=$num_hyp_spk+1; i<$num_spkrs+1; i++ )); do + echo 'utt ' > ${dir}/hyp_${recording_id}_${i}_comb + done + done + fi +fi + +if [ $stage -le 2 ]; then + # calculate wer for each ref and hypothesis speaker" + for recording_id in "${recording_id_array[@]}"; do + for (( i=0; i<$((num_spkrs * num_spkrs)); i++ )); do + ind_r=$((i / num_spkrs + 1)) + ind_h=$((i % num_spkrs + 1)) + sessionid="$(echo $recording_id | cut -d'_' -f1)" + + # compute WER with combined texts + compute-wer --text --mode=present ark:${output_dir}/ref_${sessionid}_${ind_r}_comb \ + ark:${output_dir}/hyp_${recording_id}_${ind_h}_comb \ + > $wer_dir/wer_${recording_id}_r${ind_r}h${ind_h} 2>/dev/null + done + + local/get_best_error.py $wer_dir $recording_id $num_spkrs + done +fi + +if [ $stage -le 3 ]; then + # print best word error rate" + # it will print best wer for each recording and each array" + cat $wer_dir/best_wer* > $wer_dir/all.txt + cat $wer_dir/all.txt | local/print_dset_error.py \ + $output_dir/recordinid_spkorder > $wer_dir/array_wer.txt +fi + +if [ $stage -le 4 ]; then + # checks if DP result of total error is equivalent + # to the sum of the individual errors: + local/check_dset_error.py $wer_dir $output_dir +fi + +if [ $stage -le 5 ] && [[ $get_stats == "true" ]]; then + # generate per utterance wer details at utterance level + mkdir -p $wer_dir/wer_details $wer_dir/wer_details/log/ + while read -r line; + do + recording_id=$(echo "$line" | cut -f1 -d ":") + spkorder_str=$(echo "$line" | cut -f2 -d ":") + sessionid=$(echo "$line" | cut -f1 -d "_") + IFS='_' read -r -a spkorder_list <<< "$spkorder_str" + IFS=" " + ind_r=1 + for ind_h in "${spkorder_list[@]}"; do + + $cmd $wer_dir/wer_details/log/${recording_id}_r${ind_r}h${ind_h}_comb.log \ + align-text ark:${output_dir}/ref_${sessionid}_${ind_r}_comb ark:${output_dir}/hyp_${recording_id}_${ind_h}_comb ark:$output_dir/alignment_${sessionid}_r${ind_r}h${ind_h}.txt + + # split hypothesis texts along with reference utterances using word alignment of combined texts + local/gen_aligned_hyp.py $output_dir/alignment_${sessionid}_r${ind_r}h${ind_h}.txt ${output_dir}/ref_wc_${sessionid}_${ind_r} > ${output_dir}/hyp_${recording_id}_r${ind_r}h${ind_h}_ref_segmentation + + ## compute per utterance alignments + $cmd $wer_dir/wer_details/log/${recording_id}_r${ind_r}h${ind_h}_per_utt.log \ + cat ${output_dir}/hyp_${recording_id}_r${ind_r}h${ind_h}_ref_segmentation \| \ + align-text --special-symbol="'***'" ark:${output_dir}/ref_${sessionid}_${ind_r} ark:- ark,t:- \| \ + utils/scoring/wer_per_utt_details.pl --special-symbol "'***'" \| tee $wer_dir/wer_details/per_utt_${recording_id}_r${ind_r}h${ind_h} || exit 1 + + $cmd 
$wer_dir/wer_details/log/${recording_id}_r${ind_r}h${ind_h}_ops.log \ + cat $wer_dir/wer_details/per_utt_${recording_id}_r${ind_r}h${ind_h} \| \ + utils/scoring/wer_ops_details.pl --special-symbol "'***'" \| \ + sort -b -i -k 1,1 -k 4,4rn -k 2,2 -k 3,3 \> $wer_dir/wer_details/ops_${recording_id}_r${ind_r}h${ind_h} || exit 1; + + ind_r=$(( ind_r + 1 )) + done + done < $output_dir/recordinid_spkorder + # done generating per utterance wer details +fi diff --git a/egs/chime6/s5_track2/local/nnet3/compare_wer.sh b/egs/chime6/s5_track2/local/nnet3/compare_wer.sh new file mode 120000 index 00000000000..87041e833d0 --- /dev/null +++ b/egs/chime6/s5_track2/local/nnet3/compare_wer.sh @@ -0,0 +1 @@ +../../../s5_track1/local/nnet3/compare_wer.sh \ No newline at end of file diff --git a/egs/chime6/s5_track2/local/nnet3/decode.sh b/egs/chime6/s5_track2/local/nnet3/decode.sh new file mode 120000 index 00000000000..32595ccedbc --- /dev/null +++ b/egs/chime6/s5_track2/local/nnet3/decode.sh @@ -0,0 +1 @@ +../../../s5_track1/local/nnet3/decode.sh \ No newline at end of file diff --git a/egs/chime6/s5_track2/local/nnet3/run_ivector_common.sh b/egs/chime6/s5_track2/local/nnet3/run_ivector_common.sh new file mode 120000 index 00000000000..4161993c225 --- /dev/null +++ b/egs/chime6/s5_track2/local/nnet3/run_ivector_common.sh @@ -0,0 +1 @@ +../../../s5_track1/local/nnet3/run_ivector_common.sh \ No newline at end of file diff --git a/egs/chime6/s5_track2/local/nnet3/xvector/prepare_feats.sh b/egs/chime6/s5_track2/local/nnet3/xvector/prepare_feats.sh new file mode 100755 index 00000000000..cb8fe2e6326 --- /dev/null +++ b/egs/chime6/s5_track2/local/nnet3/xvector/prepare_feats.sh @@ -0,0 +1,89 @@ +#!/bin/bash +# +# Apache 2.0. + +# This script applies sliding window CMVN and writes the features to disk. +# +# Although this kind of script isn't necessary in speaker recognition recipes, +# it can be helpful in the diarization recipes. The script +# diarization/nnet3/xvector/extract_xvectors.sh extracts x-vectors from very +# short (e.g., 1-2 seconds) segments. Therefore, in order to apply the sliding +# window CMVN in a meaningful way, it must be performed prior to performing +# the subsegmentation. + +nj=40 +cmd="run.pl" +stage=0 +norm_vars=false +center=true +compress=true +cmn_window=300 + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; +if [ $# != 3 ]; then + echo "Usage: $0 " + echo "e.g.: $0 data/train data/train_no_sil exp/make_xvector_features" + echo "Options: " + echo " --nj # number of parallel jobs" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --norm-vars # If true, normalize variances in the sliding window cmvn" + exit 1; +fi + +data_in=$1 +data_out=$2 +dir=$3 + +name=`basename $data_in` + +for f in $data_in/feats.scp ; do + [ ! -f $f ] && echo "$0: No such file $f" && exit 1; +done + +# Set various variables. +mkdir -p $dir/log +mkdir -p $data_out +featdir=$(utils/make_absolute.sh $dir) + +if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $featdir/storage ]; then + utils/create_split_dir.pl \ + /export/b{14,15,16,17}/$USER/kaldi-data/egs/callhome_diarization/v2/xvector-$(date +'%m_%d_%H_%M')/xvector_cmvn_feats/storage $featdir/storage +fi + +for n in $(seq $nj); do + # the next command does nothing unless $featdir/storage/ exists, see + # utils/create_data_link.pl for more info. 
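The sliding-window CMVN that prepare_feats.sh applies (apply-cmvn-sliding with --center=true and --cmn-window=300, variances left alone) can be pictured with the sketch below. It is a simplification and not part of the patch: Kaldi additionally shifts the window near utterance boundaries to keep it full whenever possible, whereas this version simply truncates it.

import numpy as np

def sliding_cmn(feats, window=300):
    """feats: (num_frames, feat_dim). Subtract, per frame, the mean of a window
    of up to `window` frames centered on that frame (truncated at the edges);
    variances are not normalized, mirroring --norm-vars=false."""
    n = len(feats)
    out = np.empty_like(feats, dtype=float)
    half = window // 2
    for t in range(n):
        lo, hi = max(0, t - half), min(n, t + half + 1)
        out[t] = feats[t] - feats[lo:hi].mean(axis=0)
    return out

normed = sliding_cmn(np.random.randn(1000, 40))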
+ utils/create_data_link.pl $featdir/xvector_cmvn_feats_${name}.${n}.ark +done + +cp $data_in/utt2spk $data_out/utt2spk +cp $data_in/spk2utt $data_out/spk2utt +cp $data_in/wav.scp $data_out/wav.scp +for f in $data_in/segments $data_in/segments/vad.scp ; do + [ -f $f ] && cp $f $data_out/`basename $f`; +done + +write_num_frames_opt="--write-num-frames=ark,t:$featdir/log/utt2num_frames.JOB" + +sdata_in=$data_in/split$nj; +utils/split_data.sh $data_in $nj || exit 1; + +$cmd JOB=1:$nj $dir/log/create_xvector_cmvn_feats_${name}.JOB.log \ + apply-cmvn-sliding --norm-vars=false --center=true --cmn-window=$cmn_window \ + scp:${sdata_in}/JOB/feats.scp ark:- \| \ + copy-feats --compress=$compress $write_num_frames_opt ark:- \ + ark,scp:$featdir/xvector_cmvn_feats_${name}.JOB.ark,$featdir/xvector_cmvn_feats_${name}.JOB.scp || exit 1; + +for n in $(seq $nj); do + cat $featdir/xvector_cmvn_feats_${name}.$n.scp || exit 1; +done > ${data_out}/feats.scp || exit 1 + +for n in $(seq $nj); do + cat $featdir/log/utt2num_frames.$n || exit 1; +done > $data_out/utt2num_frames || exit 1 +rm $featdir/log/utt2num_frames.* + +echo "$0: Succeeded creating xvector features for $name" diff --git a/egs/chime6/s5_track2/local/nnet3/xvector/prepare_feats_for_egs.sh b/egs/chime6/s5_track2/local/nnet3/xvector/prepare_feats_for_egs.sh new file mode 100755 index 00000000000..dcdbe1b1593 --- /dev/null +++ b/egs/chime6/s5_track2/local/nnet3/xvector/prepare_feats_for_egs.sh @@ -0,0 +1,83 @@ +#!/bin/bash +# +# Apache 2.0. + +# This script applies sliding window CMVN and removes silence frames. This +# is performed on the raw features prior to generating examples for training +# the x-vector system. Once the training examples are generated, the features +# created by this script can be removed. + +nj=40 +cmd="run.pl" +stage=0 +norm_vars=false +center=true +compress=true +cmn_window=300 + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; +if [ $# != 3 ]; then + echo "Usage: $0 " + echo "e.g.: $0 data/train data/train_no_sil exp/make_xvector_features" + echo "Options: " + echo " --nj # number of parallel jobs" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --norm-vars # If true, normalize variances in the sliding window cmvn" + exit 1; +fi + +data_in=$1 +data_out=$2 +dir=$3 + +name=`basename $data_in` + +for f in $data_in/feats.scp $data_in/vad.scp ; do + [ ! -f $f ] && echo "$0: No such file $f" && exit 1; +done + +# Set various variables. +mkdir -p $dir/log +mkdir -p $data_out +featdir=$(utils/make_absolute.sh $dir) + +if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $featdir/storage ]; then + utils/create_split_dir.pl \ + /export/b{14,15,16,17}/$USER/kaldi-data/egs/callhome_diarization/v2/xvector-$(date +'%m_%d_%H_%M')/xvector_feats/storage $featdir/storage +fi + +for n in $(seq $nj); do + # the next command does nothing unless $featdir/storage/ exists, see + # utils/create_data_link.pl for more info. 
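prepare_feats_for_egs.sh differs from the previous script in one step: after the same sliding-window CMVN it drops non-speech frames with select-voiced-frames, using the per-frame 0/1 decisions in vad.scp. A minimal sketch of that selection (not part of the patch):

import numpy as np

def select_voiced_frames(feats, vad):
    """feats: (num_frames, feat_dim); vad: per-frame 0/1 decisions (1 = voiced).
    Only voiced frames are kept for x-vector training examples."""
    return feats[np.asarray(vad).astype(bool)]

feats = np.random.randn(6, 4)
print(select_voiced_frames(feats, [1, 1, 0, 0, 1, 0]).shape)   # (3, 4)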
+ utils/create_data_link.pl $featdir/xvector_feats_${name}.${n}.ark +done + +cp $data_in/utt2spk $data_out/utt2spk +cp $data_in/spk2utt $data_out/spk2utt +cp $data_in/wav.scp $data_out/wav.scp + +write_num_frames_opt="--write-num-frames=ark,t:$featdir/log/utt2num_frames.JOB" + +sdata_in=$data_in/split$nj; +utils/split_data.sh $data_in $nj || exit 1; + +$cmd JOB=1:$nj $dir/log/create_xvector_feats_${name}.JOB.log \ + apply-cmvn-sliding --norm-vars=false --center=true --cmn-window=$cmn_window \ + scp:${sdata_in}/JOB/feats.scp ark:- \| \ + select-voiced-frames ark:- scp,s,cs:${sdata_in}/JOB/vad.scp ark:- \| \ + copy-feats --compress=$compress $write_num_frames_opt ark:- \ + ark,scp:$featdir/xvector_feats_${name}.JOB.ark,$featdir/xvector_feats_${name}.JOB.scp || exit 1; + +for n in $(seq $nj); do + cat $featdir/xvector_feats_${name}.$n.scp || exit 1; +done > ${data_out}/feats.scp || exit 1 + +for n in $(seq $nj); do + cat $featdir/log/utt2num_frames.$n || exit 1; +done > $data_out/utt2num_frames || exit 1 +rm $featdir/log/utt2num_frames.* + +echo "$0: Succeeded creating xvector features for $name" diff --git a/egs/chime6/s5_track2/local/nnet3/xvector/run_xvector.sh b/egs/chime6/s5_track2/local/nnet3/xvector/run_xvector.sh new file mode 120000 index 00000000000..585b63fd2dd --- /dev/null +++ b/egs/chime6/s5_track2/local/nnet3/xvector/run_xvector.sh @@ -0,0 +1 @@ +tuning/run_xvector_1a.sh \ No newline at end of file diff --git a/egs/chime6/s5_track2/local/nnet3/xvector/tuning/run_xvector_1a.sh b/egs/chime6/s5_track2/local/nnet3/xvector/tuning/run_xvector_1a.sh new file mode 100755 index 00000000000..94fc7e7682f --- /dev/null +++ b/egs/chime6/s5_track2/local/nnet3/xvector/tuning/run_xvector_1a.sh @@ -0,0 +1,149 @@ +#!/bin/bash +# Copyright 2018 David Snyder +# 2018 Johns Hopkins University (Author: Daniel Garcia-Romero) +# 2018 Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0. + +# This script trains the x-vector DNN. The recipe is similar to the one +# described in "Diarization is Hard: Some Experiences and Lessons Learned +# for the JHU Team in the Inaugural DIHARD Challenge" by Sell et al. + +. ./cmd.sh +set -e + +stage=1 +train_stage=-1 +use_gpu=true +remove_egs=false + +data=data/train +nnet_dir=exp/xvector_nnet_1a/ +egs_dir=exp/xvector_nnet_1a/egs + +. ./path.sh +. ./cmd.sh +. ./utils/parse_options.sh + +num_pdfs=$(awk '{print $2}' $data/utt2spk | sort | uniq -c | wc -l) + +# Now we create the nnet examples using sid/nnet3/xvector/get_egs.sh. +# The argument --num-repeats is related to the number of times a speaker +# repeats per archive. If it seems like you're getting too many archives +# (e.g., more than 200) try increasing the --frames-per-iter option. The +# arguments --min-frames-per-chunk and --max-frames-per-chunk specify the +# minimum and maximum length (in terms of number of frames) of the features +# in the examples. +# +# To make sense of the egs script, it may be necessary to put an "exit 1" +# command immediately after stage 3. Then, inspect +# exp//egs/temp/ranges.* . The ranges files specify the examples that +# will be created, and which archives they will be stored in. Each line of +# ranges.* has the following form: +# +# For example: +# 100304-f-sre2006-kacg-A 1 2 4079 881 23 + +# If you're satisfied with the number of archives (e.g., 50-150 archives is +# reasonable) and with the number of examples per speaker (e.g., 1000-5000 +# is reasonable) then you can let the script continue to the later stages. 
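As a hedged aside on the ranges.* example quoted above ("100304-f-sre2006-kacg-A 1 2 4079 881 23"): the snippet below shows one way to split such a line for inspection. The field names are my reading of the sid/nnet3 x-vector egs format and should be checked against sid/nnet3/xvector/get_egs.sh; they are assumptions, not part of the patch.

fields = "100304-f-sre2006-kacg-A 1 2 4079 881 23".split()
utt_id = fields[0]
archive_indices = fields[1:3]                              # egs archive assignment (assumed)
start_frame, num_frames = int(fields[3]), int(fields[4])   # chunk position and length (assumed)
spk_label = int(fields[5])                                 # integer speaker label used as the training target (assumed)
print(utt_id, archive_indices, start_frame, num_frames, spk_label)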
+# Otherwise, try increasing or decreasing the --num-repeats option. You might +# need to fiddle with --frames-per-iter. Increasing this value decreases the +# the number of archives and increases the number of examples per archive. +# Decreasing this value increases the number of archives, while decreasing the +# number of examples per archive. +if [ $stage -le 6 ]; then + echo "$0: Getting neural network training egs"; + # dump egs. + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b{03,04,05,06}/$USER/kaldi-data/egs/callhome_diarization/v2/xvector-$(date +'%m_%d_%H_%M')/$egs_dir/storage $egs_dir/storage + fi + sid/nnet3/xvector/get_egs.sh --cmd "$train_cmd" \ + --nj 8 \ + --stage 0 \ + --frames-per-iter 1000000000 \ + --frames-per-iter-diagnostic 500000 \ + --min-frames-per-chunk 200 \ + --max-frames-per-chunk 400 \ + --num-diagnostic-archives 3 \ + --num-repeats 40 \ + "$data" $egs_dir +fi + +if [ $stage -le 7 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + num_targets=$(wc -w $egs_dir/pdf2num | awk '{print $1}') + feat_dim=$(cat $egs_dir/info/feat_dim) + + # This chunk-size corresponds to the maximum number of frames the + # stats layer is able to pool over. In this script, it corresponds + # to 4 seconds. If the input recording is greater than 4 seconds, + # we will compute multiple xvectors from the same recording and average + # to produce the final xvector. + max_chunk_size=400 + + # The smallest number of frames we're comfortable computing an xvector from. + # Note that the hard minimum is given by the left and right context of the + # frame-level layers. + min_chunk_size=20 + mkdir -p $nnet_dir/configs + cat < $nnet_dir/configs/network.xconfig + # please note that it is important to have input layer with the name=input + + # The frame-level layers + input dim=${feat_dim} name=input + relu-batchnorm-layer name=tdnn1 input=Append(-2,-1,0,1,2) dim=512 + relu-batchnorm-layer name=tdnn2 input=Append(-2,0,2) dim=512 + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=512 + relu-batchnorm-layer name=tdnn4 dim=512 + relu-batchnorm-layer name=tdnn5 dim=1500 + + # The stats pooling layer. Layers after this are segment-level. + # In the config below, the first and last argument (0, and ${max_chunk_size}) + # means that we pool over an input segment starting at frame 0 + # and ending at frame ${max_chunk_size} or earlier. The other arguments (1:1) + # mean that no subsampling is performed. + stats-layer name=stats config=mean+stddev(0:1:1:${max_chunk_size}) + + # This is where we usually extract the embedding (aka xvector) from. 
+ relu-batchnorm-layer name=tdnn6 dim=128 input=stats + output-layer name=output include-log-softmax=true dim=${num_targets} +EOF + + steps/nnet3/xconfig_to_configs.py \ + --xconfig-file $nnet_dir/configs/network.xconfig \ + --config-dir $nnet_dir/configs/ + cp $nnet_dir/configs/final.config $nnet_dir/nnet.config + + # These three files will be used by sid/nnet3/xvector/extract_xvectors.sh + echo "output-node name=output input=tdnn6.affine" > $nnet_dir/extract.config + echo "$max_chunk_size" > $nnet_dir/max_chunk_size + echo "$min_chunk_size" > $nnet_dir/min_chunk_size +fi + +dropout_schedule='0,0@0.20,0.1@0.50,0' +srand=123 +if [ $stage -le 8 ]; then + steps/nnet3/train_raw_dnn.py --stage=$train_stage \ + --cmd="$train_cmd" \ + --trainer.optimization.proportional-shrink 10 \ + --trainer.optimization.momentum=0.5 \ + --trainer.optimization.num-jobs-initial=3 \ + --trainer.optimization.num-jobs-final=8 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.minibatch-size=64 \ + --trainer.srand=$srand \ + --trainer.max-param-change=2 \ + --trainer.num-epochs=3 \ + --trainer.dropout-schedule="$dropout_schedule" \ + --trainer.shuffle-buffer-size=1000 \ + --egs.frames-per-eg=1 \ + --egs.dir="$egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --cleanup.preserve-model-interval=10 \ + --use-gpu=true \ + --dir=$nnet_dir || exit 1; +fi + +exit 0; diff --git a/egs/chime6/s5_track2/local/prepare_data.sh b/egs/chime6/s5_track2/local/prepare_data.sh new file mode 100755 index 00000000000..c6b8121dab0 --- /dev/null +++ b/egs/chime6/s5_track2/local/prepare_data.sh @@ -0,0 +1,149 @@ +#!/bin/bash +# +# Copyright 2017 Johns Hopkins University (Author: Shinji Watanabe, Yenda Trmal) +# Apache 2.0 + +# Begin configuration section. +mictype=worn # worn, ref or others +cleanup=true +train=true + +# End configuration section +. ./utils/parse_options.sh # accept options.. you can run this run.sh with the + +. ./path.sh + +echo >&2 "$0" "$@" +if [ $# -ne 3 ] ; then + echo >&2 "$0" "$@" + echo >&2 "$0: Error: wrong number of arguments" + echo -e >&2 "Usage:\n $0 [opts] " + echo -e >&2 "eg:\n $0 /corpora/chime5/audio/train /corpora/chime5/transcriptions/train data/train" + exit 1 +fi + +set -e -o pipefail + +adir=$1 +jdir=$2 +dir=$3 + +json_count=$(find -L $jdir -name "*.json" | wc -l) +wav_count=$(find -L $adir -name "*.wav" | wc -l) + +if [ "$json_count" -eq 0 ]; then + echo >&2 "We expect that the directory $jdir will contain json files." + echo >&2 "That implies you have supplied a wrong path to the data." + exit 1 +fi +if [ "$wav_count" -eq 0 ]; then + echo >&2 "We expect that the directory $adir will contain wav files." + echo >&2 "That implies you have supplied a wrong path to the data." + exit 1 +fi + +echo "$0: Converting transcription to text" + +mkdir -p $dir +for file in $jdir/*json; do + ./local/json2text.py --mictype $mictype $file +done | \ + sed -e "s/\[inaudible[- 0-9]*\]/[inaudible]/g" |\ + sed -e 's/ - / /g' |\ + sed -e 's/mm-/mm/g' > $dir/text.orig + +echo "$0: Creating datadir $dir for type=\"$mictype\"" + +if [ $mictype == "worn" ]; then + # convert the filenames to wav.scp format, use the basename of the file + # as a the wav.scp key, add .L and .R for left and right channel + # i.e. 
each file will have two entries (left and right channel) + find -L $adir -name "S[0-9]*_P[0-9]*.wav" | \ + perl -ne '{ + chomp; + $path = $_; + next unless $path; + @F = split "/", $path; + ($f = $F[@F-1]) =~ s/.wav//; + @F = split "_", $f; + print "${F[1]}_${F[0]}.L sox $path -t wav - remix 1 |\n"; + print "${F[1]}_${F[0]}.R sox $path -t wav - remix 2 |\n"; + }' | sort > $dir/wav.scp + + # generate the transcripts for both left and right channel + # from the original transcript in the form + # P09_S03-0006072-0006147 gimme the baker + # create left and right channel transcript + # P09_S03.L-0006072-0006147 gimme the baker + # P09_S03.R-0006072-0006147 gimme the baker + sed -n 's/ *$//; h; s/-/\.L-/p; g; s/-/\.R-/p' $dir/text.orig | sort > $dir/text +elif [ $mictype == "ref" ]; then + # fixed reference array + + # first get a text, which will be used to extract reference arrays + perl -ne 's/-/.ENH-/;print;' $dir/text.orig | sort > $dir/text + + find -L $adir | grep "\.wav" | sort > $dir/wav.flist + # following command provide the argument for grep to extract only reference arrays + #grep `cut -f 1 -d"-" $dir/text | awk -F"_" '{print $2 "_" $3}' | sed -e "s/\.ENH//" | sort | uniq | sed -e "s/^/ -e /" | tr "\n" " "` $dir/wav.flist > $dir/wav.flist2 + paste -d" " \ + <(awk -F "/" '{print $NF}' $dir/wav.flist | sed -e "s/\.wav/.ENH/") \ + $dir/wav.flist | sort > $dir/wav.scp +else + # array mic case + # convert the filenames to wav.scp format, use the basename of the file + # as a the wav.scp key + find -L $adir -name "*.wav" -ipath "*${mictype}*" |\ + perl -ne '$p=$_;chomp $_;@F=split "/";$F[$#F]=~s/\.wav//;print "$F[$#F] $p";' |\ + sort -u > $dir/wav.scp + + # convert the transcripts from + # P09_S03-0006072-0006147 gimme the baker + # to the per-channel transcripts + # P09_S03_U01_NOLOCATION.CH1-0006072-0006147 gimme the baker + # P09_S03_U01_NOLOCATION.CH2-0006072-0006147 gimme the baker + # P09_S03_U01_NOLOCATION.CH3-0006072-0006147 gimme the baker + # P09_S03_U01_NOLOCATION.CH4-0006072-0006147 gimme the baker + perl -ne '$l=$_; + for($i=1; $i<=4; $i++) { + ($x=$l)=~ s/-/.CH\Q$i\E-/; + print $x;}' $dir/text.orig | sort > $dir/text + +fi +$cleanup && rm -f $dir/text.* $dir/wav.scp.* $dir/wav.flist + +# Prepare 'segments', 'utt2spk', 'spk2utt' +if [ $mictype == "worn" ]; then + cut -d" " -f 1 $dir/text | \ + awk -F"-" '{printf("%s %s %08.2f %08.2f\n", $0, $1, $2/100.0, $3/100.0)}' |\ + sed -e "s/_[A-Z]*\././2" \ + > $dir/segments +elif [ $mictype == "ref" ]; then + cut -d" " -f 1 $dir/text | \ + awk -F"-" '{printf("%s %s %08.2f %08.2f\n", $0, $1, $2/100.0, $3/100.0)}' |\ + sed -e "s/_[A-Z]*\././2" |\ + sed -e "s/ P.._/ /" > $dir/segments +else + cut -d" " -f 1 $dir/text | \ + awk -F"-" '{printf("%s %s %08.2f %08.2f\n", $0, $1, $2/100.0, $3/100.0)}' |\ + sed -e "s/_[A-Z]*\././2" |\ + sed -e 's/ P.._/ /' > $dir/segments +fi +cut -f 1 -d ' ' $dir/segments | \ + perl -ne 'chomp;$utt=$_;s/_.*//;print "$utt $_\n";' > $dir/utt2spk + +utils/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt + +if [ $train != 'true' ]; then + # For scoring the final system, we need the original utt2spk + # and text file. So we keep them with the extension .bak here + # so that they don't affect the validate_data_dir steps in + # the intermediate steps. + for file in text utt2spk spk2utt segments; do + mv $dir/$file $dir/$file.bak + done + + # For dev and eval data, prepare pseudo utt2spk. 
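The transcript keys manipulated above encode the timing directly: in "P09_S03-0006072-0006147" the last two fields are start and end times in units of 10 ms, which is why the awk commands above divide by 100, and the worn-microphone branch duplicates each key into .L and .R channel variants. A small illustration (not part of the patch):

utt = "P09_S03-0006072-0006147"
left = utt.replace("-", ".L-", 1)     # P09_S03.L-0006072-0006147 (left channel key)
right = utt.replace("-", ".R-", 1)    # P09_S03.R-0006072-0006147 (right channel key)
base, start, end = utt.rsplit("-", 2)
print(base, int(start) / 100.0, int(end) / 100.0)   # P09_S03 60.72 61.47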
+ awk '{print $1, $1}' $dir/wav.scp > $dir/utt2spk + utils/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt +fi diff --git a/egs/chime6/s5_track2/local/prepare_dict.sh b/egs/chime6/s5_track2/local/prepare_dict.sh new file mode 120000 index 00000000000..ada30947463 --- /dev/null +++ b/egs/chime6/s5_track2/local/prepare_dict.sh @@ -0,0 +1 @@ +../../s5_track1/local/prepare_dict.sh \ No newline at end of file diff --git a/egs/chime6/s5_track2/local/print_dset_error.py b/egs/chime6/s5_track2/local/print_dset_error.py new file mode 100755 index 00000000000..1a7fd4ff365 --- /dev/null +++ b/egs/chime6/s5_track2/local/print_dset_error.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python3 +# Copyright 2019 Ashish Arora +# Apache 2.0. + +import sys, io +import string +infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') +output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') +spkorder_writer = open(sys.argv[1],'w', encoding='utf8') +array_id_error_dict={} +for line in infile: + toks = line.strip().split() + recordingid = toks[1] + total_words = toks[-5][:-1] + total_errors = toks[-4][:-1] + total_ins = toks[-3][:-1] + total_del = toks[-2][:-1] + total_sub = toks[-1] + spk_order = toks[6][1] + '_' + toks[7][0] + '_' + toks[8][0] + '_' + toks[9][0] + spkorder_writer.write(recordingid + ':' + spk_order + '\n') + arrayid=recordingid.strip().split('_')[1] + if arrayid not in array_id_error_dict: + array_id_error_dict[arrayid]=[0]*5 + array_id_error_dict[arrayid][0]+=int(total_words) + array_id_error_dict[arrayid][1]+=int(total_errors) + array_id_error_dict[arrayid][2]+=int(total_ins) + array_id_error_dict[arrayid][3]+=int(total_del) + array_id_error_dict[arrayid][4]+=int(total_sub) + + +for arrayid in sorted(array_id_error_dict): + wer = float(array_id_error_dict[arrayid][1])/float(array_id_error_dict[arrayid][0])*100 + wer_detail = "%WER {0:5.2f} [ {1} / {2}, {3} ins, {4} del, {5} sub ]".format(wer, array_id_error_dict[arrayid][0], array_id_error_dict[arrayid][1], array_id_error_dict[arrayid][2], array_id_error_dict[arrayid][3], array_id_error_dict[arrayid][4]) + output.write(arrayid + ' ' + wer_detail + '\n') + diff --git a/egs/chime6/s5_track2/local/reverberate_lat_dir.sh b/egs/chime6/s5_track2/local/reverberate_lat_dir.sh new file mode 120000 index 00000000000..57302268f6d --- /dev/null +++ b/egs/chime6/s5_track2/local/reverberate_lat_dir.sh @@ -0,0 +1 @@ +../../s5_track1/local/reverberate_lat_dir.sh \ No newline at end of file diff --git a/egs/chime6/s5_track2/local/run_beamformit.sh b/egs/chime6/s5_track2/local/run_beamformit.sh new file mode 120000 index 00000000000..832a16e3ba7 --- /dev/null +++ b/egs/chime6/s5_track2/local/run_beamformit.sh @@ -0,0 +1 @@ +../../s5_track1/local/run_beamformit.sh \ No newline at end of file diff --git a/egs/chime6/s5_track2/local/run_ivector_common.sh b/egs/chime6/s5_track2/local/run_ivector_common.sh new file mode 120000 index 00000000000..df7fca84335 --- /dev/null +++ b/egs/chime6/s5_track2/local/run_ivector_common.sh @@ -0,0 +1 @@ +../../s5_track1/local/nnet3/run_ivector_common.sh \ No newline at end of file diff --git a/egs/chime6/s5_track2/local/run_wpe.py b/egs/chime6/s5_track2/local/run_wpe.py new file mode 120000 index 00000000000..6621607c932 --- /dev/null +++ b/egs/chime6/s5_track2/local/run_wpe.py @@ -0,0 +1 @@ +../../s5_track1/local/run_wpe.py \ No newline at end of file diff --git a/egs/chime6/s5_track2/local/run_wpe.sh b/egs/chime6/s5_track2/local/run_wpe.sh new file mode 120000 index 00000000000..187080e62e4 --- /dev/null +++ 
b/egs/chime6/s5_track2/local/run_wpe.sh @@ -0,0 +1 @@ +../../s5_track1/local/run_wpe.sh \ No newline at end of file diff --git a/egs/chime6/s5_track2/local/score.sh b/egs/chime6/s5_track2/local/score.sh new file mode 120000 index 00000000000..6a200b42ed3 --- /dev/null +++ b/egs/chime6/s5_track2/local/score.sh @@ -0,0 +1 @@ +../steps/scoring/score_kaldi_wer.sh \ No newline at end of file diff --git a/egs/chime6/s5_track2/local/score_for_submit.sh b/egs/chime6/s5_track2/local/score_for_submit.sh new file mode 100755 index 00000000000..087a078316a --- /dev/null +++ b/egs/chime6/s5_track2/local/score_for_submit.sh @@ -0,0 +1,102 @@ +#!/bin/bash +# Apache 2.0 +# +# This script provides CHiME-6 challenge track 2 submission scores. +# It calculates the best search parameter configurations by using the dev set +# and provides wer for dev and eval + +cmd=run.pl +stage=0 +word_ins_penalty=0.0,0.5,1.0 +min_lmwt=7 +max_lmwt=17 +dev_decodedir=exp/chain_train_worn_simu_u400k_cleaned_rvb/tdnn1b_sp/decode_dev_beamformit_dereverb_diarized_2stage +eval_decodedir=exp/chain_train_worn_simu_u400k_cleaned_rvb/tdnn1b_sp/decode_eval_beamformit_dereverb_diarized_2stage +dev_datadir=dev_beamformit_dereverb_diarized_hires +eval_datadir=eval_beamformit_dereverb_diarized_hires + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh +. ./cmd.sh +. ./utils/parse_options.sh + +if [ $# -ne 0 ]; then + echo "Usage: $0 [--cmd (run.pl|queue.pl...)]" + echo "This script provides CHiME-6 challenge submission scores" + echo " Options:" + echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." + echo " --dev_decodedir # dev set decoding directory" + echo " --eval_decodedir # eval set decoding directory" + echo " --dev_datadir # dev set data directory" + echo " --eval_datadir # eval set data directory" + echo " --min_lmwt # minumum LM-weight for lattice rescoring " + echo " --max_lmwt # maximum LM-weight for lattice rescoring " + + exit 1; +fi + +if [ $stage -le 1 ]; then + # obtaining multi speaker WER for all lmwt and wip + for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do + for LMWT in $(seq $min_lmwt $max_lmwt); do + local/multispeaker_score.sh --cmd "$cmd" \ + --datadir $dev_datadir --get_stats false data/$dev_datadir/text \ + $dev_decodedir/scoring_kaldi/penalty_$wip/$LMWT.txt \ + $dev_decodedir/scoring_kaldi_multispeaker/penalty_$wip/$LMWT + done + done +fi + +if [ $stage -le 2 ]; then + # obtaining best lmwt, wip and wer + # adding /dev/null to the command list below forces grep to output the filename + mkdir -p $dev_decodedir/scoring_kaldi_multispeaker + grep WER $dev_decodedir/scoring_kaldi_multispeaker/penalty_*/*/per_speaker_wer/array_wer.txt /dev/null \ + | utils/best_wer.sh >& $dev_decodedir/scoring_kaldi_multispeaker/best_wer + + best_wer_file=$(awk '{print $NF}' $dev_decodedir/scoring_kaldi_multispeaker/best_wer) + best_array=$(echo $best_wer_file | awk -F: '{N=NF; print $N}') + best_lmwt=$(echo $best_wer_file | awk -F/ '{N=NF-2; print $N}') + best_wip=$(echo $best_wer_file | awk -F_ '{N=NF-3; print $N}' | awk -F/ '{N=NF-2; print $N}') + + # printing and storing best lmwt, best_array and wip + echo "best array: $best_array" + echo "best LM weight: $best_lmwt" + echo "best insertion penalty weight: $best_wip" + + echo $best_lmwt > $dev_decodedir/scoring_kaldi_multispeaker/lmwt + echo $best_wip > $dev_decodedir/scoring_kaldi_multispeaker/wip + echo $best_array > $dev_decodedir/scoring_kaldi_multispeaker/best_array +fi + +if [ $stage -le 3 ]; then + # 
obtaining per utterance stats for dev + local/multispeaker_score.sh --cmd "$cmd" \ + --datadir $dev_datadir data/$dev_datadir/text \ + $dev_decodedir/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \ + $dev_decodedir/scoring_kaldi_multispeaker/penalty_$best_wip/$best_lmwt/ +fi + +if [ $stage -le 4 ]; then + # obtaining per utterance stats for eval + local/multispeaker_score.sh --cmd "$cmd" \ + --datadir $eval_datadir data/$eval_datadir/text \ + $eval_decodedir/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \ + $eval_decodedir/scoring_kaldi_multispeaker/penalty_$best_wip/$best_lmwt/ +fi + +if [ $stage -le 5 ]; then + # obtaining eval wer corresponding to best lmwt, best_array and wip of dev + best_array="$(cat $dev_decodedir/scoring_kaldi_multispeaker/best_array)" + best_lmwt="$(cat $dev_decodedir/scoring_kaldi_multispeaker/lmwt)" + best_wip="$(cat $dev_decodedir/scoring_kaldi_multispeaker/wip)" + + grep WER $eval_decodedir/scoring_kaldi_multispeaker/penalty_$best_wip/$best_lmwt/per_speaker_wer/array_wer.txt /dev/null \ + | grep $best_array | utils/best_wer.sh >& $eval_decodedir/scoring_kaldi_multispeaker/best_wer + + # printing dev and eval wer + echo "Dev: $(<$dev_decodedir/scoring_kaldi_multispeaker/best_wer)" | cut -d " " -f 1-15 + echo "Eval: $(<$eval_decodedir/scoring_kaldi_multispeaker/best_wer)" | cut -d " " -f 1-14 +fi + diff --git a/egs/chime6/s5_track2/local/segmentation/detect_speech_activity.sh b/egs/chime6/s5_track2/local/segmentation/detect_speech_activity.sh new file mode 100755 index 00000000000..91d52b39269 --- /dev/null +++ b/egs/chime6/s5_track2/local/segmentation/detect_speech_activity.sh @@ -0,0 +1,217 @@ +#!/bin/bash + +# Copyright 2016-17 Vimal Manohar +# 2017 Nagendra Kumar Goel +# Apache 2.0. + +# This script does nnet3-based speech activity detection given an input +# kaldi data directory and outputs a segmented kaldi data directory. +# This script can also do music detection and other similar segmentation +# using appropriate options such as --output-name output-music. + +set -e +set -o pipefail +set -u + +if [ -f ./path.sh ]; then . ./path.sh; fi + +affix= # Affix for the segmentation +nj=32 +cmd=queue.pl +stage=-1 + +# Feature options (Must match training) +mfcc_config=conf/mfcc_hires.conf +feat_affix= # Affix for the type of feature used + +output_name=output # The output node in the network +sad_name=sad # Base name for the directory storing the computed loglikes + # Can be music for music detection +segmentation_name=segmentation # Base name for the directory doing segmentation + # Can be segmentation_music for music detection + +# SAD network config +iter=final # Model iteration to use + +# Contexts must ideally match training for LSTM models, but +# may not necessarily for stats components +extra_left_context=0 # Set to some large value, typically 40 for LSTM (must match training) +extra_right_context=0 +extra_left_context_initial=-1 +extra_right_context_final=-1 +frames_per_chunk=150 + +# Decoding options +graph_opts="--min-silence-duration=0.03 --min-speech-duration=0.3 --max-speech-duration=10.0" +acwt=0.3 + +# These _in__weight represent the fraction of probability +# to transfer to class. +# e.g. --speech-in-sil-weight=0.0 --garbage-in-sil-weight=0.0 --sil-in-speech-weight=0.0 --garbage-in-speech-weight=0.3 +transform_probs_opts="" + +# Postprocessing options +segment_padding=0.2 # Duration (in seconds) of padding added to segments +min_segment_dur=0 # Minimum duration (in seconds) required for a segment to be included + # This is before any padding. 
Segments shorter than this duration will be removed. + # This is an alternative to --min-speech-duration above. +merge_consecutive_max_dur=0 # Merge consecutive segments as long as the merged segment is no longer than this many + # seconds. The segments are only merged if their boundaries are touching. + # This is after padding by --segment-padding seconds. + # 0 means do not merge. Use 'inf' to not limit the duration. + +echo $* + +. utils/parse_options.sh + +if [ $# -ne 5 ]; then + echo "This script does nnet3-based speech activity detection given an input kaldi " + echo "data directory and outputs an output kaldi data directory." + echo "See script for details of the options to be supplied." + echo "Usage: $0 " + echo " e.g.: $0 ~/workspace/egs/ami/s5b/data/sdm1/dev exp/nnet3_sad_snr/nnet_tdnn_j_n4 \\" + echo " mfcc_hires exp/segmentation_sad_snr/nnet_tdnn_j_n4 data/ami_sdm1_dev" + echo "" + echo "Options: " + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --nj # number of parallel jobs to run." + echo " --stage # stage to do partial re-run from." + echo " --convert-data-dir-to-whole # If true, the input data directory is " + echo " # first converted to whole data directory (i.e. whole recordings) " + echo " # and segmentation is done on that." + echo " # If false, then the original segments are " + echo " # retained and they are split into sub-segments." + echo " --output-name # The output node in the network" + echo " --extra-left-context # Set to some large value, typically 40 for LSTM (must match training)" + echo " --extra-right-context # For BLSTM or statistics pooling" + exit 1 +fi + +src_data_dir=$1 # The input data directory that needs to be segmented. + # If convert_data_dir_to_whole is true, any segments in that will be ignored. +sad_nnet_dir=$2 # The SAD neural network +mfcc_dir=$3 # The directory to store the features +dir=$4 # Work directory +data_dir=$5 # The output data directory will be ${data_dir}_seg + +affix=${affix:+_$affix} +feat_affix=${feat_affix:+_$feat_affix} + +data_id=`basename $data_dir` +sad_dir=${dir}/${sad_name}${affix}_${data_id}${feat_affix} +seg_dir=${dir}/${segmentation_name}${affix}_${data_id}${feat_affix} +test_data_dir=data/${data_id}${feat_affix} + +############################################################################### +## Forward pass through the network network and dump the log-likelihoods. +############################################################################### + +frame_subsampling_factor=1 +if [ -f $sad_nnet_dir/frame_subsampling_factor ]; then + frame_subsampling_factor=$(cat $sad_nnet_dir/frame_subsampling_factor) +fi + +mkdir -p $dir +if [ $stage -le 1 ]; then + if [ "$(readlink -f $sad_nnet_dir)" != "$(readlink -f $dir)" ]; then + cp $sad_nnet_dir/cmvn_opts $dir || exit 1 + fi + + ######################################################################## + ## Initialize neural network for decoding using the output $output_name + ######################################################################## + + if [ ! -z "$output_name" ] && [ "$output_name" != output ]; then + $cmd $dir/log/get_nnet_${output_name}.log \ + nnet3-copy --edits="rename-node old-name=$output_name new-name=output" \ + $sad_nnet_dir/$iter.raw $dir/${iter}_${output_name}.raw || exit 1 + iter=${iter}_${output_name} + else + if ! 
diff $sad_nnet_dir/$iter.raw $dir/$iter.raw; then + cp $sad_nnet_dir/$iter.raw $dir/ + fi + fi + + steps/nnet3/compute_output.sh --nj $nj --cmd "$cmd" \ + --iter ${iter} \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk $frames_per_chunk --apply-exp true \ + --frame-subsampling-factor $frame_subsampling_factor \ + ${test_data_dir} $dir $sad_dir || exit 1 +fi + +############################################################################### +## Prepare FST we search to make speech/silence decisions. +############################################################################### + +utils/data/get_utt2dur.sh --nj $nj --cmd "$cmd" $test_data_dir || exit 1 +frame_shift=$(utils/data/get_frame_shift.sh $test_data_dir) || exit 1 + +graph_dir=${dir}/graph_${output_name} +if [ $stage -le 2 ]; then + mkdir -p $graph_dir + + # 1 for silence and 2 for speech + cat < $graph_dir/words.txt + 0 +silence 1 +speech 2 +EOF + + $cmd $graph_dir/log/make_graph.log \ + steps/segmentation/internal/prepare_sad_graph.py $graph_opts \ + --frame-shift=$(perl -e "print $frame_shift * $frame_subsampling_factor") - \| \ + fstcompile --isymbols=$graph_dir/words.txt --osymbols=$graph_dir/words.txt '>' \ + $graph_dir/HCLG.fst +fi + +############################################################################### +## Do Viterbi decoding to create per-frame alignments. +############################################################################### + +post_vec=$sad_nnet_dir/post_${output_name}.vec +if [ ! -f $sad_nnet_dir/post_${output_name}.vec ]; then + if [ ! -f $sad_nnet_dir/post_${output_name}.txt ]; then + echo "$0: Could not find $sad_nnet_dir/post_${output_name}.vec. " + echo "Re-run the corresponding stage in the training script possibly " + echo "with --compute-average-posteriors=true or compute the priors " + echo "from the training labels" + exit 1 + else + post_vec=$sad_nnet_dir/post_${output_name}.txt + fi +fi + +mkdir -p $seg_dir +if [ $stage -le 3 ]; then + steps/segmentation/internal/get_transform_probs_mat.py \ + --priors="$post_vec" $transform_probs_opts > $seg_dir/transform_probs.mat + + steps/segmentation/decode_sad.sh --acwt $acwt --cmd "$cmd" \ + --nj $nj \ + --transform "$seg_dir/transform_probs.mat" \ + $graph_dir $sad_dir $seg_dir +fi + +############################################################################### +## Post-process segmentation to create kaldi data directory. 
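Before the post-processing stage that follows, it may help to picture what --segment-padding, --min-segment-dur and --merge-consecutive-max-dur do to the decoded speech segments. The sketch below is a simplification for intuition only (not part of the patch, and not equivalent to steps/segmentation/post_process_sad_to_segments.sh): drop segments shorter than the minimum before padding, pad the rest, then merge touching segments as long as the merged span stays within the limit.

def post_process(segments, pad=0.2, min_dur=0.0, merge_max_dur=0.0):
    """segments: sorted (start, end) pairs in seconds from the Viterbi decode.
    Pass merge_max_dur=float("inf") to merge without a length limit."""
    kept = [(max(0.0, s - pad), e + pad) for (s, e) in segments if e - s >= min_dur]
    merged = []
    for s, e in kept:
        if merged and s <= merged[-1][1] and e - merged[-1][0] <= merge_max_dur:
            merged[-1] = (merged[-1][0], max(merged[-1][1], e))   # boundaries touch: merge
        else:
            merged.append((s, e))
    return merged

# pad=0.2 makes the two speech islands touch, so they merge into one segment
print(post_process([(0.50, 0.90), (1.05, 1.60)], pad=0.2, min_dur=0.1, merge_max_dur=10.0))
# [(0.3, 1.8)] (up to floating-point rounding)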
+############################################################################### + +if [ $stage -le 4 ]; then + steps/segmentation/post_process_sad_to_segments.sh \ + --segment-padding $segment_padding --min-segment-dur $min_segment_dur \ + --merge-consecutive-max-dur $merge_consecutive_max_dur \ + --cmd "$cmd" --frame-shift $(perl -e "print $frame_subsampling_factor * $frame_shift") \ + ${test_data_dir} ${seg_dir} ${seg_dir} +fi + +if [ $stage -le 5 ]; then + utils/data/subsegment_data_dir.sh ${test_data_dir} ${seg_dir}/segments \ + ${data_dir}_seg +fi + +echo "$0: Created output segmented kaldi data directory in ${data_dir}_seg" +exit 0 diff --git a/egs/chime6/s5_track2/local/segmentation/tuning/train_lstm_sad_1a.sh b/egs/chime6/s5_track2/local/segmentation/tuning/train_lstm_sad_1a.sh new file mode 100755 index 00000000000..5701424869a --- /dev/null +++ b/egs/chime6/s5_track2/local/segmentation/tuning/train_lstm_sad_1a.sh @@ -0,0 +1,140 @@ +#!/bin/bash + +# Copyright 2017 Nagendra Kumar Goel +# 2018 Vimal Manohar +# Apache 2.0 + +# This is a script to train a TDNN for speech activity detection (SAD) +# using LSTM for long-context information. + +stage=0 +train_stage=-10 +get_egs_stage=-10 +egs_opts= + +chunk_width=20 + +extra_left_context=60 +extra_right_context=10 +relu_dim=256 +cell_dim=256 +projection_dim=64 + +# training options +num_epochs=1 +initial_effective_lrate=0.0003 +final_effective_lrate=0.00003 +num_jobs_initial=3 +num_jobs_final=8 +remove_egs=true +max_param_change=0.2 # Small max-param change for small network +dropout_schedule='0,0@0.20,0.1@0.50,0' + +egs_dir= +nj=40 + +dir= +affix=1a + +data_dir= +targets_dir= + +. ./cmd.sh +if [ -f ./path.sh ]; then . ./path.sh; fi +. ./utils/parse_options.sh + +set -o pipefail +set -u + +if [ -z "$dir" ]; then + dir=exp/segmentation_1a/tdnn_lstm_asr_sad +fi +dir=$dir${affix:+_$affix} + +if ! cuda-compiled; then + cat < $dir/cmvn_opts + +if [ $stage -le 5 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=`feat-to-dim scp:$data_dir/feats.scp -` name=input + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2) affine-transform-file=$dir/configs/lda.mat + + relu-renorm-layer name=tdnn1 input=lda dim=$relu_dim add-log-stddev=true + relu-renorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$relu_dim add-log-stddev=true + relu-renorm-layer name=tdnn3 input=Append(-3,0,3,6) dim=$relu_dim add-log-stddev=true + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim decay-time=20 delay=-3 dropout-proportion=0.0 + relu-renorm-layer name=tdnn4 input=Append(-6,0,6,12) add-log-stddev=true dim=$relu_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim decay-time=20 delay=-3 dropout-proportion=0.0 + relu-renorm-layer name=tdnn5 input=Append(-12,0,12,24) dim=$relu_dim + + output-layer name=output include-log-softmax=true dim=3 learning-rate-factor=0.1 input=tdnn5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig \ + --config-dir $dir/configs/ + + cat <> $dir/configs/vars +num_targets=3 +EOF +fi + +if [ $stage -le 6 ]; then + num_utts=`cat $data_dir/utt2spk | wc -l` + # Set num_utts_subset for diagnostics to a reasonable value + # of max(min(0.005 * num_utts, 300), 12) + num_utts_subset=`perl -e '$n=int($ARGV[0] * 0.005); print ($n > 300 ? 300 : ($n < 12 ? 
12 : $n))' $num_utts` + + steps/nnet3/train_raw_rnn.py --stage=$train_stage \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$egs_dir" --egs.stage=$get_egs_stage \ + --egs.chunk-left-context=$extra_left_context \ + --egs.chunk-right-context=$extra_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --trainer.num-epochs=$num_epochs \ + --trainer.samples-per-iter=20000 \ + --trainer.optimization.num-jobs-initial=$num_jobs_initial \ + --trainer.optimization.num-jobs-final=$num_jobs_final \ + --trainer.optimization.initial-effective-lrate=$initial_effective_lrate \ + --trainer.optimization.final-effective-lrate=$final_effective_lrate \ + --trainer.optimization.shrink-value=0.99 \ + --trainer.dropout-schedule="$dropout_schedule" \ + --trainer.rnn.num-chunk-per-minibatch=128,64 \ + --trainer.optimization.momentum=0.5 \ + --trainer.deriv-truncate-margin=10 \ + --trainer.max-param-change=$max_param_change \ + --trainer.compute-per-dim-accuracy=true \ + --cmd="$decode_cmd" --nj $nj \ + --cleanup=true \ + --cleanup.remove-egs=$remove_egs \ + --cleanup.preserve-model-interval=10 \ + --use-gpu=true \ + --use-dense-targets=true \ + --feat-dir=$data_dir \ + --targets-scp="$targets_dir/targets.scp" \ + --egs.opts="--frame-subsampling-factor 3 --num-utts-subset $num_utts_subset" \ + --dir=$dir || exit 1 +fi + +if [ $stage -le 7 ]; then + # Use a subset to compute prior over the output targets + $train_cmd $dir/log/get_priors.log \ + matrix-sum-rows "scp:utils/subset_scp.pl --quiet 1000 $targets_dir/targets.scp |" \ + ark:- \| vector-sum --binary=false ark:- $dir/post_output.vec || exit 1 + + echo 3 > $dir/frame_subsampling_factor +fi diff --git a/egs/chime6/s5_track2/local/segmentation/tuning/train_stats_sad_1a.sh b/egs/chime6/s5_track2/local/segmentation/tuning/train_stats_sad_1a.sh new file mode 100755 index 00000000000..bb985462f49 --- /dev/null +++ b/egs/chime6/s5_track2/local/segmentation/tuning/train_stats_sad_1a.sh @@ -0,0 +1,150 @@ +#!/bin/bash + +# Copyright 2017 Nagendra Kumar Goel +# 2018 Vimal Manohar +# Apache 2.0 + +# This is a script to train a TDNN for speech activity detection (SAD) +# using statistics pooling for long-context information. + +stage=0 +train_stage=-10 +get_egs_stage=-10 +egs_opts= + +chunk_width=20 + +# The context is chosen to be around 1 second long. The context at test time +# is expected to be around the same. +extra_left_context=79 +extra_right_context=21 + +relu_dim=256 + +# training options +num_epochs=1 +initial_effective_lrate=0.0003 +final_effective_lrate=0.00003 +num_jobs_initial=3 +num_jobs_final=8 +remove_egs=true +max_param_change=0.2 # Small max-param change for small network + +egs_dir= +nj=40 + +dir= +affix=1a + +data_dir= +targets_dir= + +. ./cmd.sh +if [ -f ./path.sh ]; then . ./path.sh; fi +. ./utils/parse_options.sh + +set -o pipefail +set -u + +if [ -z "$dir" ]; then + dir=exp/segmentation_1a/tdnn_stats_sad +fi +dir=$dir${affix:+_$affix} + +if ! 
cuda-compiled; then + cat < $dir/cmvn_opts + +if [ $stage -le 5 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=`feat-to-dim scp:$data_dir/feats.scp -` name=input + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2) affine-transform-file=$dir/configs/lda.mat + + relu-renorm-layer name=tdnn1 input=lda dim=$relu_dim add-log-stddev=true + relu-renorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$relu_dim add-log-stddev=true + relu-renorm-layer name=tdnn3 input=Append(-3,0,3,6) dim=$relu_dim add-log-stddev=true + stats-layer name=tdnn3_stats config=mean+count(-99:3:9:99) + relu-renorm-layer name=tdnn4 input=Append(tdnn3@-6,tdnn3@0,tdnn3@6,tdnn3@12,tdnn3_stats) add-log-stddev=true dim=$relu_dim + stats-layer name=tdnn4_stats config=mean+count(-108:6:18:108) + relu-renorm-layer name=tdnn5 input=Append(tdnn4@-12,tdnn4@0,tdnn4@12,tdnn4@24,tdnn4_stats) dim=$relu_dim + + output-layer name=output include-log-softmax=true dim=3 learning-rate-factor=0.1 input=tdnn5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig \ + --config-dir $dir/configs/ + + cat <> $dir/configs/vars +num_targets=3 +EOF +fi + +if [ $stage -le 6 ]; then + num_utts=`cat $data_dir/utt2spk | wc -l` + # Set num_utts_subset for diagnostics to a reasonable value + # of max(min(0.005 * num_utts, 300), 12) + num_utts_subset=`perl -e '$n=int($ARGV[0] * 0.005); print ($n > 300 ? 300 : ($n < 12 ? 12 : $n))' $num_utts` + + steps/nnet3/train_raw_rnn.py --stage=$train_stage \ + --feat.cmvn-opts=$cmvn_opts \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$egs_dir" --egs.stage=$get_egs_stage \ + --egs.chunk-left-context=$extra_left_context \ + --egs.chunk-right-context=$extra_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --trainer.num-epochs=$num_epochs \ + --trainer.samples-per-iter=20000 \ + --trainer.optimization.num-jobs-initial=$num_jobs_initial \ + --trainer.optimization.num-jobs-final=$num_jobs_final \ + --trainer.optimization.initial-effective-lrate=$initial_effective_lrate \ + --trainer.optimization.final-effective-lrate=$final_effective_lrate \ + --trainer.rnn.num-chunk-per-minibatch=128,64 \ + --trainer.optimization.momentum=0.5 \ + --trainer.deriv-truncate-margin=10 \ + --trainer.max-param-change=$max_param_change \ + --trainer.compute-per-dim-accuracy=true \ + --cmd="$decode_cmd" --nj $nj \ + --cleanup=true \ + --cleanup.remove-egs=$remove_egs \ + --cleanup.preserve-model-interval=10 \ + --use-gpu=true \ + --use-dense-targets=true \ + --feat-dir=$data_dir \ + --targets-scp="$targets_dir/targets.scp" \ + --egs.opts="--frame-subsampling-factor 3 --num-utts-subset $num_utts_subset" \ + --dir=$dir || exit 1 +fi + +if [ $stage -le 7 ]; then + # Use a subset to compute prior over the output targets + #$train_cmd $dir/log/get_priors.log \ + # matrix-sum-rows "scp:utils/subset_scp.pl --quiet 1000 $targets_dir/targets.scp |" \ + # ark:- \| vector-sum --binary=false ark:- $dir/post_output.vec || exit 1 + + # Since the train data is individual microphones, while the dev and + # eval are beamformed, it is likely that the train contains a much + # higher ratio of silences. So using priors computed from the train + # data may miss a lot of speech in the dev/eval sets. Hence we manually + # tune the prior on the dev set. 
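  # (Annotation, not part of the original script.) Roughly speaking, the SAD
  # decoding stage divides the network posteriors by these priors to obtain
  # pseudo-likelihoods, so a large first entry deliberately down-weights the
  # corresponding class (here, presumably silence). A commented illustration
  # with a made-up frame posterior [ 0.6 0.3 0.1 ] and the prior [ 30 2 1 ]
  # normalized to sum to one:
  #   perl -e '@post = (0.6, 0.3, 0.1); @prior = (30, 2, 1);
  #     $sum = 0; $sum += $_ for @prior;
  #     printf("%.2f %.2f %.2f\n", map { $post[$_] / ($prior[$_] / $sum) } 0 .. 2);'
  #   # prints roughly 0.66 4.95 3.30
  # so the first class loses to the others even though its raw posterior was
  # the largest.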
+ # With the following prior, the SAD system results are: + # Dev (using -c 0.25) + # MISSED SPEECH = 1188.59 secs ( 3.3 percent of scored time) + # FALARM SPEECH = 539.37 secs ( 1.5 percent of scored time) + echo "[ 30 2 1 ]" > $dir/post_output.vec || exit 1 + + echo 3 > $dir/frame_subsampling_factor +fi + diff --git a/egs/chime6/s5_track2/local/train_diarizer.sh b/egs/chime6/s5_track2/local/train_diarizer.sh new file mode 100755 index 00000000000..71918e7cabc --- /dev/null +++ b/egs/chime6/s5_track2/local/train_diarizer.sh @@ -0,0 +1,186 @@ +#!/bin/bash +# Copyright +# 2019 David Snyder +# Apache 2.0. +# +# This script is based on the run.sh script in the Voxceleb v2 recipe. +# It trains an x-vector DNN for diarization. + +mfccdir=`pwd`/mfcc +vaddir=`pwd`/mfcc + +voxceleb1_root=/export/corpora/VoxCeleb1 +voxceleb2_root=/export/corpora/VoxCeleb2 +data_dir=train_worn_simu_u400k +model_dir=exp/xvector_nnet_1a + +stage=0 +train_stage=-1 + +. ./cmd.sh + +if [ -f ./path.sh ]; then . ./path.sh; fi +set -e -u -o pipefail +. utils/parse_options.sh + +if [ $# -ne 0 ]; then + exit 1 +fi + +if [ $stage -le 0 ]; then + echo "$0: preparing voxceleb 2 data" + local/make_voxceleb2.pl $voxceleb2_root dev data/voxceleb2_train + local/make_voxceleb2.pl $voxceleb2_root test data/voxceleb2_test + + echo "$0: preparing voxceleb 1 data (see comments if this step fails)" + # The format of the voxceleb 1 corpus has changed several times since it was + # released. Therefore, our dataprep scripts may or may not fail depending + # on the version of the corpus you obtained. + # If you downloaded the corpus soon after it was first released, this + # version of the dataprep script might work: + local/make_voxceleb1.pl $voxceleb1_root data/voxceleb1 + # However, if you've downloaded the corpus recently, you may need to use the + # the following scripts instead: + #local/make_voxceleb1_v2.pl $voxceleb1_root dev data/voxceleb1_train + #local/make_voxceleb1_v2.pl $voxceleb1_root test data/voxceleb1_test + + # We should now have about 7,351 speakers and 1,277,503 utterances. + utils/combine_data.sh data/voxceleb data/voxceleb2_train data/voxceleb2_test +fi + +if [ $stage -le 1 ]; then + echo "$0: preparing features for training data (voxceleb 1 + 2)" + steps/make_mfcc.sh --write-utt2num-frames true \ + --mfcc-config conf/mfcc_hires.conf --nj 40 --cmd "$train_cmd" \ + data/voxceleb exp/make_mfcc $mfccdir + utils/fix_data_dir.sh data/voxceleb + # Note that we apply CMN to the MFCCs and write these to the disk. These + # features will later be used to train the x-vector DNN. +fi + +# In this section, we augment the voxceleb data with reverberation. +# Note that we can probably improve the x-vector DNN if we include +# augmentations from the nonspeech regions of the Chime 6 training +# dataset. +if [ $stage -le 2 ]; then + echo "$0: applying augmentation to x-vector training data (just reverb for now)" + frame_shift=0.01 + awk -v frame_shift=$frame_shift '{print $1, $2*frame_shift;}' data/voxceleb/utt2num_frames > data/voxceleb/reco2dur + + if [ ! 
-d "RIRS_NOISES" ]; then + echo "$0: downloading simulated room impulse response dataset" + # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises + wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip + unzip rirs_noises.zip + fi + + # Make a version with reverberated speech + rvb_opts=() + rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/smallroom/rir_list") + rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/mediumroom/rir_list") + + # Make a reverberated version of the training data. Note that we don't add any + # additive noise here. + steps/data/reverberate_data_dir.py \ + "${rvb_opts[@]}" \ + --speech-rvb-probability 1 \ + --pointsource-noise-addition-probability 0 \ + --isotropic-noise-addition-probability 0 \ + --num-replications 1 \ + --source-sampling-rate 16000 \ + data/voxceleb data/voxceleb_reverb + utils/copy_data_dir.sh --utt-suffix "-reverb" data/voxceleb_reverb data/voxceleb_reverb.new + rm -rf data/voxceleb_reverb + mv data/voxceleb_reverb.new data/voxceleb_reverb +fi + +if [ $stage -le 3 ]; then + echo "$0: making MFCCs for augmented training data" + # Make MFCCs for the augmented data. Note that we do not compute a new + # vad.scp file here. Instead, we use the vad.scp from the clean version of + # the list. + steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj 40 --cmd "$train_cmd" \ + data/voxceleb_reverb exp/make_mfcc $mfccdir + # Combine the clean and augmented training data. This is now roughly + # double the size of the original clean list. + utils/combine_data.sh data/voxceleb_combined data/voxceleb_reverb data/voxceleb +fi + +# Now we prepare the features to generate examples for xvector training. +if [ $stage -le 4 ]; then + # This script applies CMVN and removes nonspeech frames. Note that this is somewhat + # wasteful, as it roughly doubles the amount of training data on disk. After + # creating voxceleb examples, this can be removed. + echo "$0: preparing features to train x-vector DNN" + local/nnet3/xvector/prepare_feats.sh --nj 40 --cmd "$train_cmd" \ + data/voxceleb_combined data/voxceleb_combined_cmn exp/voxceleb_combined_cmn + utils/fix_data_dir.sh data/voxceleb_combined_cmn +fi + +if [ $stage -le 5 ]; then + # Now, we need to remove features that are too short after removing silence + # frames. We want at least 4s (400 frames) per utterance. + min_len=400 + mv data/voxceleb_combined_cmn/utt2num_frames data/voxceleb_combined_cmn/utt2num_frames.bak + awk -v min_len=${min_len} '$2 > min_len {print $1, $2}' data/voxceleb_combined_cmn/utt2num_frames.bak > data/voxceleb_combined_cmn/utt2num_frames + utils/filter_scp.pl data/voxceleb_combined_cmn/utt2num_frames data/voxceleb_combined_cmn/utt2spk > data/voxceleb_combined_cmn/utt2spk.new + mv data/voxceleb_combined_cmn/utt2spk.new data/voxceleb_combined_cmn/utt2spk + utils/fix_data_dir.sh data/voxceleb_combined_cmn + + # We also want several utterances per speaker. Now we'll throw out speakers + # with fewer than 8 utterances. 
+ min_num_utts=8 + awk '{print $1, NF-1}' data/voxceleb_combined_cmn/spk2utt > data/voxceleb_combined_cmn/spk2num + awk -v min_num_utts=${min_num_utts} '$2 >= min_num_utts {print $1, $2}' data/voxceleb_combined_cmn/spk2num | utils/filter_scp.pl - data/voxceleb_combined_cmn/spk2utt > data/voxceleb_combined_cmn/spk2utt.new + mv data/voxceleb_combined_cmn/spk2utt.new data/voxceleb_combined_cmn/spk2utt + utils/spk2utt_to_utt2spk.pl data/voxceleb_combined_cmn/spk2utt > data/voxceleb_combined_cmn/utt2spk + + utils/filter_scp.pl data/voxceleb_combined_cmn/utt2spk data/voxceleb_combined_cmn/utt2num_frames > data/voxceleb_combined_cmn/utt2num_frames.new + mv data/voxceleb_combined_cmn/utt2num_frames.new data/voxceleb_combined_cmn/utt2num_frames + + utils/fix_data_dir.sh data/voxceleb_combined_cmn +fi + +# Stages 6 through 8 are handled in run_xvector.sh. +# This script trains the x-vector DNN on the augmented voxceleb data. +local/nnet3/xvector/run_xvector.sh --stage $stage --train-stage $train_stage \ + --data data/voxceleb_combined_cmn --nnet-dir $model_dir \ + --egs-dir $model_dir/egs + +if [ $stage -le 9 ]; then + echo "$0: preparing a subset of Chime 6 training data to train PLDA model" + utils/subset_data_dir.sh ${data_dir} 100000 data/plda_train + steps/make_mfcc.sh --write-utt2num-frames true \ + --mfcc-config conf/mfcc_hires.conf --nj 40 --cmd "$train_cmd" \ + data/plda_train exp/make_mfcc $mfccdir + utils/fix_data_dir.sh data/plda_train + local/nnet3/xvector/prepare_feats.sh --nj 40 --cmd "$train_cmd" \ + data/plda_train data/plda_train_cmn exp/plda_train_cmn + if [ -f data/plda_train/segments ]; then + cp data/plda_train/segments data/plda_train_cmn/ + fi +fi + +if [ $stage -le 10 ]; then + echo "$0: extracting x-vector for PLDA training data" + utils/fix_data_dir.sh data/plda_train_cmn + diarization/nnet3/xvector/extract_xvectors.sh --cmd "$train_cmd --mem 10G" \ + --nj 40 --window 3.0 --period 10.0 --min-segment 1.5 --apply-cmn false \ + --hard-min true $model_dir \ + data/plda_train_cmn $model_dir/xvectors_plda_train +fi + +# Train PLDA models +if [ $stage -le 11 ]; then + echo "$0: training PLDA model" + $train_cmd $model_dir/xvectors_plda_train/log/plda.log \ + ivector-compute-plda ark:$model_dir/xvectors_plda_train/spk2utt \ + "ark:ivector-subtract-global-mean \ + scp:$model_dir/xvectors_plda_train/xvector.scp ark:- \ + | transform-vec $model_dir/xvectors_plda_train/transform.mat ark:- ark:- \ + | ivector-normalize-length ark:- ark:- |" \ + $model_dir/xvectors_plda_train/plda || exit 1; + cp $model_dir/xvectors_plda_train/plda $model_dir/ + cp $model_dir/xvectors_plda_train/transform.mat $model_dir/ + cp $model_dir/xvectors_plda_train/mean.vec $model_dir/ +fi diff --git a/egs/chime6/s5_track2/local/train_lms_srilm.sh b/egs/chime6/s5_track2/local/train_lms_srilm.sh new file mode 120000 index 00000000000..a7666f6cded --- /dev/null +++ b/egs/chime6/s5_track2/local/train_lms_srilm.sh @@ -0,0 +1 @@ +../../s5_track1/local/train_lms_srilm.sh \ No newline at end of file diff --git a/egs/chime6/s5_track2/local/train_sad.sh b/egs/chime6/s5_track2/local/train_sad.sh new file mode 100755 index 00000000000..e12a0cad694 --- /dev/null +++ b/egs/chime6/s5_track2/local/train_sad.sh @@ -0,0 +1,155 @@ +#!/bin/bash + +# Copyright 2017 Nagendra Kumar Goel +# 2017 Vimal Manohar +# 2019 Desh Raj +# Apache 2.0 + +# This script is based on local/run_asr_segmentation.sh script in the +# Aspire recipe. It demonstrates nnet3-based speech activity detection for +# segmentation. 
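# (Annotation, not part of the original script.) A typical invocation, matching
# what stage 16 of run.sh in this directory does:
#   local/train_sad.sh --stage 0 --nj 50 \
#     --data-dir data/train_worn_u400k --test-sets "$test_sets" \
#     --sat-model-dir exp/tri3_cleaned --model-dir exp/tri2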
+# This script:
+# 1) Prepares targets (per-frame labels) for a subset of training data
+#    using GMM models
+# 2) Trains TDNN+Stats or TDNN+LSTM neural network using the targets
+# 3) Demonstrates using the SAD system to get segments of dev data
+
+lang=data/lang   # Must match the one used to train the models
+lang_test=data/lang_test  # Lang directory for decoding.
+
+data_dir=
+test_sets=
+# Model directory used to align the $data_dir to get target labels for training
+# SAD. This should typically be a speaker-adapted system.
+sat_model_dir=
+# Model directory used to decode the whole-recording version of the $data_dir to
+# get target labels for training SAD. This should typically be a
+# speaker-independent system like an LDA+MLLT system.
+model_dir=
+graph_dir=   # Graph for decoding whole-recording version of $data_dir.
+             # If not provided, a new one will be created using $lang_test
+
+# List of weights on labels obtained from alignment;
+# labels obtained from decoding; and default labels in out-of-segment regions
+merge_weights=1.0,0.1,0.5
+
+prepare_targets_stage=-10
+nstage=-10
+train_stage=-10
+stage=0
+nj=50
+reco_nj=40
+
+# test options
+test_nj=10
+
+. ./cmd.sh
+. ./conf/sad.conf
+
+if [ -f ./path.sh ]; then . ./path.sh; fi
+
+set -e -u -o pipefail
+. utils/parse_options.sh
+
+if [ $# -ne 0 ]; then
+  exit 1
+fi
+
+dir=exp/segmentation${affix}
+sad_work_dir=exp/sad${affix}_${nnet_type}/
+sad_nnet_dir=$dir/tdnn_${nnet_type}_sad_1a
+
+mkdir -p $dir
+mkdir -p ${sad_work_dir}
+
+# See $lang/phones.txt and decide which should be garbage
+garbage_phones="laughs inaudible"
+silence_phones="sil spn noise"
+
+for p in $garbage_phones; do
+  for a in "" "_B" "_E" "_I" "_S"; do
+    echo "$p$a"
+  done
+done > $dir/garbage_phones.txt
+
+for p in $silence_phones; do
+  for a in "" "_B" "_E" "_I" "_S"; do
+    echo "$p$a"
+  done
+done > $dir/silence_phones.txt
+
+if ! cat $dir/garbage_phones.txt $dir/silence_phones.txt | \
+  steps/segmentation/internal/verify_phones_list.py $lang/phones.txt; then
+  echo "$0: Invalid $dir/{silence,garbage}_phones.txt"
+  exit 1
+fi
+
+# The training data may already be segmented, so we first prepare
+# a "whole" training data (not segmented) for training the SAD
+# system.
+
+whole_data_dir=${data_dir}_whole
+whole_data_id=$(basename $whole_data_dir)
+
+if [ $stage -le 0 ]; then
+  utils/data/convert_data_dir_to_whole.sh $data_dir $whole_data_dir
+fi
+
+###############################################################################
+# Extract features for the whole data directory. We extract 13-dim MFCCs to
+# generate targets using the GMM system, and 40-dim MFCCs to train the NN-based
+# SAD.
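# (Annotation, not part of the original script.) The two feature streams live
# in separate data directories: $whole_data_dir gets the 13-dim MFCCs
# (conf/mfcc.conf) expected by the existing GMM models, while
# ${whole_data_dir}_hires gets 40-dim MFCCs (conf/mfcc_hires.conf) for the
# neural network. A commented sanity check once stage 1 has run:
#   feat-to-dim scp:${whole_data_dir}_hires/feats.scp -   # expect 40
#   feat-to-dim scp:${whole_data_dir}/feats.scp -         # expect 13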
+############################################################################### +if [ $stage -le 1 ]; then + steps/make_mfcc.sh --nj $reco_nj --cmd "$train_cmd" --write-utt2num-frames true \ + --mfcc-config conf/mfcc.conf \ + $whole_data_dir exp/make_mfcc/${whole_data_id} + steps/compute_cmvn_stats.sh $whole_data_dir exp/make_mfcc/${whole_data_id} + utils/fix_data_dir.sh $whole_data_dir + + utils/copy_data_dir.sh $whole_data_dir ${whole_data_dir}_hires + steps/make_mfcc.sh --nj $reco_nj --cmd "$train_cmd" --write-utt2num-frames true \ + --mfcc-config conf/mfcc_hires.conf \ + ${whole_data_dir}_hires exp/make_mfcc/${whole_data_id}_hires + steps/compute_cmvn_stats.sh ${whole_data_dir}_hires exp/make_mfcc/${whole_data_id}_hires + utils/fix_data_dir.sh ${whole_data_dir}_hires +fi + +############################################################################### +# Prepare SAD targets for recordings +############################################################################### +targets_dir=$dir/${whole_data_id}_combined_targets_sub3 +if [ $stage -le 2 ]; then + steps/segmentation/prepare_targets_gmm.sh --stage $prepare_targets_stage \ + --train-cmd "$train_cmd" --decode-cmd "$decode_cmd" \ + --nj $nj --reco-nj $reco_nj --lang-test $lang \ + --garbage-phones-list $dir/garbage_phones.txt \ + --silence-phones-list $dir/silence_phones.txt \ + --merge-weights "$merge_weights" \ + --remove-mismatch-frames false \ + --graph-dir "$graph_dir" \ + $lang $data_dir $whole_data_dir $sat_model_dir $model_dir $dir +fi + +############################################################################### +# Train a neural network for SAD +############################################################################### +if [ $stage -le 3 ]; then + if [ $nnet_type == "stats" ]; then + # Train a STATS-pooling network for SAD + local/segmentation/tuning/train_stats_sad_1a.sh \ + --stage $nstage --train-stage $train_stage \ + --targets-dir ${targets_dir} \ + --data-dir ${whole_data_dir}_hires --affix "1a" || exit 1 + + elif [ $nnet_type == "lstm" ]; then + # Train a TDNN+LSTM network for SAD + local/segmentation/tuning/train_lstm_sad_1a.sh \ + --stage $nstage --train-stage $train_stage \ + --targets-dir ${targets_dir} \ + --data-dir ${whole_data_dir}_hires --affix "1a" || exit 1 + + fi +fi + +exit 0; diff --git a/egs/chime6/s5_track2/local/wer_output_filter b/egs/chime6/s5_track2/local/wer_output_filter new file mode 120000 index 00000000000..12a6c616d3d --- /dev/null +++ b/egs/chime6/s5_track2/local/wer_output_filter @@ -0,0 +1 @@ +../../s5_track1/local/wer_output_filter \ No newline at end of file diff --git a/egs/chime6/s5_track2/path.sh b/egs/chime6/s5_track2/path.sh new file mode 100644 index 00000000000..c2526194bee --- /dev/null +++ b/egs/chime6/s5_track2/path.sh @@ -0,0 +1,7 @@ +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sctk/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C + diff --git a/egs/chime6/s5_track2/run.sh b/egs/chime6/s5_track2/run.sh new file mode 100755 index 00000000000..1350b8e14d5 --- /dev/null +++ b/egs/chime6/s5_track2/run.sh @@ -0,0 +1,296 @@ +#!/bin/bash +# +# Chime-6 Track 2 baseline. 
Based mostly on the Chime-5 recipe, with the exception +# that we are required to perform speech activity detection and speaker +# diarization before ASR, since we do not have access to the oracle SAD and +# diarization labels. +# +# Copyright 2017 Johns Hopkins University (Author: Shinji Watanabe and Yenda Trmal) +# 2019 Desh Raj, David Snyder, Ashish Arora +# Apache 2.0 + +# Begin configuration section. +nj=50 +decode_nj=20 +stage=0 +nnet_stage=-10 +sad_stage=0 +diarizer_stage=0 +decode_stage=1 +enhancement=beamformit # for a new enhancement method, + # change this variable and decode stage +decode_only=false +num_data_reps=4 +snrs="20:10:15:5:0" +foreground_snrs="20:10:15:5:0" +background_snrs="20:10:15:5:0" +# End configuration section +. ./utils/parse_options.sh + +. ./cmd.sh +. ./path.sh + +if [ $decode_only == "true" ]; then + stage=18 +fi + +set -e # exit on error + +# chime5 main directory path +# please change the path accordingly +chime5_corpus=/export/corpora4/CHiME5 +# chime6 data directories, which are generated from ${chime5_corpus}, +# to synchronize audio files across arrays and modify the annotation (JSON) file accordingly +chime6_corpus=${PWD}/CHiME6 +json_dir=${chime6_corpus}/transcriptions +audio_dir=${chime6_corpus}/audio + +# training and test data +train_set=train_worn_simu_u400k +sad_train_set=train_worn_u400k +test_sets="dev_${enhancement}_dereverb eval_${enhancement}_dereverb" + +# This script also needs the phonetisaurus g2p, srilm, beamformit +./local/check_tools.sh || exit 1; + +########################################################################### +# We first generate the synchronized audio files across arrays and +# corresponding JSON files. Note that this requires sox v14.4.2, +# which is installed via miniconda in ./local/check_tools.sh +########################################################################### + +if [ $stage -le 0 ]; then + local/generate_chime6_data.sh \ + --cmd "$train_cmd" \ + ${chime5_corpus} \ + ${chime6_corpus} +fi + +########################################################################### +# We prepare dict and lang in stages 1 to 3. 
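# (Annotation, not part of the original script.) After stage 3 the selected
# SRILM 3-gram (data/srilm/best_3gram.gz) has been compiled into
# data/lang/G.fst. A commented sanity check of the finished lang directory:
#   utils/validate_lang.pl data/lang   # checks phones, lexicon FST, G.fst, etc.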
+########################################################################### + +if [ $stage -le 1 ]; then + # skip u03 and u04 as they are missing + for mictype in worn u01 u02 u05 u06; do + local/prepare_data.sh --mictype ${mictype} --train true \ + ${audio_dir}/train ${json_dir}/train data/train_${mictype} + done + for dataset in dev; do + for mictype in worn; do + local/prepare_data.sh --mictype ${mictype} --train true \ + ${audio_dir}/${dataset} ${json_dir}/${dataset} \ + data/${dataset}_${mictype} + done + done +fi + +if [ $stage -le 2 ]; then + local/prepare_dict.sh + + utils/prepare_lang.sh \ + data/local/dict "" data/local/lang data/lang + + local/train_lms_srilm.sh \ + --train-text data/train_worn/text --dev-text data/dev_worn/text \ + --oov-symbol "" --words-file data/lang/words.txt \ + data/ data/srilm +fi + +LM=data/srilm/best_3gram.gz +if [ $stage -le 3 ]; then + # Compiles G for chime5 trigram LM + utils/format_lm.sh \ + data/lang $LM data/local/dict/lexicon.txt data/lang + +fi + +if [ $stage -le 4 ]; then + # remove possibly bad sessions (P11_S03, P52_S19, P53_S24, P54_S24) + # see http://spandh.dcs.shef.ac.uk/chime_challenge/data.html for more details + utils/copy_data_dir.sh data/train_worn data/train_worn_org # back up + grep -v -e "^P11_S03" -e "^P52_S19" -e "^P53_S24" -e "^P54_S24" data/train_worn_org/text > data/train_worn/text + utils/fix_data_dir.sh data/train_worn +fi + + +######################################################################################### +# In stages 5 and 6, we augment and fix train data for our training purpose. point source +# noises are extracted from chime corpus. Here we use 400k utterances from array microphones, +# its augmentation and all the worn set utterances in train. +######################################################################################### + +if [ $stage -le 5 ]; then + echo "$0: Extracting noise list from training data" + local/extract_noises.py $chime6_corpus/audio/train $chime6_corpus/transcriptions/train \ + local/distant_audio_list distant_noises + local/make_noise_list.py distant_noises > distant_noise_list + + noise_list=distant_noise_list + + echo "$0: Preparing simulated RIRs for data augmentation" + if [ ! 
-d RIRS_NOISES/ ]; then + # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises + wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip + unzip rirs_noises.zip + fi + + # This is the config for the system using simulated RIRs and point-source noises + rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/smallroom/rir_list") + rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/mediumroom/rir_list") + rvb_opts+=(--noise-set-parameters $noise_list) + + steps/data/reverberate_data_dir.py \ + "${rvb_opts[@]}" \ + --prefix "rev" \ + --foreground-snrs $foreground_snrs \ + --background-snrs $background_snrs \ + --speech-rvb-probability 1 \ + --pointsource-noise-addition-probability 1 \ + --isotropic-noise-addition-probability 1 \ + --num-replications $num_data_reps \ + --max-noises-per-minute 1 \ + --source-sampling-rate 16000 \ + data/train_worn data/train_worn_rvb +fi + +if [ $stage -le 6 ]; then + # combine mix array and worn mics + # randomly extract first 400k utterances from all mics + # if you want to include more training data, you can increase the number of array mic utterances + utils/combine_data.sh data/train_uall data/train_u01 data/train_u02 data/train_u05 data/train_u06 + utils/subset_data_dir.sh data/train_uall 400000 data/train_u400k + utils/combine_data.sh data/${train_set} data/train_worn data/train_worn_rvb data/train_u400k + utils/combine_data.sh data/${sad_train_set} data/train_worn data/train_u400k +fi + +if [ $stage -le 7 ]; then + # Split speakers up into 3-minute chunks. This doesn't hurt adaptation, and + # lets us use more jobs for decoding etc. + utils/copy_data_dir.sh data/${train_set} data/${train_set}_nosplit + utils/data/modify_speaker_info.sh --seconds-per-spk-max 180 data/${train_set}_nosplit data/${train_set} +fi + +################################################################################## +# Now make MFCC features. We use 13-dim MFCCs to train the GMM-HMM models. +################################################################################## + +if [ $stage -le 8 ]; then + # Now make MFCC features. + # mfccdir should be some place with a largish disk where you + # want to store MFCC features. + echo "$0: make features..." + mfccdir=mfcc + steps/make_mfcc.sh --nj $nj --cmd "$train_cmd" \ + --mfcc-config conf/mfcc.conf \ + data/${train_set} exp/make_mfcc/${train_set} $mfccdir + steps/compute_cmvn_stats.sh data/${train_set} exp/make_mfcc/${train_set} $mfccdir + utils/fix_data_dir.sh data/${train_set} +fi + +################################################################################### +# Stages 9 to 14 train monophone and triphone models. They will be used for +# generating lattices for training the chain model and for obtaining targets +# for training the SAD system. 
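# (Annotation, not part of the original script; simplified.) The link to SAD
# training: frame-level phone alignments from these GMM systems can be mapped
# to speech / silence / garbage classes to form per-frame targets (the actual
# logic is in steps/segmentation/prepare_targets_gmm.sh, invoked from
# local/train_sad.sh). A commented peek at such an alignment, one phone id per
# frame:
#   ali-to-phones --per-frame=true exp/tri3/final.mdl \
#     "ark:gunzip -c exp/tri3/ali.1.gz |" ark,t:- | head -n 1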
+################################################################################### + +if [ $stage -le 9 ]; then + # make a subset for monophone training + utils/subset_data_dir.sh --shortest data/${train_set} 100000 data/${train_set}_100kshort + utils/subset_data_dir.sh data/${train_set}_100kshort 30000 data/${train_set}_30kshort +fi + +if [ $stage -le 10 ]; then + # Starting basic training on MFCC features + steps/train_mono.sh --nj $nj --cmd "$train_cmd" \ + data/${train_set}_30kshort data/lang exp/mono +fi + +if [ $stage -le 11 ]; then + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/${train_set} data/lang exp/mono exp/mono_ali + + steps/train_deltas.sh --cmd "$train_cmd" \ + 2500 30000 data/${train_set} data/lang exp/mono_ali exp/tri1 +fi + +if [ $stage -le 12 ]; then + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/${train_set} data/lang exp/tri1 exp/tri1_ali + + steps/train_lda_mllt.sh --cmd "$train_cmd" \ + 4000 50000 data/${train_set} data/lang exp/tri1_ali exp/tri2 +fi + +if [ $stage -le 13 ]; then + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/${train_set} data/lang exp/tri2 exp/tri2_ali + + steps/train_sat.sh --cmd "$train_cmd" \ + 5000 100000 data/${train_set} data/lang exp/tri2_ali exp/tri3 +fi + +if [ $stage -le 14 ]; then + # The following script cleans the data and produces cleaned data + steps/cleanup/clean_and_segment_data.sh --nj $nj --cmd "$train_cmd" \ + --segmentation-opts "--min-segment-length 0.3 --min-new-segment-length 0.6" \ + data/${train_set} data/lang exp/tri3 exp/tri3_cleaned data/${train_set}_cleaned +fi + +########################################################################## +# CHAIN MODEL TRAINING +# You can also download a pretrained chain ASR model using: +# wget http://kaldi-asr.org/models/12/0012_asr_v1.tar.gz +# Once it is downloaded, extract using: tar -xvzf 0012_asr_v1.tar.gz +# and copy the contents of the exp/ directory to your exp/ +########################################################################## +if [ $stage -le 15 ]; then + # chain TDNN + local/chain/run_tdnn.sh --nj $nj \ + --stage $nnet_stage \ + --train-set ${train_set}_cleaned \ + --test-sets "$test_sets" \ + --gmm tri3_cleaned --nnet3-affix _${train_set}_cleaned_rvb +fi + +########################################################################## +# SAD MODEL TRAINING +# You can also download a pretrained SAD model using: +# wget http://kaldi-asr.org/models/12/0012_sad_v1.tar.gz +# Once it is downloaded, extract using: tar -xvzf 0012_sad_v1.tar.gz +# and copy the contents of the exp/ directory to your exp/ +########################################################################## +if [ $stage -le 16 ]; then + local/train_sad.sh --stage $sad_stage --nj $nj \ + --data-dir data/${sad_train_set} --test-sets "${test_sets}" \ + --sat-model-dir exp/tri3_cleaned \ + --model-dir exp/tri2 +fi + +########################################################################## +# DIARIZATION MODEL TRAINING +# You can also download a pretrained diarization model using: +# wget http://kaldi-asr.org/models/12/0012_diarization_v1.tar.gz +# Once it is downloaded, extract using: tar -xvzf 0012_diarization_v1.tar.gz +# and copy the contents of the exp/ directory to your exp/ +########################################################################## +if [ $stage -le 17 ]; then + local/train_diarizer.sh --stage $diarizer_stage \ + --data-dir data/${train_set} \ + --model-dir exp/xvector_nnet_1a +fi + 
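# (Annotation, not part of the original script.) Because every block above is
# guarded by a "[ $stage -le N ]" check, partial re-runs only need the right
# option values, e.g. (commented):
#   ./run.sh --stage 16           # redo SAD + diarizer training and decoding
#   ./run.sh --decode-only true   # jump straight to stage 18 (decoding)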
+########################################################################## +# DECODING: In track 2, we are given raw utterances without segment +# or speaker information, so we have to decode the whole pipeline, i.e., +# SAD -> Diarization -> ASR. This is done in the local/decode.sh +# script. +########################################################################## +if [ $stage -le 18 ]; then + local/decode.sh --stage $decode_stage \ + --enhancement $enhancement \ + --test-sets "$test_sets" +fi + +exit 0; + diff --git a/egs/chime6/s5_track2/sid b/egs/chime6/s5_track2/sid new file mode 120000 index 00000000000..893a12f30c9 --- /dev/null +++ b/egs/chime6/s5_track2/sid @@ -0,0 +1 @@ +../../sre08/v1/sid \ No newline at end of file diff --git a/egs/chime6/s5_track2/steps b/egs/chime6/s5_track2/steps new file mode 120000 index 00000000000..1b186770dd1 --- /dev/null +++ b/egs/chime6/s5_track2/steps @@ -0,0 +1 @@ +../../wsj/s5/steps/ \ No newline at end of file diff --git a/egs/chime6/s5_track2/utils b/egs/chime6/s5_track2/utils new file mode 120000 index 00000000000..a3279dc8679 --- /dev/null +++ b/egs/chime6/s5_track2/utils @@ -0,0 +1 @@ +../../wsj/s5/utils/ \ No newline at end of file diff --git a/egs/cifar/v1/image/copy_data_dir.sh b/egs/cifar/v1/image/copy_data_dir.sh new file mode 100755 index 00000000000..c923f5cc07a --- /dev/null +++ b/egs/cifar/v1/image/copy_data_dir.sh @@ -0,0 +1,118 @@ +#!/bin/bash + +# Copyright 2013 Johns Hopkins University (author: Daniel Povey) +# Apache 2.0 + +# This script operates on a directory, such as in data/train/, +# that contains some subset of the following files: +# feats.scp +# images.scp +# vad.scp +# spk2utt +# utt2spk +# text +# +# It copies to another directory, possibly adding a specified prefix or a suffix +# to the utterance and/or speaker names. Note, the recording-ids stay the same. +# + + +# begin configuration section +spk_prefix= +utt_prefix= +spk_suffix= +utt_suffix= +validate_opts= # should rarely be needed. +# end configuration section + +. utils/parse_options.sh + +if [ $# != 2 ]; then + echo "Usage: " + echo " $0 [options] " + echo "e.g.:" + echo " $0 --spk-prefix=1- --utt-prefix=1- data/train data/train_1" + echo "Options" + echo " --spk-prefix= # Prefix for speaker ids, default empty" + echo " --utt-prefix= # Prefix for utterance ids, default empty" + echo " --spk-suffix= # Suffix for speaker ids, default empty" + echo " --utt-suffix= # Suffix for utterance ids, default empty" + exit 1; +fi + + +export LC_ALL=C + +srcdir=$1 +destdir=$2 + +if [ ! -f $srcdir/utt2spk ]; then + echo "copy_data_dir.sh: no such file $srcdir/utt2spk" + exit 1; +fi + +if [ "$destdir" == "$srcdir" ]; then + echo "$0: this script requires and to be different." + exit 1 +fi + +set -e; + +mkdir -p $destdir + +cat $srcdir/utt2spk | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/utt_map +cat $srcdir/spk2utt | awk -v p=$spk_prefix -v s=$spk_suffix '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/spk_map + +if [ ! -f $srcdir/utt2uniq ]; then + if [[ ! -z $utt_prefix || ! 
-z $utt_suffix ]]; then + cat $srcdir/utt2spk | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s%s%s %s\n", p, $1, s, $1);}' > $destdir/utt2uniq + fi +else + cat $srcdir/utt2uniq | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s%s%s %s\n", p, $1, s, $2);}' > $destdir/utt2uniq +fi + +cat $srcdir/utt2spk | utils/apply_map.pl -f 1 $destdir/utt_map | \ + utils/apply_map.pl -f 2 $destdir/spk_map >$destdir/utt2spk + +utils/utt2spk_to_spk2utt.pl <$destdir/utt2spk >$destdir/spk2utt + +if [ -f $srcdir/feats.scp ]; then + utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/feats.scp >$destdir/feats.scp +fi + +if [ -f $srcdir/vad.scp ]; then + utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/vad.scp >$destdir/vad.scp +fi + +if [ -f $srcdir/images.scp ]; then + utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/images.scp >$destdir/images.scp +fi + +if [ -f $srcdir/text ]; then + utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/text >$destdir/text +fi +if [ -f $srcdir/utt2dur ]; then + utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2dur >$destdir/utt2dur +fi +if [ -f $srcdir/cmvn.scp ]; then + utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/cmvn.scp >$destdir/cmvn.scp +fi + +rm $destdir/spk_map $destdir/utt_map + +echo "$0: copied data from $srcdir to $destdir" + +for f in feats.scp cmvn.scp vad.scp utt2uniq utt2dur utt2num_frames text images.scp; do + if [ -f $destdir/$f ] && [ ! -f $srcdir/$f ]; then + echo "$0: file $f exists in dest $destdir but not in src $srcdir. Moving it to" + echo " ... $destdir/.backup/$f" + mkdir -p $destdir/.backup + mv $destdir/$f $destdir/.backup/ + fi +done + + +[ ! -f $srcdir/feats.scp ] && validate_opts="$validate_opts --no-feats" +[ ! -f $srcdir/text ] && validate_opts="$validate_opts --no-text" + +utils/validate_data_dir.sh $validate_opts $destdir diff --git a/egs/cifar/v1/image/get_allowed_lengths.py b/egs/cifar/v1/image/get_allowed_lengths.py index 02321fdd2df..33996c8eef1 100755 --- a/egs/cifar/v1/image/get_allowed_lengths.py +++ b/egs/cifar/v1/image/get_allowed_lengths.py @@ -10,6 +10,7 @@ file is later used by make_features.py to pad each image sufficiently so that they all have an allowed length. This is intended for end2end chain training. 
""" +from __future__ import division import argparse import os @@ -117,14 +118,14 @@ def find_allowed_durations(start_len, end_len, args): (length // args.frame_subsampling_factor)) allowed_lengths.append(length) fp.write("{}\n".format(int(length))) - length *= args.factor + length = max(length * args.factor, length + args.frame_subsampling_factor) return allowed_lengths def main(): args = get_args() - args.factor = 1.0 + args.factor / 100.0 + args.factor = 1.0 + args.factor/100.0 image2length = read_kaldi_mapfile(os.path.join(args.srcdir, 'image2num_frames')) @@ -133,7 +134,7 @@ def main(): "Coverage rate: {}%".format(start_dur, end_dur, 100.0 - args.coverage_factor * 2)) logger.info("There will be {} unique allowed lengths " - "for the images.".format(int(math.log(end_dur / start_dur) / + "for the images.".format(int((math.log(float(end_dur)/start_dur))/ math.log(args.factor)))) allowed_durations = find_allowed_durations(start_dur, end_dur, args) diff --git a/egs/cifar/v1/image/matrix_to_image.py b/egs/cifar/v1/image/matrix_to_image.py index 52dcead7479..908b1f8b3ed 100755 --- a/egs/cifar/v1/image/matrix_to_image.py +++ b/egs/cifar/v1/image/matrix_to_image.py @@ -26,6 +26,7 @@ copy-feats --binary=false $(grep $imgid data/train/feats.scp | cut -d' ' -f2) - | \ image/matrix_to_image.py --color=1 > $imgid.bmp """ +from __future__ import division import argparse import sys @@ -59,7 +60,7 @@ num_cols = len(line) # initialize if len(line) != num_cols: raise Exception("All rows should be of the same length") - line = map(float, line) # string to float + line = [float(i) for i in line] # string to float if max(line) > 1: raise Excetion("Element value in the matrix should be normalized and no larger than 1") line = [int(x * 255) for x in line] # float to integer ranging from 0 to 255 @@ -70,7 +71,7 @@ if num_cols % 3 != 0: raise Exception("Number of columns should be a multiple of 3 in the color mode") width = num_rows - height = num_cols / 3 + height = num_cols/3 # reform the image matrix image_array = [[0 for i in range(width * 3)] for j in range(height)] for i in range(height): diff --git a/egs/madcat_ar/v1/local/make_features.py b/egs/cifar/v1/image/ocr/make_features.py similarity index 51% rename from egs/madcat_ar/v1/local/make_features.py rename to egs/cifar/v1/image/ocr/make_features.py index a21276d32c2..aa909f596c9 100755 --- a/egs/madcat_ar/v1/local/make_features.py +++ b/egs/cifar/v1/image/ocr/make_features.py @@ -2,27 +2,33 @@ # Copyright 2017 Chun Chieh Chang # 2017 Ashish Arora +# 2017 Yiwen Shao # 2018 Hossein Hadian +# 2018 Desh Raj """ This script converts images to Kaldi-format feature matrices. The input to this script is the path to a data directory, e.g. "data/train". This script reads the images listed in images.scp and writes them to standard output (by default) as Kaldi-formatted matrices (in text form). It also scales the images so they have the same height (via --feat-dim). It can optionally pad - the images (on left/right sides) with white pixels. + the images (on left/right sides) with white pixels. It by default performs + augmentation, (directly scaling down and scaling up). It will double the + data but we can turn augmentation off (via --no-augment). If an 'image2num_frames' file is found in the data dir, it will be used to enforce the images to have the specified length in that file by padding white pixels (the --padding option will be ignored in this case). This relates to end2end chain training. - eg. 
local/make_features.py data/train --feat-dim 40 """ - +import random import argparse import os import sys import numpy as np from scipy import misc +import math +from signal import signal, SIGPIPE, SIG_DFL +signal(SIGPIPE, SIG_DFL) parser = argparse.ArgumentParser(description="""Converts images (in 'dir'/images.scp) to features and writes them to standard output in text format.""") @@ -38,8 +44,15 @@ parser.add_argument('--padding', type=int, default=5, help='Number of white pixels to pad on the left' 'and right side of the image.') - - +parser.add_argument('--num-channels', type=int, default=1, + help='Number of color channels') +parser.add_argument('--vertical-shift', type=int, default=0, + help='total number of padding pixel per column') +parser.add_argument('--fliplr', type=lambda x: (str(x).lower()=='true'), default=False, + help="Flip the image left-right for right to left languages") +parser.add_argument('--augment_type', type=str, default='no_aug', + choices=['no_aug', 'random_scale','random_shift'], + help='Subset of data to process.') args = parser.parse_args() @@ -59,18 +72,6 @@ def write_kaldi_matrix(file_handle, matrix, key): file_handle.write("\n") file_handle.write(" ]\n") - -def get_scaled_image(im): - scale_size = args.feat_dim - sx = im.shape[1] # width - sy = im.shape[0] # height - scale = (1.0 * scale_size) / sy - nx = int(scale_size) - ny = int(scale * sx) - im = misc.imresize(im, (nx, ny)) - return im - - def horizontal_pad(im, allowed_lengths = None): if allowed_lengths is None: left_padding = right_padding = args.padding @@ -88,21 +89,73 @@ def horizontal_pad(im, allowed_lengths = None): left_padding = int(padding // 2) right_padding = padding - left_padding dim_y = im.shape[0] # height - im_pad = np.concatenate((255 * np.ones((dim_y, left_padding), - dtype=int), im), axis=1) - im_pad1 = np.concatenate((im_pad, 255 * np.ones((dim_y, right_padding), - dtype=int)), axis=1) + if args.num_channels in [1,4]: + im_pad = np.concatenate((255 * np.ones((dim_y, left_padding), + dtype=int), im), axis=1) + im_pad1 = np.concatenate((im_pad, 255 * np.ones((dim_y, right_padding), + dtype=int)), axis=1) + else: + im_pad = np.concatenate((255 * np.ones((dim_y, left_padding, args.num_channels), + dtype=int), im), axis=1) + im_pad1 = np.concatenate((im_pad, 255 * np.ones((dim_y, right_padding, args.num_channels), + dtype=int)), axis=1) return im_pad1 +def get_scaled_image_aug(im, mode='normal'): + scale_size = args.feat_dim + sx = im.shape[1] + sy = im.shape[0] + scale = (1.0 * scale_size) / sy + nx = int(scale_size) + ny = int(scale * sx) + scale_size = random.randint(10, 30) + scale = (1.0 * scale_size) / sy + down_nx = int(scale_size) + down_ny = int(scale * sx) + if mode == 'normal': + im = misc.imresize(im, (nx, ny)) + return im + else: + im_scaled_down = misc.imresize(im, (down_nx, down_ny)) + im_scaled_up = misc.imresize(im_scaled_down, (nx, ny)) + return im_scaled_up + return im -### main ### +def vertical_shift(im, mode='normal'): + if args.vertical_shift == 0: + return im + total = args.vertical_shift + if mode == 'notmid': + val = random.randint(0, 1) + if val == 0: + mode = 'top' + else: + mode = 'bottom' + if mode == 'normal': + top = int(total / 2) + bottom = total - top + elif mode == 'top': # more padding on top + top = random.randint(total / 2, total) + bottom = total - top + elif mode == 'bottom': # more padding on bottom + top = random.randint(0, total / 2) + bottom = total - top + width = im.shape[1] + im_pad = np.concatenate( + (255 * np.ones((top, width), 
dtype=int) - + np.random.normal(2, 1, (top, width)).astype(int), im), axis=0) + im_pad = np.concatenate( + (im_pad, 255 * np.ones((bottom, width), dtype=int) - + np.random.normal(2, 1, (bottom, width)).astype(int)), axis=0) + return im_pad +### main ### +random.seed(1) data_list_path = args.images_scp_path - if args.out_ark == '-': out_fh = sys.stdout else: - out_fh = open(args.out_ark,'wb') + out_fh = open(args.out_ark,'w') allowed_lengths = None allowed_len_handle = args.allowed_len_file_path @@ -123,13 +176,31 @@ def horizontal_pad(im, allowed_lengths = None): line_vect = line.split(' ') image_id = line_vect[0] image_path = line_vect[1] - im = misc.imread(image_path) - im_scaled = get_scaled_image(im) - im_horizontal_padded = horizontal_pad(im_scaled, allowed_lengths) - if im_horizontal_padded is None: + if args.num_channels == 4: + im = misc.imread(image_path, mode='L') + else: + im = misc.imread(image_path) + if args.fliplr: + im = np.fliplr(im) + if args.augment_type == 'no_aug' or 'random_shift': + im = get_scaled_image_aug(im, 'normal') + elif args.augment_type == 'random_scale': + im = get_scaled_image_aug(im, 'scaled') + im = horizontal_pad(im, allowed_lengths) + if im is None: num_fail += 1 continue - data = np.transpose(im_horizontal_padded, (1, 0)) + if args.augment_type == 'no_aug' or 'random_scale': + im = vertical_shift(im, 'normal') + elif args.augment_type == 'random_shift': + im = vertical_shift(im, 'notmid') + if args.num_channels in [1,4]: + data = np.transpose(im, (1, 0)) + elif args.num_channels == 3: + H = im.shape[0] + W = im.shape[1] + C = im.shape[2] + data = np.reshape(np.transpose(im, (1, 0, 2)), (W, H * C)) data = np.divide(data, 255.0) num_ok += 1 write_kaldi_matrix(out_fh, data, image_id) diff --git a/egs/cifar/v1/image/select_image_in_egs.py b/egs/cifar/v1/image/select_image_in_egs.py index 88d7d568e66..dbf48e6403d 100755 --- a/egs/cifar/v1/image/select_image_in_egs.py +++ b/egs/cifar/v1/image/select_image_in_egs.py @@ -9,6 +9,7 @@ # --vertical-shift=0.3 --srand=27 --num-channels=3 ark:exp/cifar10_egs/egs.1.ark ark,t:- | \ # image/select_image_in_egs.py $id | image/matrix_to_image.py --color 3 > $id.bmp +from __future__ import print_function import argparse import sys diff --git a/egs/cifar/v1/local/process_data.py b/egs/cifar/v1/local/process_data.py index 51173dafc6f..38a599297d2 100755 --- a/egs/cifar/v1/local/process_data.py +++ b/egs/cifar/v1/local/process_data.py @@ -6,6 +6,7 @@ """ This script prepares the training and test data for CIFAR-10 or CIFAR-100. 
""" +from __future__ import division import argparse import os @@ -14,13 +15,13 @@ parser = argparse.ArgumentParser(description="""Converts train/test data of CIFAR-10 or CIFAR-100 to Kaldi feature format""") -parser.add_argument('database', type=str, +parser.add_argument('database', default='data/dl/cifar-10-batches-bin', help='path to downloaded cifar data (binary version)') -parser.add_argument('dir', type=str, help='output dir') -parser.add_argument('--cifar-version', type=str, default='CIFAR-10', choices=['CIFAR-10', 'CIFAR-100']) -parser.add_argument('--dataset', type=str, default='train', choices=['train', 'test']) -parser.add_argument('--out-ark', type=str, default='-', help='where to write output feature data') +parser.add_argument('dir', help='output dir') +parser.add_argument('--cifar-version', default='CIFAR-10', choices=['CIFAR-10', 'CIFAR-100']) +parser.add_argument('--dataset', default='train', choices=['train', 'test']) +parser.add_argument('--out-ark', default='-', help='where to write output feature data') args = parser.parse_args() @@ -37,7 +38,7 @@ def load_cifar10_data_batch(datafile): for i in range(num_images_in_batch): label = ord(fh.read(1)) bin_img = fh.read(C * H * W) - img = [[[ord(byte) / 255.0 for byte in bin_img[channel*H*W+row*W:channel*H*W+(row+1)*W]] + img = [[[ord(byte)/255.0 for byte in bin_img[channel*H*W+row*W:channel*H*W+(row+1)*W]] for row in range(H)] for channel in range(C)] labels += [label] data += [img] @@ -52,7 +53,7 @@ def load_cifar100_data_batch(datafile, num_images_in_batch): coarse_label = ord(fh.read(1)) fine_label = ord(fh.read(1)) bin_img = fh.read(C * H * W) - img = [[[ord(byte) / 255.0 for byte in bin_img[channel*H*W+row*W:channel*H*W+(row+1)*W]] + img = [[[ord(byte)/255.0 for byte in bin_img[channel*H*W+row*W:channel*H*W+(row+1)*W]] for row in range(H)] for channel in range(C)] fine_labels += [fine_label] coarse_labels += [coarse_label] @@ -80,7 +81,7 @@ def write_kaldi_matrix(file_handle, matrix, key): if num_cols != len(matrix[row_index]): raise Exception("All the rows of a matrix are expected to " "have the same length") - file_handle.write(" ".join(map(lambda x: str(x), matrix[row_index]))) + file_handle.write(" ".join([str(x) for x in matrix[row_index]])) if row_index != num_rows - 1: file_handle.write("\n") file_handle.write(" ]\n") diff --git a/egs/cmu_cslu_kids/README b/egs/cmu_cslu_kids/README new file mode 100644 index 00000000000..0b8512e2487 --- /dev/null +++ b/egs/cmu_cslu_kids/README @@ -0,0 +1,21 @@ +This is an ASR recipe for children speech using cmu_kids and cslu_kids. +Both of the corpora can be found on LDC: + - cmu_kids : https://catalog.ldc.upenn.edu/LDC97S63 + - cslu_kids: https://catalog.ldc.upenn.edu/LDC2007S18 + +To run this recipe, you'll need a copy of both corpora: + ./run.sh --cmu_kids --cslu_kids + +By default, this recipe will download an LM pretrained on LibriSpeech from +lm_url=www.openslr.org/resources/11. If you already have a copy of this LM +and do not wish to redownload, you can specify the LM path using the --lm_src option: + ./run.sh --cmu_kids --cslu_kids \ + --lm_src + +This recipe will also download and clean CMU_Dict by default. 
If you have a clean copy +already, or wish to use your own dictionary, simply copy your version of the dict to + data/local/dict + +To run extra features for triphone models or VLTN, set the following options true: + ./run.sh --cmu_kids --cslu_kids \ + --vtln true --extra_features true diff --git a/egs/cmu_cslu_kids/s5/cmd.sh b/egs/cmu_cslu_kids/s5/cmd.sh new file mode 100644 index 00000000000..179307556d5 --- /dev/null +++ b/egs/cmu_cslu_kids/s5/cmd.sh @@ -0,0 +1,23 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd=queue.pl +export decode_cmd="queue.pl --mem 2G" +# the use of cuda_cmd is deprecated, used only in 'nnet1', +export cuda_cmd="queue.pl --gpu 1" + +if [[ "$(hostname -f)" == "*.fit.vutbr.cz" ]]; then + queue_conf=$HOME/queue_conf/default.conf # see example /homes/kazi/iveselyk/queue_conf/default.conf, + export train_cmd="queue.pl --config $queue_conf --mem 2G --matylda 0.2" + export decode_cmd="queue.pl --config $queue_conf --mem 3G --matylda 0.1" + export cuda_cmd="queue.pl --config $queue_conf --gpu 1 --mem 10G --tmp 40G" +fi diff --git a/egs/cmu_cslu_kids/s5/conf/decode.config b/egs/cmu_cslu_kids/s5/conf/decode.config new file mode 100644 index 00000000000..10b0eee900b --- /dev/null +++ b/egs/cmu_cslu_kids/s5/conf/decode.config @@ -0,0 +1,4 @@ +# Use wider-than-normal decoding beams for RM. +first_beam=16.0 +beam=20.0 +lattice_beam=10.0 diff --git a/egs/cmu_cslu_kids/s5/conf/decode_dnn.config b/egs/cmu_cslu_kids/s5/conf/decode_dnn.config new file mode 100644 index 00000000000..e7cfca74763 --- /dev/null +++ b/egs/cmu_cslu_kids/s5/conf/decode_dnn.config @@ -0,0 +1,8 @@ +# In RM, the optimal decode LMWT is in range 2..5, which is different from usual 10..15 +# (it is caused by using simple rule-based LM, instead of n-gram LM), +scoring_opts="--min-lmwt 2 --max-lmwt 10" +# Still, it is better to use --acwt 0.1, both for decoding and sMBR, +acwt=0.1 +# For this small task we can afford to have large beams, +beam=30.0 # beam for decoding. Was 13.0 in the scripts. +lattice_beam=18.0 # this has most effect on size of the lattices. diff --git a/egs/cmu_cslu_kids/s5/conf/mfcc.conf b/egs/cmu_cslu_kids/s5/conf/mfcc.conf new file mode 100644 index 00000000000..6bbcb763153 --- /dev/null +++ b/egs/cmu_cslu_kids/s5/conf/mfcc.conf @@ -0,0 +1,2 @@ +--use-energy=false # only non-default option. 
+--allow_downsample=true diff --git a/egs/cmu_cslu_kids/s5/conf/mfcc_hires.conf b/egs/cmu_cslu_kids/s5/conf/mfcc_hires.conf new file mode 100644 index 00000000000..40f95e97010 --- /dev/null +++ b/egs/cmu_cslu_kids/s5/conf/mfcc_hires.conf @@ -0,0 +1,11 @@ +# config for high-resolution MFCC features, intended for neural network training +# Note: we keep all cepstra, so it has the same info as filterbank features, +# but MFCC is more easily compressible (because less correlated) which is why +# we prefer this method. +--use-energy=false # use average of log energy, not energy. +--num-mel-bins=40 # similar to Google's setup. +--num-ceps=40 # there is no dimensionality reduction. +--low-freq=20 # low cutoff frequency for mel bins... this is high-bandwidth data, so + # there might be some information at the low end. +--high-freq=-400 # high cutoff frequently, relative to Nyquist of 8000 (=7600) +--allow-downsample=true diff --git a/egs/cmu_cslu_kids/s5/conf/online_cmvn.conf b/egs/cmu_cslu_kids/s5/conf/online_cmvn.conf new file mode 100644 index 00000000000..7748a4a4dd3 --- /dev/null +++ b/egs/cmu_cslu_kids/s5/conf/online_cmvn.conf @@ -0,0 +1 @@ +# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh diff --git a/egs/reverb/s5/conf/fbank.conf b/egs/cmu_cslu_kids/s5/conf/plp.conf similarity index 57% rename from egs/reverb/s5/conf/fbank.conf rename to egs/cmu_cslu_kids/s5/conf/plp.conf index c4b73674cab..e7e8a9e14af 100644 --- a/egs/reverb/s5/conf/fbank.conf +++ b/egs/cmu_cslu_kids/s5/conf/plp.conf @@ -1,2 +1,2 @@ # No non-default options for now. - +--allow_downsample=true diff --git a/egs/cmu_cslu_kids/s5/local/chain/compare_wer.sh b/egs/cmu_cslu_kids/s5/local/chain/compare_wer.sh new file mode 100755 index 00000000000..8ee5db2326a --- /dev/null +++ b/egs/cmu_cslu_kids/s5/local/chain/compare_wer.sh @@ -0,0 +1,137 @@ +#!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. local/chain/compare_wer.sh exp/chain/tdnn_{c,d}_sp +# For use with discriminatively trained systems you specify the epochs after a colon: +# for instance, +# local/chain/compare_wer.sh exp/chain/tdnn_c_sp exp/chain/tdnn_c_sp_smbr:{1,2,3} + + +if [ $# == 0 ]; then + echo "Usage: $0: [--looped] [--online] [ ... ]" + echo "e.g.: $0 exp/chain/tdnn_{b,c}_sp" + echo "or (with epoch numbers for discriminative training):" + echo "$0 exp/chain/tdnn_b_sp_disc:{1,2,3}" + exit 1 +fi + +echo "# $0 $*" + +include_looped=false +if [ "$1" == "--looped" ]; then + include_looped=true + shift +fi +include_online=false +if [ "$1" == "--online" ]; then + include_online=true + shift +fi + + +used_epochs=false + +# this function set_names is used to separate the epoch-related parts of the name +# [for discriminative training] and the regular parts of the name. 
+# If called with a colon-free directory name, like: +# set_names exp/chain/tdnn_lstm1e_sp_bi_smbr +# it will set dir=exp/chain/tdnn_lstm1e_sp_bi_smbr and epoch_infix="" +# If called with something like: +# set_names exp/chain/tdnn_d_sp_smbr:3 +# it will set dir=exp/chain/tdnn_d_sp_smbr and epoch_infix="_epoch3" + + +set_names() { + if [ $# != 1 ]; then + echo "compare_wer_general.sh: internal error" + exit 1 # exit the program + fi + dirname=$(echo $1 | cut -d: -f1) + epoch=$(echo $1 | cut -s -d: -f2) + if [ -z $epoch ]; then + epoch_infix="" + else + used_epochs=true + epoch_infix=_epoch${epoch} + fi +} + + + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +strings=( + "#WER dev_clean_2 (tgsmall) " + "#WER dev_clean_2 (tglarge) ") + +for n in 0 1; do + echo -n "${strings[$n]}" + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + decode_names=(tgsmall_dev_clean_2 tglarge_dev_clean_2) + + wer=$(cat $dirname/decode_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + if $include_looped; then + echo -n "# [looped:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + wer=$(cat $dirname/decode_looped_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi + if $include_online; then + echo -n "# [online:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + wer=$(cat ${dirname}_online/decode_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi +done + + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. +fi + + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train prob (xent)" +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob (xent)" +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Num-params " +for x in $*; do + printf "% 10s" $(grep num-parameters $x/log/progress.1.log | awk '{print $2}') +done +echo diff --git a/egs/cmu_cslu_kids/s5/local/chain/run_tdnnf.sh b/egs/cmu_cslu_kids/s5/local/chain/run_tdnnf.sh new file mode 120000 index 00000000000..34499362831 --- /dev/null +++ b/egs/cmu_cslu_kids/s5/local/chain/run_tdnnf.sh @@ -0,0 +1 @@ +tuning/run_tdnn_1a.sh \ No newline at end of file diff --git a/egs/cmu_cslu_kids/s5/local/chain/tdnnf_decode.sh b/egs/cmu_cslu_kids/s5/local/chain/tdnnf_decode.sh new file mode 100755 index 00000000000..8d124193584 --- /dev/null +++ b/egs/cmu_cslu_kids/s5/local/chain/tdnnf_decode.sh @@ -0,0 +1,82 @@ +#! /bin/bash + +# Copyright Johns Hopkins University +# 2019 Fei Wu + +# Decode on new data set using trained model. +# The data directory should be prepared in kaldi style. +# Usage: +# ./local/chain/tdnnF_decode.sh --data_src + +set -euo pipefail +echo "$0 $@" + +stage=0 +decode_nj=10 +data_src= +affix= +tree_affix= +nnet3_affix= + +. ./cmd.sh +. ./path.sh +. 
./utils/parse_options.sh + +if ! cuda-compiled; then + cat </dev/null || true + + ( + nspk=$(wc -l <$data_hires/spk2utt) + steps/nnet3/decode.sh \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nspk --cmd "$decode_cmd" --num-threads 4 \ + --online-ivector-dir $ivect_dir \ + $tree_dir/graph_tgsmall $data_hires ${dir}/decode_tgsmall_$data_name || exit 1 + + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_test_{tgsmall,tglarge} \ + $data_hires ${dir}/decode_{tgsmall,tglarge}_$data_name || exit 1 + ) || touch $dir/.error & + + wait + [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 +fi + diff --git a/egs/cmu_cslu_kids/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/cmu_cslu_kids/s5/local/chain/tuning/run_tdnn_1a.sh new file mode 100755 index 00000000000..51e0123d0f2 --- /dev/null +++ b/egs/cmu_cslu_kids/s5/local/chain/tuning/run_tdnn_1a.sh @@ -0,0 +1,279 @@ +#!/bin/bash + +# Copyright 2017-2018 Johns Hopkins University (author: Daniel Povey) +# 2017-2018 Yiming Wang +# 2019 Fei Wu + +# Based on material recipe for low-resource languages +# Factored TDNN with skip connectiong and splicing (two bottle neck layers) + +# WER results on dev +# Model LM Corpus WER(%) +# tdnn_1a tg_large Combined 11.72 +# tdnn_1a tg_small Combined 13.61 +# tdnn_1a tg_large CMU_Kids 17.26 +# tdnn_1a tg_small CMU_Kids 26.43 +# tdnn_1a tg_large CSLU_Kids 10.80 +# tdnn_1a tg_small CSLU_Kids 12.50 + +# steps/info/chain_dir_info.pl exp/chain/tdnn1a_sp +# exp/chain/tdnn1a_sp/: num-iters=342 nj=2..5 num-params=17.9M dim=40+100->3192 combine=-0.042->-0.041 (over 8) xent:train/valid[227,341,final]=(-0.451,-0.363,-0.346/-0.524,-0.466,-0.434) logprob:train/valid[227,341,final]=(-0.047,-0.043,-0.042/-0.058,-0.056,-0.054) + +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=10 +train_set=train +test_sets="test" +gmm=tri3 +nnet3_affix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=1a +tree_affix= +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +# training chunk-options +chunk_width=140,100,160 +dropout_schedule='0,0@0.20,0.3@0.50,0' +common_egs_dir= +xent_regularize=0.1 + +# training options +srand=0 +remove_egs=true +reporting_email= + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 8 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + + +if [ $stage -le 10 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + +if [ $stage -le 11 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.004 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true" + linear_opts="orthonormal-constraint=-1.0 l2-regularize=0.004" + output_opts="l2-regularize=0.002" + + mkdir -p $dir/configs + + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $opts dim=1024 + linear-component name=tdnn2l0 dim=256 $linear_opts input=Append(-1,0) + linear-component name=tdnn2l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-dropout-layer name=tdnn2 $opts input=Append(0,1) dim=1024 + linear-component name=tdnn3l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-dropout-layer name=tdnn3 $opts dim=1024 input=Append(0,1) + linear-component name=tdnn4l0 dim=256 $linear_opts input=Append(-1,0) + linear-component name=tdnn4l dim=256 $linear_opts input=Append(0,1) + relu-batchnorm-dropout-layer name=tdnn4 $opts input=Append(0,1) dim=1024 + linear-component name=tdnn5l dim=256 $linear_opts + relu-batchnorm-dropout-layer name=tdnn5 $opts dim=1024 input=Append(0, tdnn3l) + linear-component name=tdnn6l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn6l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn6 $opts input=Append(0,3) dim=1280 + linear-component name=tdnn7l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn7l dim=256 $linear_opts input=Append(0,3) + relu-batchnorm-dropout-layer name=tdnn7 $opts input=Append(0,3,tdnn6l,tdnn4l,tdnn2l) dim=1024 + linear-component name=tdnn8l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn8l dim=256 $linear_opts input=Append(0,3) + relu-batchnorm-dropout-layer name=tdnn8 $opts input=Append(0,3) dim=1280 + linear-component name=tdnn9l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn9l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn9 $opts input=Append(0,3,tdnn8l,tdnn6l,tdnn5l) dim=1024 + linear-component name=tdnn10l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn10l dim=256 $linear_opts input=Append(0,3) + relu-batchnorm-dropout-layer name=tdnn10 $opts input=Append(0,3) dim=1280 + linear-component name=tdnn11l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn11l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn11 $opts input=Append(0,3,tdnn10l,tdnn9l,tdnn7l) dim=1024 + linear-component name=prefinal-l dim=256 $linear_opts + + relu-batchnorm-layer name=prefinal-chain input=prefinal-l $opts dim=1280 + linear-component name=prefinal-chain-l dim=256 $linear_opts + 
batchnorm-component name=prefinal-chain-batchnorm + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + relu-batchnorm-layer name=prefinal-xent input=prefinal-l $opts dim=1280 + linear-component name=prefinal-xent-l dim=256 $linear_opts + batchnorm-component name=prefinal-xent-batchnorm + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts + +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + +fi + + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=20 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.002 \ + --trainer.optimization.final-effective-lrate=0.0002 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). 
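  # A hedged sketch of what this stage yields (assumed paths, not verified
  # output): mkgraph.sh composes the tgsmall grammar, lexicon and model
  # topology into an HCLG decoding graph, so afterwards one expects
  #   $tree_dir/graph_tgsmall/HCLG.fst   # decoding graph read by the decode stage below
  #   $tree_dir/graph_tgsmall/words.txt  # word symbol table used in scoring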
+ utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if [ $stage -le 14 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l tmp + cut -f 3- < tmp > out + + tr '[:lower:]' '[:upper:]' < out > tmp + tr -d '[:cntrl:]' < tmp > out + sent=$( out + tr '[:lower:]' '[:upper:]' < tmp > out + trans=$(> $data/$target/utt2spk + echo "$uttID $KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe -f wav -p -c 1 $utt|" >> $data/$target/wav.scp + echo "$spkID f" >> $data/$target/spk2gender + echo "$uttID $sent" >> $data/$target/text + fi + done + fi + fi +done + +for d in $data/train $data/test; do + utils/utt2spk_to_spk2utt.pl $d/utt2spk > $d/spk2utt + utils/fix_data_dir.sh $d +done + +printf "\t total: %s; train: %s; test: %s.\n" "$total_cnt" "$train_cnt" "$test_cnt" +rm -f out tmp + +# Optional +# Get data duration, just for book keeping +# for data in $data/train $data/test; do +# ./local/data_duration.sh $data +# done +# + diff --git a/egs/cmu_cslu_kids/s5/local/cslu_aud_prep.sh b/egs/cmu_cslu_kids/s5/local/cslu_aud_prep.sh new file mode 100755 index 00000000000..735f87eca9f --- /dev/null +++ b/egs/cmu_cslu_kids/s5/local/cslu_aud_prep.sh @@ -0,0 +1,43 @@ +#/bin/bash + +# Copyright Johns Hopkins University +# 2019 Fei Wu + +# Called by local/cslu_DataPrep.shi + +Assignment() +{ + rnd=$((1+RANDOM % 100)) + if [ $rnd -le $test_percentage ]; then + target="test" + else + target="train" + fi +} +audio= +test_percentage=30 # Percent of data reserved as test set +debug=debug/cslu_dataprep_debug +data=data/data_cslu +. ./utils/parse_options.sh + +uttID=$(basename $audio) +uttID=${uttID%'.wav'} +sentID=${uttID: -3} +spkID=${uttID%$sentID} +sentID=${sentID%"0"} +sentID=$(echo "$sentID" | tr '[:lower:]' '[:upper:]' ) + +line=$(grep $sentID cslu/docs/all.map) + +if [ -z "$line" ]; then # Can't map utterance to transcript + echo $audio $sentID >> $debug +else + txt=$(echo $line | grep -oP '"\K.*?(?=")') + cap_txt=${txt^^} + Assignment + echo "$uttID $cap_txt" >> $data/$target/text + echo "$uttID $spkID" >> $data/$target/utt2spk + echo "$spkID f" >> $data/$target/spk2gender + echo "$uttID $audio" >> $data/$target/wav.scp +fi + diff --git a/egs/cmu_cslu_kids/s5/local/cslu_prepare_data.sh b/egs/cmu_cslu_kids/s5/local/cslu_prepare_data.sh new file mode 100755 index 00000000000..621179079b3 --- /dev/null +++ b/egs/cmu_cslu_kids/s5/local/cslu_prepare_data.sh @@ -0,0 +1,49 @@ +#! /bin/bash + +# Copyright Johns Hopkins University +# 2019 Fei Wu + +# Prepares cslu_kids +# Should be run from egs/cmu_csli_kids + +set -e +Looper() +{ + # echo "Looping through $1" + for f in $1/*; do + if [ -d $f ]; then + Looper $f + else + ./local/cslu_aud_prep.sh --data $data --audio $f + fi + done +} + +data=data/data_cslu +corpus=cslu +. ./utils/parse_options.sh + +rm -f debug/cslu_dataprep_debug +mkdir -p debug +# File check, remove previous data and features files +for d in $data/test $data/train; do + mkdir -p $d + ./local/file_check.sh $d +done + +echo "Preparing cslu_kids..." +Looper $corpus/speech/scripted + +for d in $data/test $data/train; do + ./utils/utt2spk_to_spk2utt.pl $d + ./utils/fix_data_dir.sh $d +done +if [ -f debug/cslu_dataprep_debug ]; then + echo "Missing transcripts for some utterances. 
See cslu_dataprep_debug" +fi + +# Optional +# Get data duration, just for book keeping +# for data in data/data_cslu/test data/data_cslu/train; do +# ./local/data_duration.sh $data +# done diff --git a/egs/cmu_cslu_kids/s5/local/data_duration.sh b/egs/cmu_cslu_kids/s5/local/data_duration.sh new file mode 100755 index 00000000000..e838e365ea7 --- /dev/null +++ b/egs/cmu_cslu_kids/s5/local/data_duration.sh @@ -0,0 +1,19 @@ +#! /bin/bash + +# Copyright Johns Hopkins University +# 2019 Fei Wu + +# Get duration of the utterance given data dir +set -eu +echo $0 $@ + +data_dir=$1 +mkdir -p duration + +./utils/data/get_utt2dur.sh $data_dir + +echo "$data_dir" +python local/sum_duration.py $data_dir/utt2dur +echo "" + + diff --git a/egs/cmu_cslu_kids/s5/local/download_cmu_dict.sh b/egs/cmu_cslu_kids/s5/local/download_cmu_dict.sh new file mode 100755 index 00000000000..0248dd0cae1 --- /dev/null +++ b/egs/cmu_cslu_kids/s5/local/download_cmu_dict.sh @@ -0,0 +1,37 @@ +#!/bin/bash +# Copyright 2019 Fei Wu +set -eu +# Adapted from the local/prepare_dict script in +# the librispeech recipe. Download and prepare CMU_dict. +# For childresn speech ASR tasks, since the vocabulary in cmu_kids and +# cslu_kids is relatively easy comparing to librispeech, we use only the +# CMU_dict, and do not handle OOV with G2P. +# Should be run from egs/cmu_cslu_kids. +# Usage: +# local/download_cmu_dict.sh --dict_dir + +dict_dir=data/local/dict +OOV="" + +. ./utils/parse_options.sh || exit 1; +. ./path.sh || exit 1 + +if [ ! -d $dict_dir ]; then + echo "Downloading and preparing CMU dict" + svn co -r 12440 https://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict $dict_dir/raw_dict || exit 1; + + echo "Removing the pronunciation variant markers ..." + grep -v ';;;' $dict_dir/raw_dict/cmudict.0.7a | \ + perl -ane 'if(!m:^;;;:){ s:(\S+)\(\d+\) :$1 :; print; }' | \ + sort -u > $dict_dir/lexicon.txt || exit 1; + + tr -d '\r' < $dict_dir/raw_dict/cmudict.0.7a.symbols > $dict_dir/nonsilence_phones.txt + + echo "$OOV SIL" >> $dict_dir/lexicon.txt + + echo "SIL" > $dict_dir/silence_phones.txt + echo "SPN" >> $dict_dir/silence_phones.txt + echo "SIL" > $dict_dir/optional_silence.txt + + rm -rf $dict_dir/raw_dict +fi diff --git a/egs/cmu_cslu_kids/s5/local/download_lm.sh b/egs/cmu_cslu_kids/s5/local/download_lm.sh new file mode 100755 index 00000000000..382f313df7c --- /dev/null +++ b/egs/cmu_cslu_kids/s5/local/download_lm.sh @@ -0,0 +1,76 @@ +#!/bin/bash + +# Copyright 2014 Vassil Panayotov +# Apache 2.0 + +if [ $# -ne "2" ]; then + echo "Usage: $0 " + echo "e.g.: $0 http://www.openslr.org/resources/11 data/local/lm" + exit 1 +fi + +base_url=$1 +dst_dir=$2 + +# given a filename returns the corresponding file size in bytes +# The switch cases below can be autogenerated by entering the data directory and running: +# for f in *; do echo "\"$f\") echo \"$(du -b $f | awk '{print $1}')\";;"; done +function filesize() { + case $1 in + "3-gram.arpa.gz") echo "759636181";; + "3-gram.pruned.1e-7.arpa.gz") echo "34094057";; + "3-gram.pruned.3e-7.arpa.gz") echo "13654242";; + "4-gram.arpa.gz") echo "1355172078";; + "g2p-model-5") echo "20098243";; + "librispeech-lexicon.txt") echo "5627653";; + "librispeech-lm-corpus.tgz") echo "1803499244";; + "librispeech-lm-norm.txt.gz") echo "1507274412";; + "librispeech-vocab.txt") echo "1737588";; + *) echo "";; + esac +} + +function check_and_download () { + [[ $# -eq 1 ]] || { echo "check_and_download() expects exactly one argument!"; return 1; } + fname=$1 + echo "Downloading file '$fname' into 
'$dst_dir'..." + expect_size="$(filesize $fname)" + [[ ! -z "$expect_size" ]] || { echo "Unknown file size for '$fname'"; return 1; } + if [[ -s $dst_dir/$fname ]]; then + # In the following statement, the first version works on linux, and the part + # after '||' works on Linux. + f=$dst_dir/$fname + fsize=$(set -o pipefail; du -b $f 2>/dev/null | awk '{print $1}' || stat '-f %z' $f) + if [[ "$fsize" -eq "$expect_size" ]]; then + echo "'$fname' already exists and appears to be complete" + return 0 + else + echo "WARNING: '$fname' exists, but the size is wrong - re-downloading ..." + fi + fi + wget --no-check-certificate -O $dst_dir/$fname $base_url/$fname || { + echo "Error while trying to download $fname!" + return 1 + } + f=$dst_dir/$fname + # In the following statement, the first version works on linux, and the part after '||' + # works on Linux. + fsize=$(set -o pipefail; du -b $f 2>/dev/null | awk '{print $1}' || stat '-f %z' $f) + [[ "$fsize" -eq "$expect_size" ]] || { echo "$fname: file size mismatch!"; return 1; } + return 0 +} + +mkdir -p $dst_dir + +for f in 3-gram.arpa.gz 3-gram.pruned.1e-7.arpa.gz 3-gram.pruned.3e-7.arpa.gz 4-gram.arpa.gz \ + g2p-model-5 librispeech-lm-corpus.tgz librispeech-vocab.txt librispeech-lexicon.txt; do + check_and_download $f || exit 1 +done + +cd $dst_dir +ln -sf 3-gram.pruned.1e-7.arpa.gz lm_tgmed.arpa.gz +ln -sf 3-gram.pruned.3e-7.arpa.gz lm_tgsmall.arpa.gz +ln -sf 3-gram.arpa.gz lm_tglarge.arpa.gz +ln -sf 4-gram.arpa.gz lm_fglarge.arpa.gz + +exit 0 diff --git a/egs/cmu_cslu_kids/s5/local/file_check.sh b/egs/cmu_cslu_kids/s5/local/file_check.sh new file mode 100755 index 00000000000..859f228058a --- /dev/null +++ b/egs/cmu_cslu_kids/s5/local/file_check.sh @@ -0,0 +1,17 @@ +#! /bin/bash + +# Copyright Johns Hopkins University +# 2019 Fei Wu + + +printf "\t File Check in folder: %s.\n" "$1" + +WavScp="$1/wav.scp" +Text="$1/text" +Utt2Spk="$1/utt2spk" +Gend="$1/utt2gender" +Spk2Utt="$1/spk2utt" +rm -f $WavScp $Text $Utt2Spk $Gend $Spk2Utt + + + diff --git a/egs/cmu_cslu_kids/s5/local/format_lms.sh b/egs/cmu_cslu_kids/s5/local/format_lms.sh new file mode 100755 index 00000000000..b530f61d2d9 --- /dev/null +++ b/egs/cmu_cslu_kids/s5/local/format_lms.sh @@ -0,0 +1,60 @@ +#!/bin/bash + +# Copyright 2014 Vassil Panayotov +# Apache 2.0 + +# Prepares the test time language model(G) transducers +# (adapted from wsj/s5/local/wsj_format_data.sh) + +. ./path.sh || exit 1; + +# begin configuration section +src_dir=data/lang +# end configuration section + +. utils/parse_options.sh || exit 1; + +set -e + +if [ $# -ne 1 ]; then + echo "Usage: $0 " + echo "e.g.: $0 /export/a15/vpanayotov/data/lm" + echo ", where:" + echo " is the directory in which the language model is stored/downloaded" + echo "Options:" + echo " --src-dir
# source lang directory, default data/lang" + exit 1 +fi + +lm_dir=$1 + +if [ ! -d $lm_dir ]; then + echo "$0: expected source LM directory $lm_dir to exist" + exit 1; +fi +if [ ! -f $src_dir/words.txt ]; then + echo "$0: expected $src_dir/words.txt to exist." + exit 1; +fi + + +tmpdir=data/local/lm_tmp.$$ +trap "rm -r $tmpdir" EXIT + +mkdir -p $tmpdir + +for lm_suffix in tgsmall tgmed; do + # tglarge is prepared by a separate command, called from run.sh; we don't + # want to compile G.fst for tglarge, as it takes a while. + test=${src_dir}_test_${lm_suffix} + mkdir -p $test + cp -r ${src_dir}/* $test + gunzip -c $lm_dir/lm_${lm_suffix}.arpa.gz | \ + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$test/words.txt - $test/G.fst + utils/validate_lang.pl --skip-determinization-check $test || exit 1; +done + +echo "Succeeded in formatting data." + +exit 0 diff --git a/egs/cmu_cslu_kids/s5/local/make_lm.pl b/egs/cmu_cslu_kids/s5/local/make_lm.pl new file mode 100755 index 00000000000..80eea5a6198 --- /dev/null +++ b/egs/cmu_cslu_kids/s5/local/make_lm.pl @@ -0,0 +1,119 @@ +#!/usr/bin/env perl + +# Copyright 2010-2011 Yanmin Qian Microsoft Corporation + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + +# This file takes as input the file wp_gram.txt that comes with the RM +# distribution, and creates the language model as an acceptor in FST form. 
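# A hedged sketch of the output format (inferred from the print statements
# below, not a spec): each arc comes out in OpenFst text form,
#   src-state dst-state input-word output-word weight
# followed by a final-state line; weights are negative log probabilities,
# e.g. a state with four equally likely successors gets arcs of weight
# -log(1/4) ~= 1.386. Input and output labels coincide because G is an
# acceptor.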
+ +# make_rm_lm.pl wp_gram.txt > G.txt + +if (@ARGV != 1) { + print "usage: make_rm_lm.pl wp_gram.txt > G.txt\n"; + exit(0); +} +unless (open(IN_FILE, "@ARGV[0]")) { + die ("can't open @ARGV[0]"); +} + + +$flag = 0; +$count_wrd = 0; +$cnt_ends = 0; +$init = ""; + +while ($line = ) +{ + chop($line); # Return the last char + + $line =~ s/ //g; # Selete all spaces + + if(($line =~ /^>/)) # If line has ">" + { + if($flag == 0) # Flip flag + { + $flag = 1; + } + $line =~ s/>//g; # Delete ">" + $hashcnt{$init} = $i; + $init = $line; + $i = 0; + $count_wrd++; + @LineArray[$count_wrd - 1] = $init; + $hashwrd{$init} = 0; + } + elsif($flag != 0) + { + + $hash{$init}[$i] = $line; + $i++; + if($line =~ /SENTENCE-END/) + { + $cnt_ends++; + } + } + else + {} +} + +$hashcnt{$init} = $i; + +$num = 0; +$weight = 0; +$init_wrd = "SENTENCE-END"; +$hashwrd{$init_wrd} = @LineArray; +for($i = 0; $i < $hashcnt{$init_wrd}; $i++) +{ + $weight = -log(1/$hashcnt{$init_wrd}); + $hashwrd{$hash{$init_wrd}[$i]} = $i + 1; + print "0 $hashwrd{$hash{$init_wrd}[$i]} $hash{$init_wrd}[$i] $hash{$init_wrd}[$i] $weight\n"; +} +$num = $i; + +for($i = 0; $i < @LineArray; $i++) +{ + if(@LineArray[$i] eq 'SENTENCE-END') + {} + else + { + if($hashwrd{@LineArray[$i]} == 0) + { + $num++; + $hashwrd{@LineArray[$i]} = $num; + } + for($j = 0; $j < $hashcnt{@LineArray[$i]}; $j++) + { + $weight = -log(1/$hashcnt{@LineArray[$i]}); + if($hashwrd{$hash{@LineArray[$i]}[$j]} == 0) + { + $num++; + $hashwrd{$hash{@LineArray[$i]}[$j]} = $num; + } + if($hash{@LineArray[$i]}[$j] eq 'SENTENCE-END') + { + print "$hashwrd{@LineArray[$i]} $hashwrd{$hash{@LineArray[$i]}[$j]} $weight\n" + } + else + { + print "$hashwrd{@LineArray[$i]} $hashwrd{$hash{@LineArray[$i]}[$j]} $hash{@LineArray[$i]}[$j] $hash{@LineArray[$i]}[$j] $weight\n"; + } + } + } +} + +print "$hashwrd{$init_wrd} 0\n"; +close(IN_FILE); + + diff --git a/egs/cmu_cslu_kids/s5/local/nnet3/compare_wer.sh b/egs/cmu_cslu_kids/s5/local/nnet3/compare_wer.sh new file mode 100755 index 00000000000..095e85cc338 --- /dev/null +++ b/egs/cmu_cslu_kids/s5/local/nnet3/compare_wer.sh @@ -0,0 +1,132 @@ +#!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. local/chain/compare_wer.sh exp/chain/tdnn_{c,d}_sp +# For use with discriminatively trained systems you specify the epochs after a colon: +# for instance, +# local/chain/compare_wer.sh exp/chain/tdnn_c_sp exp/chain/tdnn_c_sp_smbr:{1,2,3} + + +if [ $# == 0 ]; then + echo "Usage: $0: [--looped] [--online] [ ... ]" + echo "e.g.: $0 exp/chain/tdnn_{b,c}_sp" + echo "or (with epoch numbers for discriminative training):" + echo "$0 exp/chain/tdnn_b_sp_disc:{1,2,3}" + exit 1 +fi + +echo "# $0 $*" + +include_looped=false +if [ "$1" == "--looped" ]; then + include_looped=true + shift +fi +include_online=false +if [ "$1" == "--online" ]; then + include_online=true + shift +fi + + +used_epochs=false + +# this function set_names is used to separate the epoch-related parts of the name +# [for discriminative training] and the regular parts of the name. 
+# If called with a colon-free directory name, like: +# set_names exp/chain/tdnn_lstm1e_sp_bi_smbr +# it will set dir=exp/chain/tdnn_lstm1e_sp_bi_smbr and epoch_infix="" +# If called with something like: +# set_names exp/chain/tdnn_d_sp_smbr:3 +# it will set dir=exp/chain/tdnn_d_sp_smbr and epoch_infix="_epoch3" + + +set_names() { + if [ $# != 1 ]; then + echo "compare_wer_general.sh: internal error" + exit 1 # exit the program + fi + dirname=$(echo $1 | cut -d: -f1) + epoch=$(echo $1 | cut -s -d: -f2) + if [ -z $epoch ]; then + epoch_infix="" + else + used_epochs=true + epoch_infix=_epoch${epoch} + fi +} + + + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +strings=( + "#WER dev_clean_2 (tgsmall) " + "#WER dev_clean_2 (tglarge) ") + +for n in 0 1; do + echo -n "${strings[$n]}" + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + decode_names=(tgsmall_dev_clean_2 tglarge_dev_clean_2) + + wer=$(cat $dirname/decode_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + if $include_looped; then + echo -n "# [looped:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + wer=$(cat $dirname/decode_looped_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi + if $include_online; then + echo -n "# [online:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + wer=$(cat ${dirname}_online/decode_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi +done + + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. +fi + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.{final,combined}.log 2>/dev/null | grep log-like | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.{final,combined}.log 2>/dev/null | grep log-like | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train acc " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.{final,combined}.log 2>/dev/null | grep accuracy | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid acc " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.{final,combined}.log 2>/dev/null | grep accuracy | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo diff --git a/egs/cmu_cslu_kids/s5/local/nnet3/run_ivector_common.sh b/egs/cmu_cslu_kids/s5/local/nnet3/run_ivector_common.sh new file mode 100755 index 00000000000..c695f2c9f1c --- /dev/null +++ b/egs/cmu_cslu_kids/s5/local/nnet3/run_ivector_common.sh @@ -0,0 +1,148 @@ +#!/bin/bash + +set -euo pipefail + +# This script is called from local/nnet3/run_tdnn.sh and +# local/chain/run_tdnn.sh (and may eventually be called by more +# scripts). It contains the common feature preparation and +# iVector-related parts of the script. See those scripts for examples +# of usage. + +stage=0 +train_set=train +test_sets="test" +gmm=tri3b + +nnet3_affix= + +. ./cmd.sh +. ./path.sh +. utils/parse_options.sh + +gmm_dir=exp/${gmm} +ali_dir=exp/${gmm}_ali_${train_set}_sp + +for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do + if [ ! 
-f $f ]; then + echo "$0: expected file $f to exist" + exit 1 + fi +done + +if [ $stage -le 1 ]; then + # Although the nnet will be trained by high resolution data, we still have to + # perturb the normal data to get the alignment _sp stands for speed-perturbed + echo "$0: preparing directory for low-resolution speed-perturbed data (for alignment)" + utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp + echo "$0: making MFCC features for low-resolution speed-perturbed data" + steps/make_mfcc.sh --cmd "$train_cmd" --nj 10 data/${train_set}_sp || exit 1; + steps/compute_cmvn_stats.sh data/${train_set}_sp || exit 1; + utils/fix_data_dir.sh data/${train_set}_sp +fi + +if [ $stage -le 2 ]; then + echo "$0: aligning with the perturbed low-resolution data" + steps/align_fmllr.sh --nj 20 --cmd "$train_cmd" \ + data/${train_set}_sp data/lang $gmm_dir $ali_dir || exit 1 +fi + +if [ $stage -le 3 ]; then + # Create high-resolution MFCC features (with 40 cepstra instead of 13). + # this shows how you can split across multiple file-systems. + echo "$0: creating high-resolution MFCC features" + mfccdir=data/${train_set}_sp_hires/data + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + utils/create_split_dir.pl /export/fs0{1,2}/$USER/kaldi-data/mfcc/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage + fi + + for datadir in ${train_set}_sp ${test_sets}; do + utils/copy_data_dir.sh data/$datadir data/${datadir}_hires + done + + # do volume-perturbation on the training data prior to extracting hires + # features; this helps make trained nnets more invariant to test data volume. + utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires || exit 1; + + for datadir in ${train_set}_sp ${test_sets}; do + steps/make_mfcc.sh --nj 10 --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${datadir}_hires || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_hires || exit 1; + utils/fix_data_dir.sh data/${datadir}_hires || exit 1; + done +fi + +if [ $stage -le 4 ]; then + echo "$0: computing a subset of data to train the diagonal UBM." + # We'll use about a quarter of the data. + mkdir -p exp/nnet3${nnet3_affix}/diag_ubm + temp_data_root=exp/nnet3${nnet3_affix}/diag_ubm + + num_utts_total=$(wc -l 2041 combine=-0.47->-0.38 loglike:train/valid[20,31,combined]=(-0.62,-0.38,-0.37/-1.03,-1.03,-1.02) accuracy:train/valid[20,31,combined]=(0.79,0.87,0.87/0.70,0.72,0.72) + +# Below, comparing with the chain TDNN system. It's a little better with the +# small-vocab decoding. Both systems are probably super-badly tuned, and the +# chain system probably used too many jobs. +# +# local/nnet3/compare_wer.sh exp/chain/tdnn1a_sp exp/nnet3/tdnn_lstm1a_sp +# System tdnn1a_sp tdnn_lstm1a_sp +#WER dev_clean_2 (tgsmall) 18.43 17.37 +#WER dev_clean_2 (tglarge) 13.15 13.43 +# Final train prob -0.3933 +# Final valid prob -0.9662 +# Final train acc 0.8652 +# Final valid acc 0.7206 + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +decode_nj=10 +train_set=train_clean_5 +test_sets=dev_clean_2 +gmm=tri3b +nnet3_affix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. 
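# A minimal usage sketch (assumed, not from the patch): utils/parse_options.sh
# maps a command-line option such as --decode-nj 8 onto the shell variable
# decode_nj, so any variable in this config section can be overridden when the
# script is invoked, e.g. (placeholder path and values):
#   local/nnet3/tuning/run_tdnn_lstm_1a.sh --stage 12 --decode-nj 8 \
#     --train-set train_clean_5 --gmm tri3b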
+affix=1a # affix for the TDNN directory name +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +# training options +# training chunk-options +chunk_width=40,30,20 +chunk_left_context=40 +chunk_right_context=0 +common_egs_dir= +xent_regularize=0.1 + +# training options +srand=0 +remove_egs=true +reporting_email= + +#decode options +test_online_decoding=true # if true, it will run the last decoding stage. + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda delay=$label_delay input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + relu-renorm-layer name=tdnn1 dim=520 + relu-renorm-layer name=tdnn2 dim=520 input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=520 recurrent-projection-dim=130 non-recurrent-projection-dim=130 decay-time=20 delay=-3 + relu-renorm-layer name=tdnn3 dim=520 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=520 input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=520 recurrent-projection-dim=130 non-recurrent-projection-dim=130 decay-time=20 delay=-3 + relu-renorm-layer name=tdnn5 dim=520 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=520 input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=520 recurrent-projection-dim=130 non-recurrent-projection-dim=130 decay-time=20 delay=-3 + + output-layer name=output input=lstm3 output-delay=$label_delay dim=$num_targets max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 11 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/train_rnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=6 \ + --trainer.deriv-truncate-margin=10 \ + --trainer.samples-per-iter=20000 \ + --trainer.optimization.num-jobs-initial=1 \ + --trainer.optimization.num-jobs-final=2 \ + --trainer.optimization.initial-effective-lrate=0.0003 \ + --trainer.optimization.final-effective-lrate=0.00003 \ + --trainer.optimization.shrink-value=0.99 \ + --trainer.rnn.num-chunk-per-minibatch=128,64 \ + --trainer.optimization.momentum=0.5 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --ali-dir=$ali_dir \ + --lang=$lang \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 12 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l 2041 combine=-0.71->-0.58 loglike:train/valid[20,31,combined]=(-2.78,-0.95,-0.57/-2.94,-1.31,-0.98) accuracy:train/valid[20,31,combined]=(0.48,0.75,0.81/0.45,0.67,0.71) + +# local/nnet3/compare_wer.sh --online exp/nnet3/tdnn_lstm1a_sp exp/nnet3/tdnn_lstm1b_sp +# System tdnn_lstm1a_sp tdnn_lstm1b_sp +#WER dev_clean_2 (tgsmall) 17.67 17.01 +# [online:] 18.06 17.26 +#WER dev_clean_2 (tglarge) 13.43 12.63 +# [online:] 13.73 12.94 +# Final train prob -0.3660 -0.5680 +# Final valid prob -1.0236 -0.9771 +# Final train acc 0.8737 0.8067 +# Final valid acc 0.7222 0.7144 + + + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +decode_nj=10 +train_set=train_clean_5 +test_sets=dev_clean_2 +gmm=tri3b +nnet3_affix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=1b # affix for the TDNN+LSTM directory name +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +# training options +# training chunk-options +chunk_width=40,30,20 +chunk_left_context=40 +chunk_right_context=0 +common_egs_dir= +xent_regularize=0.1 +dropout_schedule='0,0@0.20,0.3@0.50,0' + +# training options +srand=0 +remove_egs=true +reporting_email= + +#decode options +test_online_decoding=true # if true, it will run the last decoding stage. + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda delay=$label_delay input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + relu-renorm-layer name=tdnn1 dim=520 + relu-renorm-layer name=tdnn2 dim=520 input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=520 recurrent-projection-dim=130 non-recurrent-projection-dim=130 $lstm_opts + relu-renorm-layer name=tdnn3 dim=520 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=520 input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=520 recurrent-projection-dim=130 non-recurrent-projection-dim=130 $lstm_opts + relu-renorm-layer name=tdnn5 dim=520 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=520 input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=520 recurrent-projection-dim=130 non-recurrent-projection-dim=130 $lstm_opts + + output-layer name=output input=lstm3 output-delay=$label_delay dim=$num_targets max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 11 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/train_rnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=6 \ + --trainer.deriv-truncate-margin=10 \ + --trainer.samples-per-iter=20000 \ + --trainer.optimization.num-jobs-initial=1 \ + --trainer.optimization.num-jobs-final=2 \ + --trainer.optimization.initial-effective-lrate=0.0003 \ + --trainer.optimization.final-effective-lrate=0.00003 \ + --trainer.optimization.shrink-value=0.99 \ + --trainer.dropout-schedule="$dropout_schedule" \ + --trainer.rnn.num-chunk-per-minibatch=128,64 \ + --trainer.optimization.momentum=0.5 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --ali-dir=$ali_dir \ + --lang=$lang \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 12 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l 2041 combine=-0.99->-0.81 loglike:train/valid[20,31,combined]=(-1.22,-0.69,-0.61/-1.34,-1.02,-0.91) accuracy:train/valid[20,31,combined]=(0.68,0.779,0.800/0.64,0.70,0.724) + + + + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). 
+stage=0 +decode_nj=10 +train_set=train_clean_5 +test_sets=dev_clean_2 +gmm=tri3b +nnet3_affix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=1c # affix for the TDNN+LSTM directory name +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +# training options +# training chunk-options +chunk_width=40,30,20 +chunk_left_context=40 +chunk_right_context=0 +common_egs_dir= +xent_regularize=0.1 +dropout_schedule='0,0@0.20,0.3@0.50,0' + +# training options +srand=0 +remove_egs=true +reporting_email= + +#decode options +test_online_decoding=true # if true, it will run the last decoding stage. + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda delay=$label_delay input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + relu-batchnorm-layer name=tdnn1 dim=520 $tdnn_opts + relu-batchnorm-layer name=tdnn2 dim=520 $tdnn_opts input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=520 recurrent-projection-dim=130 non-recurrent-projection-dim=130 $lstm_opts + relu-batchnorm-layer name=tdnn3 dim=520 $tdnn_opts input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn4 dim=520 $tdnn_opts input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=520 recurrent-projection-dim=130 non-recurrent-projection-dim=130 $lstm_opts + relu-batchnorm-layer name=tdnn5 dim=520 $tdnn_opts input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn6 dim=520 $tdnn_opts input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=520 recurrent-projection-dim=130 non-recurrent-projection-dim=130 $lstm_opts + + output-layer name=output input=lstm3 $output_opts output-delay=$label_delay dim=$num_targets max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 11 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/train_rnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=6 \ + --trainer.deriv-truncate-margin=10 \ + --trainer.samples-per-iter=20000 \ + --trainer.optimization.num-jobs-initial=1 \ + --trainer.optimization.num-jobs-final=2 \ + --trainer.optimization.initial-effective-lrate=0.0003 \ + --trainer.optimization.final-effective-lrate=0.00003 \ + --trainer.dropout-schedule="$dropout_schedule" \ + --trainer.rnn.num-chunk-per-minibatch=128,64 \ + --trainer.optimization.momentum=0.5 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --ali-dir=$ali_dir \ + --lang=$lang \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 12 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l data/lang/G.fst || exit 1; + +# Checking that G is stochastic [note, it wouldn't be for an Arpa] +fstisstochastic data/lang/G.fst || echo Error: G is not stochastic + +# Checking that G.fst is determinizable. +fstdeterminize data/lang/G.fst /dev/null || echo Error determinizing G. + +# Checking that L_disambig.fst is determinizable. +fstdeterminize data/lang/L_disambig.fst /dev/null || echo Error determinizing L. + +# Checking that disambiguated lexicon times G is determinizable +fsttablecompose data/lang/L_disambig.fst data/lang/G.fst | \ + fstdeterminize >/dev/null || echo Error + +# Checking that LG is stochastic: +fsttablecompose data/lang/L.fst data/lang/G.fst | \ + fstisstochastic || echo Error: LG is not stochastic. + +# Checking that L_disambig.G is stochastic: +fsttablecompose data/lang/L_disambig.fst data/lang/G.fst | \ + fstisstochastic || echo Error: LG is not stochastic. + +echo "Succeeded preparing grammar for CMU_kids." diff --git a/egs/cmu_cslu_kids/s5/local/score.sh b/egs/cmu_cslu_kids/s5/local/score.sh new file mode 100755 index 00000000000..c812199fc98 --- /dev/null +++ b/egs/cmu_cslu_kids/s5/local/score.sh @@ -0,0 +1,63 @@ +#!/bin/bash +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey) +# 2014 Guoguo Chen +# Apache 2.0 + +[ -f ./path.sh ] && . ./path.sh + +# begin configuration section. +cmd=run.pl +stage=0 +decode_mbr=true +word_ins_penalty=0.0,0.5,1.0 +min_lmwt=7 +max_lmwt=17 +iter=final +#end configuration section. + +[ -f ./path.sh ] && . ./path.sh +. parse_options.sh || exit 1; + +if [ $# -ne 3 ]; then + echo "Usage: local/score.sh [--cmd (run.pl|queue.pl...)] " + echo " Options:" + echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." + echo " --stage (0|1|2) # start scoring script from part-way through." + echo " --decode_mbr (true/false) # maximum bayes risk decoding (confusion network)." 
+ echo " --min_lmwt # minumum LM-weight for lattice rescoring " + echo " --max_lmwt # maximum LM-weight for lattice rescoring " + exit 1; +fi + +data=$1 +lang_or_graph=$2 +dir=$3 + +symtab=$lang_or_graph/words.txt + +for f in $symtab $dir/lat.1.gz $data/text; do + [ ! -f $f ] && echo "score.sh: no such file $f" && exit 1; +done + +mkdir -p $dir/scoring/log + +cat $data/text | sed 's:::g' | sed 's:::g' > $dir/scoring/test_filt.txt + +for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/best_path.LMWT.$wip.log \ + lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ + lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \ + lattice-best-path --word-symbol-table=$symtab \ + ark:- ark,t:$dir/scoring/LMWT.$wip.tra || exit 1; +done + +# Note: the double level of quoting for the sed command +for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.$wip.log \ + cat $dir/scoring/LMWT.$wip.tra \| \ + utils/int2sym.pl -f 2- $symtab \| sed 's:\::g' \| \ + compute-wer --text --mode=present \ + ark:$dir/scoring/test_filt.txt ark,p:- ">&" $dir/wer_LMWT_$wip || exit 1; +done + +exit 0; diff --git a/egs/cmu_cslu_kids/s5/local/sort_result.sh b/egs/cmu_cslu_kids/s5/local/sort_result.sh new file mode 100755 index 00000000000..aedec9dc344 --- /dev/null +++ b/egs/cmu_cslu_kids/s5/local/sort_result.sh @@ -0,0 +1,46 @@ +#! /bin/bash + +# Copyright Johns Hopkins University +# 2019 Fei Wu + +# Sorts and reports results in results/results.txt +# for all models in exp. Expects decode directories +# to be named as exp//decode* or exp/chain/tdnn*/decode* +# Should be run from egs/cmu_cslu_kids. + +res=${1:-"results/results.txt"} +exp=exp +mkdir -p results +rm -f $res + +echo "Sorting results in: " +echo "# ---------- GMM-HMM Models ----------" >> $res +for mdl in $exp/mono* $exp/tri*; do + echo " $mdl" + if [ -d $mdl ];then + for dec in $mdl/decode*;do + echo " $dec" + if [ -d $dec ];then + grep WER $dec/wer* | \ + sort -k2 -n > $dec/WERs + head -n 1 $dec/WERs >> $res + fi + done + fi +done + +echo "# ---------- DNN-HMM Models ----------" >> $res +# DNN results +for mdl in $exp/chain/tdnn*; do + echo " $mdl" + for dec in $mdl/decode*; do + if [ -d $dec ]; then + echo " $dec" + grep WER $dec/wer* | \ + sort -k2 -n > $dec/WERs + head -n 1 $dec/WERs >> $res + fi + done +done + +sed -i "s/:/ /g" $res diff --git a/egs/cmu_cslu_kids/s5/local/subset_dataset.sh b/egs/cmu_cslu_kids/s5/local/subset_dataset.sh new file mode 100755 index 00000000000..050128247a4 --- /dev/null +++ b/egs/cmu_cslu_kids/s5/local/subset_dataset.sh @@ -0,0 +1,48 @@ +#!/bin/bash + +# Copyright 2017 Luminar Technologies, Inc. (author: Daniel Galvez) +# Apache 2.0 + +# The following commands were used to generate the mini_librispeech dataset: +# +# Note that data generation is random. This could be fixed by +# providing a seed argument to the shuf program. + +if [ "$#" -ne 3 ]; then + echo "Usage: $0 " + echo "e.g.: $0 /export/a05/dgalvez/LibriSpeech/train-clean-100 \\ + /export/a05/dgalvez/LibriSpeech/train-clean-5 5" + exit 1 +fi + +src_dir=$1 +dest_dir=$2 +dest_num_hours=$3 + +src=$(basename $src_dir) +dest=$(basename $dest_dir) +librispeech_dir=$(dirname $src_dir) + +# TODO: Possibly improve this to ensure gender balance and speaker +# balance. 
+# TODO: Use actual time values instead of assuming that to make sure we get $dest_num_hours of data +src_num_hours=$(grep "$src" $librispeech_dir/CHAPTERS.TXT | awk -F'|' '{ print $3 }' | \ +python -c ' +from __future__ import print_function +from sys import stdin +minutes_str = stdin.read().split() +print(int(round(sum([float(minutes) for minutes in minutes_str]) / 60.0)))') +src_num_chapters=$(grep "$src" $librispeech_dir/CHAPTERS.TXT | \ + awk -F'|' '{ print $1 }' | sort -u | wc -l) +mkdir -p data/subset_tmp +grep "$src" $librispeech_dir/CHAPTERS.TXT | \ + awk -F'|' '{ print $1 }' | \ + shuf -n $(((dest_num_hours * src_num_chapters) / src_num_hours)) > \ + data/subset_tmp/${dest}_chapter_id_list.txt + +while read -r chapter_id || [[ -n "$chapter_id" ]]; do + chapter_dir=$(find $src_dir/ -mindepth 2 -name "$chapter_id" -type d) + speaker_id=$(basename $(dirname $chapter_dir)) + mkdir -p $dest_dir/$speaker_id/ + cp -r $chapter_dir $dest_dir/$speaker_id/ +done < data/subset_tmp/${dest}_chapter_id_list.txt diff --git a/egs/cmu_cslu_kids/s5/local/sum_duration.py b/egs/cmu_cslu_kids/s5/local/sum_duration.py new file mode 100644 index 00000000000..0af7ba62151 --- /dev/null +++ b/egs/cmu_cslu_kids/s5/local/sum_duration.py @@ -0,0 +1,15 @@ +# Sum duration obtained by using +# utils/data/get_utt2dur.sh + +import sys +file = sys.argv[1] +sum = 0 +with open(file, 'r') as fp: + line = fp.readline() + while(line): + toks = line.strip().split() + sum += float(toks[1]) + line = fp.readline() +fp.close() +h=sum/3600 +sys.stdout.write("%f hour data.\n"%h) diff --git a/egs/cmu_cslu_kids/s5/local/train_lms.sh b/egs/cmu_cslu_kids/s5/local/train_lms.sh new file mode 100755 index 00000000000..0807210be18 --- /dev/null +++ b/egs/cmu_cslu_kids/s5/local/train_lms.sh @@ -0,0 +1,217 @@ +#!/bin/bash + +# This script trains LMs on the WSJ LM-training data. +# It requires that you have already run wsj_extend_dict.sh, +# to get the larger-size dictionary including all of CMUdict +# plus any OOVs and possible acronyms that we could easily +# derive pronunciations for. + +dict_suffix= + +echo "$0 $@" # Print the command line for logging +. utils/parse_options.sh || exit 1; + +dir=data/local/local_lm +srcdir=data/local/dict${dict_suffix}_larger +mkdir -p $dir +. ./path.sh || exit 1; # for KALDI_ROOT +export PATH=$KALDI_ROOT/tools/kaldi_lm:$PATH +( # First make sure the kaldi_lm toolkit is installed. + cd $KALDI_ROOT/tools || exit 1; + if [ -d kaldi_lm ]; then + echo Not installing the kaldi_lm toolkit since it is already there. + else + echo Downloading and installing the kaldi_lm tools + if [ ! -f kaldi_lm.tar.gz ]; then + wget http://www.danielpovey.com/files/kaldi/kaldi_lm.tar.gz || exit 1; + fi + tar -xvzf kaldi_lm.tar.gz || exit 1; + cd kaldi_lm + make || exit 1; + echo Done making the kaldi_lm tools + fi +) || exit 1; + + + +if [ ! -f $srcdir/cleaned.gz -o ! -f $srcdir/lexicon.txt ]; then + echo "Expecting files $srcdir/cleaned.gz and $srcdir/lexicon.txt to exist"; + echo "You need to run local/wsj_extend_dict.sh before running this script." + exit 1; +fi + +# Get a wordlist-- keep everything but silence, which should not appear in +# the LM. +awk '{print $1}' $srcdir/lexicon.txt | grep -v -w '!SIL' > $dir/wordlist.txt + +# Get training data with OOV words (w.r.t. our current vocab) replaced with . 
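# A hedged illustration of the awk filter below (example words are made up):
# with HELLO and WORLD present in wordlist.txt, an input line
#   HELLO ZWQX WORLD
# is emitted with the out-of-vocabulary ZWQX rewritten to the OOV symbol,
# while in-vocabulary words pass through unchanged.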
+echo "Getting training data with OOV words replaced with (train_nounk.gz)" +gunzip -c $srcdir/cleaned.gz | awk -v w=$dir/wordlist.txt \ + 'BEGIN{while((getline0) v[$1]=1;} + {for (i=1;i<=NF;i++) if ($i in v) printf $i" ";else printf " ";print ""}'|sed 's/ $//g' \ + | gzip -c > $dir/train_nounk.gz + +# Get unigram counts (without bos/eos, but this doens't matter here, it's +# only to get the word-map, which treats them specially & doesn't need their +# counts). +# Add a 1-count for each word in word-list by including that in the data, +# so all words appear. +gunzip -c $dir/train_nounk.gz | cat - $dir/wordlist.txt | \ + awk '{ for(x=1;x<=NF;x++) count[$x]++; } END{for(w in count){print count[w], w;}}' | \ + sort -nr > $dir/unigram.counts + +# Get "mapped" words-- a character encoding of the words that makes the common words very short. +cat $dir/unigram.counts | awk '{print $2}' | get_word_map.pl "" "" "" > $dir/word_map + +gunzip -c $dir/train_nounk.gz | awk -v wmap=$dir/word_map 'BEGIN{while((getline0)map[$1]=$2;} + { for(n=1;n<=NF;n++) { printf map[$n]; if(n$dir/train.gz + +# To save disk space, remove the un-mapped training data. We could +# easily generate it again if needed. +rm $dir/train_nounk.gz + +train_lm.sh --arpa --lmtype 3gram-mincount $dir +#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 141.444826 +# 7.8 million N-grams. + +prune_lm.sh --arpa 6.0 $dir/3gram-mincount/ +# 1.45 million N-grams. +# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 165.394139 + +train_lm.sh --arpa --lmtype 4gram-mincount $dir +#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 126.734180 +# 10.3 million N-grams. + +prune_lm.sh --arpa 7.0 $dir/4gram-mincount +# 1.50 million N-grams +# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 155.663757 + + +exit 0 + +### Below here, this script is showing various commands that +## were run during LM tuning. + +train_lm.sh --arpa --lmtype 3gram-mincount $dir +#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 141.444826 +# 7.8 million N-grams. + +prune_lm.sh --arpa 3.0 $dir/3gram-mincount/ +#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 156.408740 +# 2.5 million N-grams. + +prune_lm.sh --arpa 6.0 $dir/3gram-mincount/ +# 1.45 million N-grams. +# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 165.394139 + +train_lm.sh --arpa --lmtype 4gram-mincount $dir +#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 126.734180 +# 10.3 million N-grams. + +prune_lm.sh --arpa 3.0 $dir/4gram-mincount +#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 143.206294 +# 2.6 million N-grams. + +prune_lm.sh --arpa 4.0 $dir/4gram-mincount +# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 146.927717 +# 2.15 million N-grams. + +prune_lm.sh --arpa 5.0 $dir/4gram-mincount +# 1.86 million N-grams +# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 150.162023 + +prune_lm.sh --arpa 7.0 $dir/4gram-mincount +# 1.50 million N-grams +# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 155.663757 + +train_lm.sh --arpa --lmtype 3gram $dir +# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 135.692866 +# 20.0 million N-grams + +! which ngram-count \ + && echo "SRILM tools not installed so not doing the comparison" && exit 1; + +################# +# You could finish the script here if you wanted. +# Below is to show how to do baselines with SRILM. 
+# You'd have to install the SRILM toolkit first. + +heldout_sent=10000 # Don't change this if you want result to be comparable with + # kaldi_lm results +sdir=$dir/srilm # in case we want to use SRILM to double-check perplexities. +mkdir -p $sdir +gunzip -c $srcdir/cleaned.gz | head -$heldout_sent > $sdir/cleaned.heldout +gunzip -c $srcdir/cleaned.gz | tail -n +$heldout_sent > $sdir/cleaned.train +(echo ""; echo "" ) | cat - $dir/wordlist.txt > $sdir/wordlist.final.s + +# 3-gram: +ngram-count -text $sdir/cleaned.train -order 3 -limit-vocab -vocab $sdir/wordlist.final.s -unk \ + -map-unk "" -kndiscount -interpolate -lm $sdir/srilm.o3g.kn.gz +ngram -lm $sdir/srilm.o3g.kn.gz -ppl $sdir/cleaned.heldout # consider -debug 2 +#file data/local/local_lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 478 OOVs +#0 zeroprobs, logprob= -491456 ppl= 141.457 ppl1= 177.437 + +# Trying 4-gram: +ngram-count -text $sdir/cleaned.train -order 4 -limit-vocab -vocab $sdir/wordlist.final.s -unk \ + -map-unk "" -kndiscount -interpolate -lm $sdir/srilm.o4g.kn.gz +ngram -order 4 -lm $sdir/srilm.o4g.kn.gz -ppl $sdir/cleaned.heldout +#file data/local/local_lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 478 OOVs +#0 zeroprobs, logprob= -480939 ppl= 127.233 ppl1= 158.822 + +#3-gram with pruning: +ngram-count -text $sdir/cleaned.train -order 3 -limit-vocab -vocab $sdir/wordlist.final.s -unk \ + -prune 0.0000001 -map-unk "" -kndiscount -interpolate -lm $sdir/srilm.o3g.pr7.kn.gz +ngram -lm $sdir/srilm.o3g.pr7.kn.gz -ppl $sdir/cleaned.heldout +#file data/local/local_lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 478 OOVs +#0 zeroprobs, logprob= -510828 ppl= 171.947 ppl1= 217.616 +# Around 2.25M N-grams. +# Note: this is closest to the experiment done with "prune_lm.sh --arpa 3.0 $dir/3gram-mincount/" +# above, which gave 2.5 million N-grams and a perplexity of 156. + +# Note: all SRILM experiments above fully discount all singleton 3 and 4-grams. +# You can use -gt3min=0 and -gt4min=0 to stop this (this will be comparable to +# the kaldi_lm experiments above without "-mincount". + +## From here is how to train with +# IRSTLM. This is not really working at the moment. + +if [ -z $IRSTLM ] ; then + export IRSTLM=$KALDI_ROOT/tools/irstlm/ +fi +export PATH=${PATH}:$IRSTLM/bin +if ! command -v prune-lm >/dev/null 2>&1 ; then + echo "$0: Error: the IRSTLM is not available or compiled" >&2 + echo "$0: Error: We used to install it by default, but." >&2 + echo "$0: Error: this is no longer the case." 
>&2 + echo "$0: Error: To install it, go to $KALDI_ROOT/tools" >&2 + echo "$0: Error: and run extras/install_irstlm.sh" >&2 + exit 1 +fi + +idir=$dir/irstlm +mkdir $idir +gunzip -c $srcdir/cleaned.gz | tail -n +$heldout_sent | add-start-end.sh | \ + gzip -c > $idir/train.gz + +dict -i=WSJ.cleaned.irstlm.txt -o=dico -f=y -sort=no + cat dico | gawk 'BEGIN{while (getline<"vocab.20k.nooov") v[$1]=1; print "DICTIONARY 0 "length(v);}FNR>1{if ($1 in v)\ +{print $0;}}' > vocab.irstlm.20k + + +build-lm.sh -i "gunzip -c $idir/train.gz" -o $idir/lm_3gram.gz -p yes \ + -n 3 -s improved-kneser-ney -b yes +# Testing perplexity with SRILM tools: +ngram -lm $idir/lm_3gram.gz -ppl $sdir/cleaned.heldout +#data/local/local_lm/irstlm/lm_3gram.gz: line 162049: warning: non-zero probability for in closed-vocabulary LM +#file data/local/local_lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 0 OOVs +#0 zeroprobs, logprob= -513670 ppl= 175.041 ppl1= 221.599 + +# Perplexity is very bad (should be ~141, since we used -p option, +# not 175), +# but adding -debug 3 to the command line shows that +# the IRSTLM LM does not seem to sum to one properly, so it seems that +# it produces an LM that isn't interpretable in the normal way as an ARPA +# LM. + + + diff --git a/egs/cmu_cslu_kids/s5/local/vtln.sh b/egs/cmu_cslu_kids/s5/local/vtln.sh new file mode 100755 index 00000000000..0ca179ce89f --- /dev/null +++ b/egs/cmu_cslu_kids/s5/local/vtln.sh @@ -0,0 +1,61 @@ +#!/bin/bash + +# Copyright Johns Hopkins University +# 2019 Fei Wu + +# Run VTLN. This will be run if the vtln option +# is set to be true in run.sh. + +set -eu +stage=0 +featdir=mfcc/vtln +data=data +mdl=exp/tri3 +mdl_vtln=${mdl}_vtln +vtln_lda=exp/tri4 +vtln_sat=exp/tri5 + +. ./cmd.sh +. ./utils/parse_options.sh + +mkdir -p $featdir + +steps/train_lvtln.sh --cmd "$train_cmd" 1800 9000 $data/train $data/lang $mdl $mdl_vtln + +if [ $stage -le 0 ]; then + mkdir -p $data/train_vtln + cp $data/train/* $data/train_vtln || true + cp $mdl_vtln/final.warp $data/train_vtln/spk2warp + steps/make_mfcc.sh --nj 8 --cmd "$train_cmd" $data/train_vtln exp/make_mfcc/train_vtln $featdir + steps/compute_cmvn_stats.sh $data/train_vtln exp/make_mfcc/train_vtln $featdir +fi + +if [ $stage -le 1 ]; then + utils/mkgraph.sh $data/lang_test_tgmed $mdl_vtln $mdl_vtln/graph + steps/decode_lvtln.sh --config conf/decode.config --nj 20 --cmd "$decode_cmd" \ + $mdl_vtln/graph $data/test $mdl_vtln/decode +fi + +if [ $stage -le 2 ]; then + mkdir -p $data/test_vtln + cp $data/test/* $data/test_vtln || true + cp $mdl_vtln/decode/final.warp $data/test_vtln/spk2warp + steps/make_mfcc.sh --nj 8 --cmd "$train_cmd" $data/test_vtln exp/make_mfcc/test_vtln $featdir + steps/compute_cmvn_stats.sh $data/test_vtln exp/make_mfcc/test_vtln $featdir +fi + +if [ $stage -le 3 ]; then + steps/train_lda_mllt.sh --cmd "$train_cmd" --splice-opts "--left-context=3 --right-context=3" 1800 9000 \ + $data/train_vtln $data/lang $mdl_vtln $vtln_lda + utils/mkgraph.sh $data/lang_test_tgmed $vtln_lda $vtln_lda/graph + echo "$mdl_vtln + lda + mllt" > $vtln_lda/mcodel_discription + steps/decode.sh --config conf/decode.config --nj 20 --cmd "$decode_cmd" \ + $vtln_lda/graph $data/test_vtln $vtln_lda/decode +fi + +if [ $stage -le 4 ]; then + steps/train_sat.sh 1800 9000 $data/train_vtln $data/lang $vtln_lda $vtln_sat + utils/mkgraph.sh $data/lang_test_tgmed $vtln_sat $vtln_sat/graph + steps/decode_fmllr.sh --config conf/decode.config --nj 20 --cmd "$decode_cmd" $vtln_sat/graph $data/test_vtln $vtln_sat/decode + echo 
"$mdl_vtln + lda + mllt + SAT" > $vtln_sat/model_discription +fi diff --git a/egs/cmu_cslu_kids/s5/path.sh b/egs/cmu_cslu_kids/s5/path.sh new file mode 100755 index 00000000000..2d17b17a84a --- /dev/null +++ b/egs/cmu_cslu_kids/s5/path.sh @@ -0,0 +1,6 @@ +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C diff --git a/egs/cmu_cslu_kids/s5/run.sh b/egs/cmu_cslu_kids/s5/run.sh new file mode 100755 index 00000000000..43ae1ea9426 --- /dev/null +++ b/egs/cmu_cslu_kids/s5/run.sh @@ -0,0 +1,177 @@ +#! /bin/bash + +# Copyright Johns Hopkins University +# 2019 Fei Wu + +set -eo + +stage=0 +cmu_kids= # path to cmu_kids corpus +cslu_kids= # path to cslu_kids corpus +lm_src= # path of existing librispeech lm +extra_features=false # Extra features for GMM model (MMI, boosting and MPE) +vtln=false # Optional, run VLTN on gmm and tdnnf models if set true +email= # Reporting email for tdnn-f training + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +lm_url=www.openslr.org/resources/11 +mkdir -p data +mkdir -p data/local + +# Prepare data +if [ $stage -le 0 ]; then + # Make soft link to the corpora + if [ ! -e cmu_kids ]; then + ln -sf $cmu_kids cmu_kids + fi + if [ ! -e cslu ]; then + ln -sf $cslu_kids cslu + fi + + # Make softlink to lm, if lm_src provided + if [ ! -z "$lm_src" ] && [ ! -e data/local/lm ] ; then + ln -sf $lm_src data/local/lm + fi + + # Remove old data dirs + rm -rf data/data_cmu + rm -rf data/data_cslu + + # Data Prep + ./local/cmu_prepare_data.sh --corpus cmu_kids/kids --data data/data_cmu + ./local/cslu_prepare_data.sh --corpus cslu --data data/data_cslu +fi + +# Combine data +if [ $stage -le 1 ]; then + mkdir -p data/train + mkdir -p data/test + rm -rf data/train/* + rm -rf data/test/* + ./utils/combine_data.sh data/train data/data_cmu/train data/data_cslu/train + ./utils/combine_data.sh data/test data/data_cmu/test data/data_cslu/test +fi + +# LM, WFST Preparation +if [ $stage -le 2 ]; then + if [ ! -d data/local/dict ]; then + ./local/download_cmu_dict.sh + fi + + if [ ! -e data/local/lm ]; then + echo "lm_src not provided. Downloading lm from openslr." 
+ ./local/download_lm.sh $lm_url data/local/lm + fi + + utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang + local/format_lms.sh --src_dir data/lang data/local/lm + + # Create ConstArpaLm format language model for full 3-gram and 4-gram LMs + utils/build_const_arpa_lm.sh data/local/lm/lm_tglarge.arpa.gz data/lang data/lang_test_tglarge + utils/build_const_arpa_lm.sh data/local/lm/lm_fglarge.arpa.gz data/lang data/lang_test_fglarge +fi + +# Make MFCC features +if [ $stage -le 3 ]; then + mkdir -p mfcc + mkdir -p exp + steps/make_mfcc.sh --nj 40 --cmd "$train_cmd" data/test exp/make_feat/test mfcc + steps/compute_cmvn_stats.sh data/test exp/make_feat/test mfcc + steps/make_mfcc.sh --nj 40 --cmd "$train_cmd" data/train exp/make_feat/train mfcc + steps/compute_cmvn_stats.sh data/train exp/make_feat/train mfcc +fi + +# Mono-phone +if [ $stage -le 4 ]; then + # Train + steps/train_mono.sh --nj 40 --cmd "$train_cmd" data/train data/lang exp/mono + #Decode + utils/mkgraph.sh data/lang_test_tgsmall exp/mono exp/mono/graph + steps/decode.sh --config conf/decode.config --nj 40 --cmd "$decode_cmd" exp/mono/graph data/test exp/mono/decode + #Align + steps/align_si.sh --nj 20 --cmd "$train_cmd" data/train data/lang exp/mono exp/mono_ali +fi + +# Tri1 [Vanilla tri phone model] +if [ $stage -le 5 ]; then + # Train + steps/train_deltas.sh --cmd "$train_cmd" 1800 9000 data/train data/lang exp/mono_ali exp/tri1 + # Decode + utils/mkgraph.sh data/lang_test_tgmed exp/tri1 exp/tri1/graph + steps/decode.sh --config conf/decode.config --nj 40 --cmd "$decode_cmd" exp/tri1/graph data/test exp/tri1/decode + # Align - make graph - decode again + steps/align_si.sh --nj 20 --cmd "queue.pl" --use-graphs true data/train data/lang_test_tgmed exp/tri1 exp/tri1_ali + utils/mkgraph.sh data/lang_test_tgmed exp/tri1_ali exp/tri1_ali/graph + steps/decode.sh --config conf/decode.config --nj 40 --cmd "$decode_cmd" exp/tri1_ali/graph data/test exp/tri1_ali/decode +fi + +# Add LDA and MLLT +if [ $stage -le 6 ]; then + # Train + steps/train_lda_mllt.sh --cmd "$train_cmd" --splice-opts "--left-context=3 --right-context=3" 1800 9000 data/train data/lang exp/tri1_ali exp/tri2 + utils/mkgraph.sh data/lang_test_tgmed exp/tri2 exp/tri2/graph + # Decode + steps/decode.sh --config conf/decode.config --nj 40 --cmd "$decode_cmd" exp/tri2/graph data/test exp/tri2/decode + # Align - make graph - dcode again + steps/align_si.sh --nj 20 --cmd "$train_cmd" --use-graphs true data/train data/lang_test_tgmed exp/tri2 exp/tri2_ali + utils/mkgraph.sh data/lang_test_tgmed exp/tri2_ali exp/tri2_ali/graph + steps/decode_fmllr.sh --config conf/decode.config --nj 40 --cmd "$decode_cmd" exp/tri2_ali/graph data/test exp/tri2_ali/decode +fi + +# Add other features +if [ $stage -le 7 ]; then + if [ $extra_features = true ]; then + # Add MMI + steps/make_denlats.sh --nj 20 --cmd "$train_cmd" data/train data/lang exp/tri2 exp/tri2_denlats + steps/train_mmi.sh data/train data/lang exp/tri2_ali exp/tri2_denlats exp/tri2_mmi + steps/decode.sh --config conf/decode.config --iter 4 --nj 20 --cmd "$decode_cmd" exp/tri2/graph data/test exp/tri2_mmi/decode_it4 + steps/decode.sh --config conf/decode.config --iter 3 --nj 20 --cmd "$decode_cmd" exp/tri2/graph data/test exp/tri2_mmi/decode_it3 + + # Add Boosting + steps/train_mmi.sh --boost 0.05 data/train data/lang exp/tri2_ali exp/tri2_denlats exp/tri2_mmi_b0.05 + steps/decode.sh --config conf/decode.config --iter 4 --nj 20 --cmd "$decode_cmd" exp/tri2/graph data/test exp/tri2_mmi_b0.05/decode_it4 + 
steps/decode.sh --config conf/decode.config --iter 3 --nj 20 --cmd "$decode_cmd" exp/tri2/graph data/test exp/tri2_mmi_b0.05/decode_it3 + + # Add MPE + steps/train_mpe.sh data/train data/lang exp/tri2_ali exp/tri2_denlats exp/tri2_mpe + steps/decode.sh --config conf/decode.config --iter 4 --nj 20 --cmd "$decode_cmd" exp/tri2/graph data/test exp/tri2_mpe/decode_it4 + steps/decode.sh --config conf/decode.config --iter 3 --nj 20 --cmd "$decode_cmd" exp/tri2/graph data/test exp/tri2_mpe/decode_it3 + fi +fi + +# Add SAT +if [ $stage -le 8 ]; then + # Do LDA+MLLT+SAT, and decode. + steps/train_sat.sh 1800 9000 data/train data/lang exp/tri2_ali exp/tri3 + utils/mkgraph.sh data/lang_test_tgmed exp/tri3 exp/tri3/graph + steps/decode_fmllr.sh --config conf/decode.config --nj 40 --cmd "$decode_cmd" exp/tri3/graph data/test exp/tri3/decode +fi + +if [ $stage -le 9 ]; then + # Align all data with LDA+MLLT+SAT system (tri3) + steps/align_fmllr.sh --nj 20 --cmd "$train_cmd" --use-graphs true data/train data/lang_test_tgmed exp/tri3 exp/tri3_ali + utils/mkgraph.sh data/lang_test_tgmed exp/tri3_ali exp/tri3_ali/graph + steps/decode_fmllr.sh --config conf/decode.config --nj 40 --cmd "$decode_cmd" exp/tri3_ali/graph data/test exp/tri3_ali/decode +fi + +if [ $stage -le 10 ]; then + # Uncomment reporting email option to get training progress updates by email + ./local/chain/run_tdnnf.sh --train_set train \ + --test_sets test --gmm tri3 # --reporting_email $email +fi + + +# Optional VTLN. Run if vtln is set to true +if [ $stage -le 11 ]; then + if [ $vtln = true ]; then + ./local/vtln.sh + ./local/chain/run_tdnnf.sh --nnet3_affix vtln --train_set train_vtln \ + --test_sets test_vtln --gmm tri5 # --reporting_email $email + fi +fi + +# Collect and resport WER results for all models +./local/sort_result.sh diff --git a/egs/cmu_cslu_kids/s5/steps b/egs/cmu_cslu_kids/s5/steps new file mode 120000 index 00000000000..1b186770dd1 --- /dev/null +++ b/egs/cmu_cslu_kids/s5/steps @@ -0,0 +1 @@ +../../wsj/s5/steps/ \ No newline at end of file diff --git a/egs/cmu_cslu_kids/s5/utils b/egs/cmu_cslu_kids/s5/utils new file mode 120000 index 00000000000..a3279dc8679 --- /dev/null +++ b/egs/cmu_cslu_kids/s5/utils @@ -0,0 +1 @@ +../../wsj/s5/utils/ \ No newline at end of file diff --git a/egs/cnceleb/README.txt b/egs/cnceleb/README.txt new file mode 100644 index 00000000000..db8789839a9 --- /dev/null +++ b/egs/cnceleb/README.txt @@ -0,0 +1,9 @@ + +This directory contains example scripts for CN-Celeb speaker +verification. The CN-Celeb corpus is required, and can be +downloaded from Openslr http://www.openslr.org/82/ or from +CSLT@Tsinghua http://cslt.riit.tsinghua.edu.cn/~data/CN-Celeb/ + +The subdirectories "v1" and so on are different speaker recognition +recipes. The recipe in v1 demonstrates a standard approach using a +full-covariance GMM-UBM, iVectors, and a PLDA backend. diff --git a/egs/cnceleb/v1/README.txt b/egs/cnceleb/v1/README.txt new file mode 100644 index 00000000000..dc5086f0b7a --- /dev/null +++ b/egs/cnceleb/v1/README.txt @@ -0,0 +1,4 @@ + + This example demonstrates a traditional iVector system based on + CN-Celeb dataset. + diff --git a/egs/cnceleb/v1/cmd.sh b/egs/cnceleb/v1/cmd.sh new file mode 100755 index 00000000000..d1ca1a6d126 --- /dev/null +++ b/egs/cnceleb/v1/cmd.sh @@ -0,0 +1,15 @@ +# you can change cmd.sh depending on what type of queue you are using. 
+# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 4G" + + diff --git a/egs/cnceleb/v1/conf/mfcc.conf b/egs/cnceleb/v1/conf/mfcc.conf new file mode 100644 index 00000000000..649cffb9de8 --- /dev/null +++ b/egs/cnceleb/v1/conf/mfcc.conf @@ -0,0 +1,7 @@ +--sample-frequency=16000 +--frame-length=25 # the default is 25 +--low-freq=20 # the default. +--high-freq=7600 # the default is zero meaning use the Nyquist (8k in this case). +--num-mel-bins=30 +--num-ceps=24 +--snip-edges=false diff --git a/egs/cnceleb/v1/conf/vad.conf b/egs/cnceleb/v1/conf/vad.conf new file mode 100644 index 00000000000..a0ca2449b10 --- /dev/null +++ b/egs/cnceleb/v1/conf/vad.conf @@ -0,0 +1,2 @@ +--vad-energy-threshold=5.5 +--vad-energy-mean-scale=0.5 diff --git a/egs/cnceleb/v1/local/make_cnceleb.sh b/egs/cnceleb/v1/local/make_cnceleb.sh new file mode 100755 index 00000000000..14d44d6d3d0 --- /dev/null +++ b/egs/cnceleb/v1/local/make_cnceleb.sh @@ -0,0 +1,63 @@ +#!/bin/bash +# Copyright 2017 Ignacio Viñals +# 2017-2018 David Snyder +# 2019 Jiawen Kang +# +# This script prepares the CN-Celeb dataset. It creates separate directories +# for train, eval enroll and eval test. It also prepares a trials files, in the eval test directory. 
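+# The resulting layout is $out_dir/train, $out_dir/eval_enroll and
+# $out_dir/eval_test, each containing wav.scp and utt2spk, plus the trials
+# list written to $out_dir/eval_test/trials/trials.lst.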
+ +if [ $# != 2 ]; then + echo "Usage: make_cnceleb.sh " + echo "E.g.: make_cnceleb.sh /export/corpora/CN-Celeb data" + exit 1 +fi + +in_dir=$1 +out_dir=$2 + +# Prepare the development data +this_out_dir=${out_dir}/train +mkdir -p $this_out_dir 2>/dev/null +WAVFILE=$this_out_dir/wav.scp +SPKFILE=$this_out_dir/utt2spk +rm $WAVFILE $SPKFILE 2>/dev/null +this_in_dir=${in_dir}/dev + +for spkr_id in `cat $this_in_dir/dev.lst`; do + for f in $in_dir/data/$spkr_id/*.wav; do + wav_id=$(basename $f | sed s:.wav$::) + echo "${spkr_id}-${wav_id} $f" >> $WAVFILE + echo "${spkr_id}-${wav_id} ${spkr_id}" >> $SPKFILE + done +done +utils/fix_data_dir.sh $this_out_dir + +# Prepare the evaluation data +for mode in enroll test; do + this_out_dir=${out_dir}/eval_${mode} + mkdir -p $this_out_dir 2>/dev/null + WAVFILE=$this_out_dir/wav.scp + SPKFILE=$this_out_dir/utt2spk + rm $WAVFILE $SPKFILE 2>/dev/null + this_in_dir=${in_dir}/eval/${mode} + + for f in $this_in_dir/*.wav; do + wav_id=$(basename $f | sed s:.wav$::) + spkr_id=$(echo ${wav_id} | cut -d "-" -f1) + echo "${wav_id} $f" >> $WAVFILE + echo "${wav_id} ${spkr_id}" >> $SPKFILE + done + utils/fix_data_dir.sh $this_out_dir +done + +# Prepare test trials +this_out_dir=$out_dir/eval_test/trials +mkdir -p $out_dir/eval_test/trials +this_in_dir=${in_dir}/eval/lists +cat $this_in_dir/trials.lst | sed 's@-enroll@@g' | sed 's@test/@@g' | sed 's@.wav@@g' | \ + awk '{if ($3 == "1") + {print $1,$2,"target"} + else + {print $1,$2,"nontarget"} + }'> $this_out_dir/trials.lst + diff --git a/egs/cnceleb/v1/path.sh b/egs/cnceleb/v1/path.sh new file mode 100755 index 00000000000..e50f57c5271 --- /dev/null +++ b/egs/cnceleb/v1/path.sh @@ -0,0 +1,5 @@ +export KALDI_ROOT=`pwd`/../../.. +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sph2pipe_v2.5:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C diff --git a/egs/cnceleb/v1/run.sh b/egs/cnceleb/v1/run.sh new file mode 100755 index 00000000000..0ca7ed8f277 --- /dev/null +++ b/egs/cnceleb/v1/run.sh @@ -0,0 +1,133 @@ +#!/bin/bash +# Copyright 2017 Johns Hopkins University (Author: Daniel Garcia-Romero) +# 2017 Johns Hopkins University (Author: Daniel Povey) +# 2017-2018 David Snyder +# 2018 Ewald Enzinger +# 2019 Tsinghua University (Author: Jiawen Kang and Lantian Li) +# Apache 2.0. +# +# This is an i-vector-based recipe for CN-Celeb database. +# See ../README.txt for more info on data required. The recipe uses +# CN-Celeb/dev for training the UBM, T matrix and PLDA, and CN-Celeb/eval +# for evaluation. The results are reported in terms of EER and minDCF, +# and are inline in the comments below. + +. ./cmd.sh +. ./path.sh +set -e +mfccdir=`pwd`/mfcc +vaddir=`pwd`/mfcc + +cnceleb_root=/export/corpora/CN-Celeb +eval_trails_core=data/eval_test/trials/trials.lst + +stage=0 + +if [ $stage -le 0 ]; then + # Prepare the CN-Celeb dataset. The script is used to prepare the development + # dataset and evaluation dataset. 
+ local/make_cnceleb.sh $cnceleb_root data +fi + +if [ $stage -le 1 ]; then + # Make MFCCs and compute the energy-based VAD for each dataset + for name in train eval_enroll eval_test; do + steps/make_mfcc.sh --write-utt2num-frames true --mfcc-config conf/mfcc.conf --nj 20 --cmd "$train_cmd" \ + data/${name} exp/make_mfcc $mfccdir + utils/fix_data_dir.sh data/${name} + sid/compute_vad_decision.sh --nj 20 --cmd "$train_cmd" \ + data/${name} exp/make_vad $vaddir + utils/fix_data_dir.sh data/${name} + done +fi + +if [ $stage -le 2 ]; then + # Train the UBM + sid/train_diag_ubm.sh --cmd "$train_cmd --mem 4G" \ + --nj 20 --num-threads 8 \ + data/train 2048 \ + exp/diag_ubm + + sid/train_full_ubm.sh --cmd "$train_cmd --mem 16G" \ + --nj 20 --remove-low-count-gaussians false \ + data/train \ + exp/diag_ubm exp/full_ubm +fi + +if [ $stage -le 3 ]; then + # Train the i-vector extractor. + sid/train_ivector_extractor.sh --nj 20 --cmd "$train_cmd --mem 16G" \ + --ivector-dim 400 --num-iters 5 \ + exp/full_ubm/final.ubm data/train \ + exp/extractor +fi + +if [ $stage -le 4 ]; then + # Note that there are over one-third of the utterances less than 2 seconds in our training set, + # and these short utterances are harmful for PLDA training. Therefore, to improve performance + # of PLDA modeling and inference, we will combine the short utterances longer than 5 seconds. + utils/data/combine_short_segments.sh --speaker-only true \ + data/train 5 data/train_comb + # Compute the energy-based VAD for train_comb + sid/compute_vad_decision.sh --nj 20 --cmd "$train_cmd" \ + data/train_comb exp/make_vad $vaddir + utils/fix_data_dir.sh data/train_comb +fi + +if [ $stage -le 5 ]; then + # These i-vectors will be used for mean-subtraction, LDA, and PLDA training. + sid/extract_ivectors.sh --cmd "$train_cmd --mem 4G" --nj 20 \ + exp/extractor data/train_comb \ + exp/ivectors_train_comb + + # Extract i-vector for eval sets. + for name in eval_enroll eval_test; do + sid/extract_ivectors.sh --cmd "$train_cmd --mem 4G" --nj 10 \ + exp/extractor data/$name \ + exp/ivectors_$name + done +fi + +if [ $stage -le 6 ]; then + # Compute the mean vector for centering the evaluation i-vectors. + $train_cmd exp/ivectors_train_comb/log/compute_mean.log \ + ivector-mean scp:exp/ivectors_train_comb/ivector.scp \ + exp/ivectors_train_comb/mean.vec || exit 1; + + # This script uses LDA to decrease the dimensionality prior to PLDA. + lda_dim=150 + $train_cmd exp/ivectors_train_comb/log/lda.log \ + ivector-compute-lda --total-covariance-factor=0.0 --dim=$lda_dim \ + "ark:ivector-subtract-global-mean scp:exp/ivectors_train_comb/ivector.scp ark:- |" \ + ark:data/train_comb/utt2spk exp/ivectors_train_comb/transform.mat || exit 1; + + # Train the PLDA model. 
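+  # The training i-vectors are centered (global mean subtracted), projected
+  # with the LDA transform estimated above and length-normalized before
+  # ivector-compute-plda models the speaker variability using the spk2utt
+  # grouping.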
+ $train_cmd exp/ivectors_train_comb/log/plda.log \ + ivector-compute-plda ark:data/train_comb/spk2utt \ + "ark:ivector-subtract-global-mean scp:exp/ivectors_train_comb/ivector.scp ark:- | transform-vec exp/ivectors_train_comb/transform.mat ark:- ark:- | ivector-normalize-length ark:- ark:- |" \ + exp/ivectors_train_comb/plda || exit 1; + +fi + +if [ $stage -le 7 ]; then + # Compute PLDA scores for CN-Celeb eval core trials + $train_cmd exp/scores/log/cnceleb_eval_scoring.log \ + ivector-plda-scoring --normalize-length=true \ + --num-utts=ark:exp/ivectors_eval_enroll/num_utts.ark \ + "ivector-copy-plda --smoothing=0.0 exp/ivectors_train_comb/plda - |" \ + "ark:ivector-mean ark:data/eval_enroll/spk2utt scp:exp/ivectors_eval_enroll/ivector.scp ark:- | ivector-subtract-global-mean exp/ivectors_train_comb/mean.vec ark:- ark:- | transform-vec exp/ivectors_train_comb/transform.mat ark:- ark:- | ivector-normalize-length ark:- ark:- |" \ + "ark:ivector-subtract-global-mean exp/ivectors_train_comb/mean.vec scp:exp/ivectors_eval_test/ivector.scp ark:- | transform-vec exp/ivectors_train_comb/transform.mat ark:- ark:- | ivector-normalize-length ark:- ark:- |" \ + "cat '$eval_trails_core' | cut -d\ --fields=1,2 |" exp/scores/cnceleb_eval_scores || exit 1; + + # CN-Celeb Eval Core: + # EER: 13.91% + # minDCF(p-target=0.01): 0.6530 + # minDCF(p-target=0.001): 0.7521 + echo -e "\nCN-Celeb Eval Core:"; + eer=$(paste $eval_trails_core exp/scores/cnceleb_eval_scores | awk '{print $6, $3}' | compute-eer - 2>/dev/null) + mindcf1=`sid/compute_min_dcf.py --p-target 0.01 exp/scores/cnceleb_eval_scores $eval_trails_core 2> /dev/null` + mindcf2=`sid/compute_min_dcf.py --p-target 0.001 exp/scores/cnceleb_eval_scores $eval_trails_core 2> /dev/null` + echo "EER: $eer%" + echo "minDCF(p-target=0.01): $mindcf1" + echo "minDCF(p-target=0.001): $mindcf2" +fi diff --git a/egs/cnceleb/v1/sid b/egs/cnceleb/v1/sid new file mode 120000 index 00000000000..893a12f30c9 --- /dev/null +++ b/egs/cnceleb/v1/sid @@ -0,0 +1 @@ +../../sre08/v1/sid \ No newline at end of file diff --git a/egs/cnceleb/v1/steps b/egs/cnceleb/v1/steps new file mode 120000 index 00000000000..6e99bf5b5ad --- /dev/null +++ b/egs/cnceleb/v1/steps @@ -0,0 +1 @@ +../../wsj/s5/steps \ No newline at end of file diff --git a/egs/cnceleb/v1/utils b/egs/cnceleb/v1/utils new file mode 120000 index 00000000000..b240885218f --- /dev/null +++ b/egs/cnceleb/v1/utils @@ -0,0 +1 @@ +../../wsj/s5/utils \ No newline at end of file diff --git a/egs/commonvoice/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/commonvoice/s5/local/chain/tuning/run_tdnn_1a.sh index 635e3de1076..d4acd0fed4b 100755 --- a/egs/commonvoice/s5/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/commonvoice/s5/local/chain/tuning/run_tdnn_1a.sh @@ -141,7 +141,7 @@ if [ $stage -le 13 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/commonvoice/s5/local/prepare_dict.sh b/egs/commonvoice/s5/local/prepare_dict.sh index d6d1aba41fb..cdfffe42080 100755 --- a/egs/commonvoice/s5/local/prepare_dict.sh +++ b/egs/commonvoice/s5/local/prepare_dict.sh @@ -52,7 +52,7 @@ if [[ "$(uname)" == "Darwin" ]]; then alias readlink=greadlink fi -sequitur=$KALDI_ROOT/tools/sequitur +sequitur=$KALDI_ROOT/tools/sequitur-g2p export 
PATH=$PATH:$sequitur/bin export PYTHONPATH=$PYTHONPATH:`utils/make_absolute.sh $sequitur/lib/python*/site-packages` diff --git a/egs/csj/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/csj/s5/local/chain/tuning/run_tdnn_1a.sh index a463db77066..75ceb80e3e0 100755 --- a/egs/csj/s5/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/csj/s5/local/chain/tuning/run_tdnn_1a.sh @@ -133,7 +133,7 @@ if [ $stage -le 12 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/csj/s5/local/csj_data_prep.sh b/egs/csj/s5/local/csj_data_prep.sh index 55738bf0e37..69e2865e316 100755 --- a/egs/csj/s5/local/csj_data_prep.sh +++ b/egs/csj/s5/local/csj_data_prep.sh @@ -45,7 +45,9 @@ if [ ! -d $CSJ ]; then fi # CSJ dictionary file check -[ ! -f $dir/lexicon.txt ] && cp $CSJ/lexicon/lexicon.txt $dir || exit 1; +if [ ! -f $dir/lexicon.txt ]; then + cp $CSJ/lexicon/lexicon.txt $dir || exit 1; +fi ### Config of using wav data that relates with acoustic model training ### if [ $mode -eq 3 ] diff --git a/egs/csj/s5/local/csj_make_trans/csj_autorun.sh b/egs/csj/s5/local/csj_make_trans/csj_autorun.sh index f288e4fb4d3..5cd78ee94ae 100755 --- a/egs/csj/s5/local/csj_make_trans/csj_autorun.sh +++ b/egs/csj/s5/local/csj_make_trans/csj_autorun.sh @@ -61,7 +61,7 @@ if [ ! -e $outd/.done_make_trans ];then mkdir -p $outd/$vol/$id case "$csjv" in - "usb" ) TPATH="$resource/${SDB}$vol" ; WPATH="$resource/$WAV" ;; + "usb" ) TPATH="$resource/${SDB}$vol" ; WPATH="$resource/${WAV}$vol" ;; "dvd" ) TPATH="$resource/$vol/$id" ; WPATH="$resource/$vol/$id" ;; "merl" ) TPATH="$resource/$vol/$SDB" ; WPATH="$resource/$vol/$WAV" ;; esac diff --git a/egs/csj/s5/local/nnet/run_dnn_tandem_uc.sh b/egs/csj/s5/local/nnet/run_dnn_tandem_uc.sh index 4677ff473cb..297aed1f486 100755 --- a/egs/csj/s5/local/nnet/run_dnn_tandem_uc.sh +++ b/egs/csj/s5/local/nnet/run_dnn_tandem_uc.sh @@ -280,4 +280,4 @@ exit 0 %WER 14.88 [ 2557 / 17189, 556 ins, 359 del, 1642 sub ] exp/tandem2uc-tri4/decode_eval3_csj/wer_20_0.5 %WER 17.03 [ 2927 / 17189, 592 ins, 417 del, 1918 sub ] exp/tandem2uc-tri4/decode_eval3_csj.si/wer_20_1.0 %WER 13.44 [ 2311 / 17189, 430 ins, 340 del, 1541 sub ] exp/tandem2uc-tri4_mmi_b0.1/decode_eval3_csj/wer_20_1.0 -EOF \ No newline at end of file +EOF diff --git a/egs/csj/s5/local/run_sgmm2.sh b/egs/csj/s5/local/run_sgmm2.sh index 619c6c5d1ef..c66b43c4f7f 100755 --- a/egs/csj/s5/local/run_sgmm2.sh +++ b/egs/csj/s5/local/run_sgmm2.sh @@ -18,7 +18,7 @@ fi if [ ! -f exp/ubm5/final.ubm ]; then steps/train_ubm.sh --cmd "$train_cmd" 1400 data/train_nodup data/lang \ exp/tri4_ali_nodup exp/ubm5 || exit 1; -fi +fi # steps/train_sgmm2.sh --cmd "$train_cmd" \ steps/train_sgmm2_group.sh --cmd "$train_cmd" \ diff --git a/egs/dihard_2018/README.txt b/egs/dihard_2018/README.txt new file mode 100644 index 00000000000..a7a00c8bf4e --- /dev/null +++ b/egs/dihard_2018/README.txt @@ -0,0 +1,14 @@ + + This is a Kaldi recipe for The First DIHARD Speech Diarization Challenge. 
+ DIHARD is a new annual challenge focusing on "hard" diarization; that is, + speech diarization for challenging corpora where there is an expectation that + the current state-of-the-art will fare poorly, including, but not limited + to: clinical interviews, extended child language acquisition recordings, + YouTube videos and "speech in the wild" (e.g., recordings in restaurants) + See https://coml.lscp.ens.fr/dihard/index.html for details. + + The subdirectories "v1" and so on are different speaker diarization + recipes. The recipe in v1 demonstrates a standard approach using a + full-covariance GMM-UBM, i-vectors, PLDA scoring and agglomerative + hierarchical clustering. The example in v2 demonstrates DNN speaker + embeddings, PLDA scoring and agglomerative hierarchical clustering. diff --git a/egs/dihard_2018/v1/README.txt b/egs/dihard_2018/v1/README.txt new file mode 100644 index 00000000000..98bf3641b03 --- /dev/null +++ b/egs/dihard_2018/v1/README.txt @@ -0,0 +1,13 @@ + This recipe is the speaker diarization recipe for The First DIHARD Speech + Diarization Challenge (DIHARD 2018). There are two tracks in the DIHARD 2018 + competition , one uses oracle SAD (track1) and the other required that SAD + was performed from scratch (track2). This script is for track1. + + The recipe is closely based on the following paper: + http://www.danielpovey.com/files/2018_interspeech_dihard.pdf but doesn't + contain the VB refinement. The whole system mainly contains full-covariance + GMM-UBM, i-vector extractor (T-matrix), PLDA scoring and agglomerative + hierarchical clustering. The VoxCeleb datasets are used for training i-vectors + and PLDA. The development set of the DIHARD 2018 competition is used as + validation set to tune parameters. The system is tested on the DIHARD 2018 + evaluation set. diff --git a/egs/dihard_2018/v1/cmd.sh b/egs/dihard_2018/v1/cmd.sh new file mode 100755 index 00000000000..c35cd18f287 --- /dev/null +++ b/egs/dihard_2018/v1/cmd.sh @@ -0,0 +1,15 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl" + + diff --git a/egs/dihard_2018/v1/conf/mfcc.conf b/egs/dihard_2018/v1/conf/mfcc.conf new file mode 100644 index 00000000000..649cffb9de8 --- /dev/null +++ b/egs/dihard_2018/v1/conf/mfcc.conf @@ -0,0 +1,7 @@ +--sample-frequency=16000 +--frame-length=25 # the default is 25 +--low-freq=20 # the default. +--high-freq=7600 # the default is zero meaning use the Nyquist (8k in this case). 
+--num-mel-bins=30 +--num-ceps=24 +--snip-edges=false diff --git a/egs/dihard_2018/v1/conf/vad.conf b/egs/dihard_2018/v1/conf/vad.conf new file mode 100644 index 00000000000..a0ca2449b10 --- /dev/null +++ b/egs/dihard_2018/v1/conf/vad.conf @@ -0,0 +1,2 @@ +--vad-energy-threshold=5.5 +--vad-energy-mean-scale=0.5 diff --git a/egs/dihard_2018/v1/diarization b/egs/dihard_2018/v1/diarization new file mode 120000 index 00000000000..bad937c1444 --- /dev/null +++ b/egs/dihard_2018/v1/diarization @@ -0,0 +1 @@ +../../callhome_diarization/v1/diarization \ No newline at end of file diff --git a/egs/dihard_2018/v1/local/make_dihard_2018_dev.py b/egs/dihard_2018/v1/local/make_dihard_2018_dev.py new file mode 100755 index 00000000000..fa652da8b4c --- /dev/null +++ b/egs/dihard_2018/v1/local/make_dihard_2018_dev.py @@ -0,0 +1,57 @@ +#!/usr/bin/env python3 + +# This script is called by local/make_dihard_2018_dev.sh, and it creates the +# necessary files for DIHARD 2018 development directory. + +import sys, os + +def prepare_dihard_2018_dev(src_dir, data_dir): + wavscp_fi = open(data_dir + "/wav.scp" , 'w') + utt2spk_fi = open(data_dir + "/utt2spk" , 'w') + segments_fi = open(data_dir + "/segments" , 'w') + rttm_fi = open(data_dir + "/rttm" , 'w') + reco2num_spk_fi = open(data_dir + "/reco2num_spk" , 'w') + + for subdir, dirs, files in os.walk(src_dir): + for file in files: + filename = os.path.join(subdir, file) + if filename.endswith(".lab"): + utt = os.path.basename(filename).split(".")[0] + lines = open(filename, 'r').readlines() + segment_id = 0 + for line in lines: + start, end, speech = line.split() + segment_id_str = "{}_{}".format(utt, str(segment_id).zfill(4)) + segments_str = "{} {} {} {}\n".format(segment_id_str, utt, start, end) + utt2spk_str = "{} {}\n".format(segment_id_str, utt) + segments_fi.write(segments_str) + utt2spk_fi.write(utt2spk_str) + segment_id += 1 + wav_str = "{} sox -t flac {}/data/flac/{}.flac -t wav -r 16k "\ + "-b 16 - channels 1 |\n".format(utt, src_dir, utt) + wavscp_fi.write(wav_str) + with open("{}/data/rttm/{}.rttm".format(src_dir, utt), 'r') as fh: + rttm_str = fh.read() + rttm_fi.write(rttm_str) + with open("{}/data/rttm/{}.rttm".format(src_dir, utt), 'r') as fh: + rttm_list = fh.readlines() + spk_list = [(x.split())[7] for x in rttm_list] + num_spk = len(set(spk_list)) + reco2num_spk_fi.write("{} {}\n".format(utt, num_spk)) + wavscp_fi.close() + utt2spk_fi.close() + segments_fi.close() + rttm_fi.close() + reco2num_spk_fi.close() + return 0 + +def main(): + src_dir = sys.argv[1] + data_dir = sys.argv[2] + if not os.path.exists(data_dir): + os.makedirs(data_dir) + prepare_dihard_2018_dev(src_dir, data_dir) + return 0 + +if __name__=="__main__": + main() diff --git a/egs/dihard_2018/v1/local/make_dihard_2018_dev.sh b/egs/dihard_2018/v1/local/make_dihard_2018_dev.sh new file mode 100755 index 00000000000..cc48e2e792a --- /dev/null +++ b/egs/dihard_2018/v1/local/make_dihard_2018_dev.sh @@ -0,0 +1,22 @@ +#!/bin/bash +# Copyright 2018 Zili Huang +# Apache 2.0. +# +# This script, called by ../run.sh, creates the DIHARD 2018 development data directory. + +if [ $# != 2 ]; then + echo "Usage: $0 " + echo " e.g.: $0 /export/corpora/LDC/LDC2018E31 data/dihard_2018_dev" +fi + +path_to_dihard_2018_dev=$1 +data_dir=$2 + +echo "Preparing ${data_dir}..." 
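+# make_dihard_2018_dev.py walks ${path_to_dihard_2018_dev} for *.lab SAD files
+# (one "start end speech" line per segment) and expects the matching audio in
+# data/flac/ and reference RTTMs in data/rttm/; it writes wav.scp, utt2spk,
+# segments, rttm and reco2num_spk into ${data_dir}.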
+local/make_dihard_2018_dev.py ${path_to_dihard_2018_dev} ${data_dir} + +sort -k 2,2 -s ${data_dir}/rttm > ${data_dir}/rttm_tmp +mv ${data_dir}/rttm_tmp ${data_dir}/rttm +sort -k 1,1 -s ${data_dir}/reco2num_spk > ${data_dir}/reco2num_spk_tmp +mv ${data_dir}/reco2num_spk_tmp ${data_dir}/reco2num_spk +utils/fix_data_dir.sh ${data_dir} diff --git a/egs/dihard_2018/v1/local/make_dihard_2018_eval.py b/egs/dihard_2018/v1/local/make_dihard_2018_eval.py new file mode 100755 index 00000000000..2a8acbee58d --- /dev/null +++ b/egs/dihard_2018/v1/local/make_dihard_2018_eval.py @@ -0,0 +1,57 @@ +#!/usr/bin/env python3 + +# This script is called by local/make_dihard_2018_eval.sh, and it creates the +# necessary files for DIHARD 2018 evaluation directory. + +import sys, os + +def prepare_dihard_2018_eval(src_dir, data_dir): + wavscp_fi = open(data_dir + "/wav.scp" , 'w') + utt2spk_fi = open(data_dir + "/utt2spk" , 'w') + segments_fi = open(data_dir + "/segments" , 'w') + rttm_fi = open(data_dir + "/rttm" , 'w') + reco2num_spk_fi = open(data_dir + "/reco2num_spk" , 'w') + + for subdir, dirs, files in os.walk(src_dir): + for file in files: + filename = os.path.join(subdir, file) + if filename.endswith(".lab"): + utt = os.path.basename(filename).split(".")[0] + lines = open(filename, 'r').readlines() + segment_id = 0 + for line in lines: + start, end, speech = line.split() + segment_id_str = "{}_{}".format(utt, str(segment_id).zfill(4)) + segments_str = "{} {} {} {}\n".format(segment_id_str, utt, start, end) + utt2spk_str = "{} {}\n".format(segment_id_str, utt) + segments_fi.write(segments_str) + utt2spk_fi.write(utt2spk_str) + segment_id += 1 + wav_str = "{} sox -t flac {}/data/flac/{}.flac -t wav -r 16k "\ + "-b 16 - channels 1 |\n".format(utt, src_dir, utt) + wavscp_fi.write(wav_str) + with open("{}/data/rttm/{}.rttm".format(src_dir, utt), 'r') as fh: + rttm_str = fh.read() + rttm_fi.write(rttm_str) + with open("{}/data/rttm/{}.rttm".format(src_dir, utt), 'r') as fh: + rttm_list = fh.readlines() + spk_list = [(x.split())[7] for x in rttm_list] + num_spk = len(set(spk_list)) + reco2num_spk_fi.write("{} {}\n".format(utt, num_spk)) + wavscp_fi.close() + utt2spk_fi.close() + segments_fi.close() + rttm_fi.close() + reco2num_spk_fi.close() + return 0 + +def main(): + src_dir = sys.argv[1] + data_dir = sys.argv[2] + if not os.path.exists(data_dir): + os.makedirs(data_dir) + prepare_dihard_2018_eval(src_dir, data_dir) + return 0 + +if __name__=="__main__": + main() diff --git a/egs/dihard_2018/v1/local/make_dihard_2018_eval.sh b/egs/dihard_2018/v1/local/make_dihard_2018_eval.sh new file mode 100755 index 00000000000..0a461c635ec --- /dev/null +++ b/egs/dihard_2018/v1/local/make_dihard_2018_eval.sh @@ -0,0 +1,22 @@ +#!/bin/bash +# Copyright 2018 Zili Huang +# Apache 2.0. +# +# This script, called by ../run.sh, creates the DIHARD 2018 evaluation directory. + +if [ $# != 2 ]; then + echo "Usage: $0 " + echo " e.g.: $0 /export/corpora/LDC/LDC2018E32v1.1 data/dihard_2018_eval" +fi + +path_to_dihard_2018_eval=$1 +data_dir=$2 + +echo "Preparing ${data_dir}..." 
+local/make_dihard_2018_eval.py ${path_to_dihard_2018_eval} ${data_dir} + +sort -k 2,2 -s ${data_dir}/rttm > ${data_dir}/rttm_tmp +mv ${data_dir}/rttm_tmp ${data_dir}/rttm +sort -k 1,1 -s ${data_dir}/reco2num_spk > ${data_dir}/reco2num_spk_tmp +mv ${data_dir}/reco2num_spk_tmp ${data_dir}/reco2num_spk +utils/fix_data_dir.sh ${data_dir} diff --git a/egs/dihard_2018/v1/local/make_voxceleb1.pl b/egs/dihard_2018/v1/local/make_voxceleb1.pl new file mode 100755 index 00000000000..2268c20ab52 --- /dev/null +++ b/egs/dihard_2018/v1/local/make_voxceleb1.pl @@ -0,0 +1,130 @@ +#!/usr/bin/perl +# +# Copyright 2018 Ewald Enzinger +# 2018 David Snyder +# +# Usage: make_voxceleb1.pl /export/voxceleb1 data/ + +if (@ARGV != 2) { + print STDERR "Usage: $0 \n"; + print STDERR "e.g. $0 /export/voxceleb1 data/\n"; + exit(1); +} + +($data_base, $out_dir) = @ARGV; +my $out_test_dir = "$out_dir/voxceleb1_test"; +my $out_train_dir = "$out_dir/voxceleb1_train"; + +if (system("mkdir -p $out_test_dir") != 0) { + die "Error making directory $out_test_dir"; +} + +if (system("mkdir -p $out_train_dir") != 0) { + die "Error making directory $out_train_dir"; +} + +opendir my $dh, "$data_base/voxceleb1_wav" or die "Cannot open directory: $!"; +my @spkr_dirs = grep {-d "$data_base/voxceleb1_wav/$_" && ! /^\.{1,2}$/} readdir($dh); +closedir $dh; + +if (! -e "$data_base/voxceleb1_test.txt") { + system("wget -O $data_base/voxceleb1_test.txt http://www.openslr.org/resources/49/voxceleb1_test.txt"); +} + +if (! -e "$data_base/vox1_meta.csv") { + system("wget -O $data_base/vox1_meta.csv http://www.openslr.org/resources/49/vox1_meta.csv"); +} + +open(TRIAL_IN, "<", "$data_base/voxceleb1_test.txt") or die "Could not open the verification trials file $data_base/voxceleb1_test.txt"; +open(META_IN, "<", "$data_base/vox1_meta.csv") or die "Could not open the meta data file $data_base/vox1_meta.csv"; +open(SPKR_TEST, ">", "$out_test_dir/utt2spk") or die "Could not open the output file $out_test_dir/utt2spk"; +open(WAV_TEST, ">", "$out_test_dir/wav.scp") or die "Could not open the output file $out_test_dir/wav.scp"; +open(SPKR_TRAIN, ">", "$out_train_dir/utt2spk") or die "Could not open the output file $out_train_dir/utt2spk"; +open(WAV_TRAIN, ">", "$out_train_dir/wav.scp") or die "Could not open the output file $out_train_dir/wav.scp"; +open(TRIAL_OUT, ">", "$out_test_dir/trials") or die "Could not open the output file $out_test_dir/trials"; + +my %id2spkr = (); +while () { + chomp; + my ($vox_id, $spkr_id, $gender, $nation, $set) = split; + $id2spkr{$vox_id} = $spkr_id; +} + +my $test_spkrs = (); +while () { + chomp; + my ($tar_or_non, $path1, $path2) = split; + + # Create entry for left-hand side of trial + my ($spkr_id, $filename) = split('/', $path1); + my $rec_id = substr($filename, 0, 11); + my $segment = substr($filename, 12, 7); + my $utt_id1 = "$spkr_id-$rec_id-$segment"; + $test_spkrs{$spkr_id} = (); + + # Create entry for right-hand side of trial + my ($spkr_id, $filename) = split('/', $path2); + my $rec_id = substr($filename, 0, 11); + my $segment = substr($filename, 12, 7); + my $utt_id2 = "$spkr_id-$rec_id-$segment"; + $test_spkrs{$spkr_id} = (); + + my $target = "nontarget"; + if ($tar_or_non eq "1") { + $target = "target"; + } + print TRIAL_OUT "$utt_id1 $utt_id2 $target\n"; +} + +foreach (@spkr_dirs) { + my $spkr_id = $_; + my $new_spkr_id = $spkr_id; + # If we're using a newer version of VoxCeleb1, we need to "deanonymize" + # the speaker labels. 
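+  # %id2spkr (built from vox1_meta.csv above) maps the newer anonymized
+  # VoxCeleb1 IDs to the original speaker labels, keeping utterance and
+  # speaker IDs consistent with the older directory layout.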
+ if (exists $id2spkr{$spkr_id}) { + $new_spkr_id = $id2spkr{$spkr_id}; + } + opendir my $dh, "$data_base/voxceleb1_wav/$spkr_id/" or die "Cannot open directory: $!"; + my @files = map{s/\.[^.]+$//;$_}grep {/\.wav$/} readdir($dh); + closedir $dh; + foreach (@files) { + my $filename = $_; + my $rec_id = substr($filename, 0, 11); + my $segment = substr($filename, 12, 7); + my $wav = "$data_base/voxceleb1_wav/$spkr_id/$filename.wav"; + my $utt_id = "$new_spkr_id-$rec_id-$segment"; + if (exists $test_spkrs{$new_spkr_id}) { + print WAV_TEST "$utt_id", " $wav", "\n"; + print SPKR_TEST "$utt_id", " $new_spkr_id", "\n"; + } else { + print WAV_TRAIN "$utt_id", " $wav", "\n"; + print SPKR_TRAIN "$utt_id", " $new_spkr_id", "\n"; + } + } +} + +close(SPKR_TEST) or die; +close(WAV_TEST) or die; +close(SPKR_TRAIN) or die; +close(WAV_TRAIN) or die; +close(TRIAL_OUT) or die; +close(TRIAL_IN) or die; +close(META_IN) or die; + +if (system( + "utils/utt2spk_to_spk2utt.pl $out_test_dir/utt2spk >$out_test_dir/spk2utt") != 0) { + die "Error creating spk2utt file in directory $out_test_dir"; +} +system("env LC_COLLATE=C utils/fix_data_dir.sh $out_test_dir"); +if (system("env LC_COLLATE=C utils/validate_data_dir.sh --no-text --no-feats $out_test_dir") != 0) { + die "Error validating directory $out_test_dir"; +} + +if (system( + "utils/utt2spk_to_spk2utt.pl $out_train_dir/utt2spk >$out_train_dir/spk2utt") != 0) { + die "Error creating spk2utt file in directory $out_train_dir"; +} +system("env LC_COLLATE=C utils/fix_data_dir.sh $out_train_dir"); +if (system("env LC_COLLATE=C utils/validate_data_dir.sh --no-text --no-feats $out_train_dir") != 0) { + die "Error validating directory $out_train_dir"; +} diff --git a/egs/dihard_2018/v1/local/make_voxceleb1_v2.pl b/egs/dihard_2018/v1/local/make_voxceleb1_v2.pl new file mode 100755 index 00000000000..0bc13bea251 --- /dev/null +++ b/egs/dihard_2018/v1/local/make_voxceleb1_v2.pl @@ -0,0 +1,123 @@ +#!/usr/bin/perl +# +# Copyright 2018 Ewald Enzinger +# 2018 David Snyder +# 2019 Soonshin Seo +# +# Usage: make_voxceleb1_v2.pl /export/voxceleb1 dev data/dev +# +# The VoxCeleb1 corpus underwent several updates that changed the directory and speaker ID format. +# The script 'make_voxceleb1.pl' works for the oldest version of the corpus. +# This script should be used if you've downloaded the corpus recently. + +if (@ARGV != 3) { + print STDERR "Usage: $0 \n"; + print STDERR "e.g. $0 /export/voxceleb1 dev data/dev\n"; + exit(1); +} + +($data_base, $dataset, $out_dir) = @ARGV; + +if ("$dataset" ne "dev" && "$dataset" ne "test") { + die "dataset parameter must be 'dev' or 'test'!"; +} + +if (system("mkdir -p $out_dir") != 0) { + die "Error making directory $out_dir"; +} +print "$data_base/$dataset/wav\n"; +opendir my $dh, "$data_base/$dataset/wav" or die "Cannot open directory: $!"; +my @spkr_dirs = grep {-d "$data_base/$dataset/wav/$_" && ! /^\.{1,2}$/} readdir($dh); +closedir $dh; + +if ($dataset eq "dev"){ + open(SPKR_TRAIN, ">", "$out_dir/utt2spk") or die "could not open the output file $out_dir/utt2spk"; + open(WAV_TRAIN, ">", "$out_dir/wav.scp") or die "could not open the output file $out_dir/wav.scp"; + + foreach (@spkr_dirs) { + my $spkr_id = $_; + opendir my $dh, "$data_base/$dataset/wav/$spkr_id/" or die "Cannot open directory: $!"; + my @rec_dirs = grep {-d "$data_base/$dataset/wav/$spkr_id/$_" && ! 
/^\.{1,2}$/} readdir($dh); + closedir $dh; + foreach (@rec_dirs) { + my $rec_id = $_; + opendir my $dh, "$data_base/$dataset/wav/$spkr_id/$rec_id/" or die "Cannot open directory: $!"; + my @files = map{s/\.[^.]+$//;$_}grep {/\.wav$/} readdir($dh); + closedir $dh; + foreach (@files) { + my $name = $_; + my $wav = "$data_base/$dataset/wav/$spkr_id/$rec_id/$name.wav"; + my $utt_id = "$spkr_id-$rec_id-$name"; + print WAV_TRAIN "$utt_id", " $wav", "\n"; + print SPKR_TRAIN "$utt_id", " $spkr_id", "\n"; + } + } + } + close(SPKR_TRAIN) or die; + close(WAV_TRAIN) or die; +} + +if ($dataset eq "test"){ + if (! -e "$data_base/voxceleb1_test_v2.txt") { + system("wget -O $data_base/voxceleb1_test_v2.txt http://www.openslr.org/resources/49/voxceleb1_test_v2.txt"); + } + + open(TRIAL_IN, "<", "$data_base/voxceleb1_test_v2.txt") or die "could not open the verification trials file $data_base/voxceleb1_test_v2.txt"; + open(TRIAL_OUT, ">", "$out_dir/trials") or die "Could not open the output file $out_test_dir/trials"; + open(SPKR_TEST, ">", "$out_dir/utt2spk") or die "could not open the output file $out_dir/utt2spk"; + open(WAV_TEST, ">", "$out_dir/wav.scp") or die "could not open the output file $out_dir/wav.scp"; + + my $test_spkrs = (); + while () { + chomp; + my ($tar_or_non, $path1, $path2) = split; + # Create entry for left-hand side of trial + my ($spkr_id, $rec_id, $name) = split('/', $path1); + my $utt_id1 = "$spkr_id-$rec_id-$name"; + $test_spkrs{$spkr_id} = (); + + # Create entry for right-hand side of trial + my ($spkr_id, $rec_id, $name) = split('/', $path2); + my $utt_id2 = "$spkr_id-$rec_id-$name"; + $test_spkrs{$spkr_id} = (); + + my $target = "nontarget"; + if ($tar_or_non eq "1") { + $target = "target"; + } + print TRIAL_OUT "$utt_id1 $utt_id2 $target\n"; + } + + foreach (@spkr_dirs) { + my $spkr_id = $_; + opendir my $dh, "$data_base/$dataset/wav/$spkr_id/" or die "Cannot open directory: $!"; + my @rec_dirs = grep {-d "$data_base/$dataset/wav/$spkr_id/$_" && ! /^\.{1,2}$/} readdir($dh); + closedir $dh; + foreach (@rec_dirs) { + my $rec_id = $_; + opendir my $dh, "$data_base/$dataset/wav/$spkr_id/$rec_id/" or die "Cannot open directory: $!"; + my @files = map{s/\.[^.]+$//;$_}grep {/\.wav$/} readdir($dh); + closedir $dh; + foreach (@files) { + my $name = $_; + my $wav = "$data_base/$dataset/wav/$spkr_id/$rec_id/$name.wav"; + my $utt_id = "$spkr_id-$rec_id-$name"; + print WAV_TEST "$utt_id", " $wav", "\n"; + print SPKR_TEST "$utt_id", " $spkr_id", "\n"; + } + } + } + close(SPKR_TEST) or die; + close(WAV_TEST) or die; + close(TRIAL_OUT) or die; + close(TRIAL_IN) or die; +} + +if (system( + "utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) { + die "Error creating spk2utt file in directory $out_dir"; +} +system("env LC_COLLATE=C utils/fix_data_dir.sh $out_dir"); +if (system("env LC_COLLATE=C utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) { + die "Error validating directory $out_dir"; +} diff --git a/egs/dihard_2018/v1/local/make_voxceleb2.pl b/egs/dihard_2018/v1/local/make_voxceleb2.pl new file mode 100755 index 00000000000..34c1591eba3 --- /dev/null +++ b/egs/dihard_2018/v1/local/make_voxceleb2.pl @@ -0,0 +1,70 @@ +#!/usr/bin/perl +# +# Copyright 2018 Ewald Enzinger +# +# Usage: make_voxceleb2.pl /export/voxceleb2 dev data/dev +# +# Note: This script requires ffmpeg to be installed and its location included in $PATH. + +if (@ARGV != 3) { + print STDERR "Usage: $0 \n"; + print STDERR "e.g. 
$0 /export/voxceleb2 dev data/dev\n"; + exit(1); +} + +# Check that ffmpeg is installed. +if (`which ffmpeg` eq "") { + die "Error: this script requires that ffmpeg is installed."; +} + +($data_base, $dataset, $out_dir) = @ARGV; + +if ("$dataset" ne "dev" && "$dataset" ne "test") { + die "dataset parameter must be 'dev' or 'test'!"; +} + +opendir my $dh, "$data_base/$dataset/aac" or die "Cannot open directory: $!"; +my @spkr_dirs = grep {-d "$data_base/$dataset/aac/$_" && ! /^\.{1,2}$/} readdir($dh); +closedir $dh; + +if (system("mkdir -p $out_dir") != 0) { + die "Error making directory $out_dir"; +} + +open(SPKR, ">", "$out_dir/utt2spk") or die "Could not open the output file $out_dir/utt2spk"; +open(WAV, ">", "$out_dir/wav.scp") or die "Could not open the output file $out_dir/wav.scp"; + +foreach (@spkr_dirs) { + my $spkr_id = $_; + + opendir my $dh, "$data_base/$dataset/aac/$spkr_id/" or die "Cannot open directory: $!"; + my @rec_dirs = grep {-d "$data_base/$dataset/aac/$spkr_id/$_" && ! /^\.{1,2}$/} readdir($dh); + closedir $dh; + + foreach (@rec_dirs) { + my $rec_id = $_; + + opendir my $dh, "$data_base/$dataset/aac/$spkr_id/$rec_id/" or die "Cannot open directory: $!"; + my @files = map{s/\.[^.]+$//;$_}grep {/\.m4a$/} readdir($dh); + closedir $dh; + + foreach (@files) { + my $name = $_; + my $wav = "ffmpeg -v 8 -i $data_base/$dataset/aac/$spkr_id/$rec_id/$name.m4a -f wav -acodec pcm_s16le -|"; + my $utt_id = "$spkr_id-$rec_id-$name"; + print WAV "$utt_id", " $wav", "\n"; + print SPKR "$utt_id", " $spkr_id", "\n"; + } + } +} +close(SPKR) or die; +close(WAV) or die; + +if (system( + "utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) { + die "Error creating spk2utt file in directory $out_dir"; +} +system("env LC_COLLATE=C utils/fix_data_dir.sh $out_dir"); +if (system("env LC_COLLATE=C utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) { + die "Error validating directory $out_dir"; +} diff --git a/egs/dihard_2018/v1/local/prepare_feats.sh b/egs/dihard_2018/v1/local/prepare_feats.sh new file mode 100755 index 00000000000..9fa70a2d91e --- /dev/null +++ b/egs/dihard_2018/v1/local/prepare_feats.sh @@ -0,0 +1,91 @@ +#!/bin/bash +# +# Apache 2.0. + +# This script adds deltas, applies sliding window CMVN and writes the features to disk. +# +# Although this kind of script isn't necessary in speaker recognition recipes, +# it can be helpful in the diarization recipes. The script +# diarization/extract_ivectors.sh extracts i-vectors from very +# short (e.g., 1-2 seconds) segments. Therefore, in order to apply the sliding +# window CMVN in a meaningful way, it must be performed prior to performing +# the subsegmentation. + +nj=40 +cmd="run.pl" +stage=0 +norm_vars=false +center=true +compress=true +cmn_window=300 +delta_window=3 +delta_order=2 + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; +if [ $# != 3 ]; then + echo "Usage: $0 " + echo "e.g.: $0 data/train data/train_no_sil exp/make_ivector_features" + echo "Options: " + echo " --nj # number of parallel jobs" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --norm-vars # If true, normalize variances in the sliding window cmvn" + exit 1; +fi + +data_in=$1 +data_out=$2 +dir=$3 + +name=`basename $data_in` + +for f in $data_in/feats.scp ; do + [ ! -f $f ] && echo "$0: No such file $f" && exit 1; +done + +# Set various variables. 
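+# The actual feature processing happens further down: add-deltas, then
+# apply-cmvn-sliding over a $cmn_window-frame window, then copy-feats, which
+# writes one compressed ark/scp pair per job under $featdir.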
+mkdir -p $dir/log +mkdir -p $data_out +featdir=$(utils/make_absolute.sh $dir) + +if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $featdir/storage ]; then + utils/create_split_dir.pl \ + /export/b{14,15,16,17}/$USER/kaldi-data/egs/dihard_2018/v1/ivector-$(date +'%m_%d_%H_%M')/ivector_cmvn_feats/storage $featdir/storage +fi + +for n in $(seq $nj); do + # the next command does nothing unless $featdir/storage/ exists, see + # utils/create_data_link.pl for more info. + utils/create_data_link.pl $featdir/ivector_cmvn_feats_${name}.${n}.ark +done + +cp $data_in/utt2spk $data_out/utt2spk +cp $data_in/spk2utt $data_out/spk2utt +cp $data_in/wav.scp $data_out/wav.scp + +write_num_frames_opt="--write-num-frames=ark,t:$featdir/log/utt2num_frames.JOB" + +sdata_in=$data_in/split$nj; +utils/split_data.sh $data_in $nj || exit 1; + +delta_opts="--delta-window=$delta_window --delta-order=$delta_order" + +$cmd JOB=1:$nj $dir/log/create_ivector_cmvn_feats_${name}.JOB.log \ + add-deltas $delta_opts scp:${sdata_in}/JOB/feats.scp ark:- \| \ + apply-cmvn-sliding --norm-vars=false --center=true --cmn-window=$cmn_window \ + ark:- ark:- \| \ + copy-feats --compress=$compress $write_num_frames_opt ark:- \ + ark,scp:$featdir/ivector_cmvn_feats_${name}.JOB.ark,$featdir/ivector_cmvn_feats_${name}.JOB.scp || exit 1; + +for n in $(seq $nj); do + cat $featdir/ivector_cmvn_feats_${name}.$n.scp || exit 1; +done > ${data_out}/feats.scp || exit 1 + +for n in $(seq $nj); do + cat $featdir/log/utt2num_frames.$n || exit 1; +done > $data_out/utt2num_frames || exit 1 +rm $featdir/log/utt2num_frames.* + +echo "$0: Succeeded creating ivector features for $name" diff --git a/egs/dihard_2018/v1/path.sh b/egs/dihard_2018/v1/path.sh new file mode 100755 index 00000000000..851c14e27c3 --- /dev/null +++ b/egs/dihard_2018/v1/path.sh @@ -0,0 +1,5 @@ +export KALDI_ROOT=`pwd`/../../.. +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sph2pipe_v2.5:$KALDI_ROOT/tools/sctk/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C diff --git a/egs/dihard_2018/v1/run.sh b/egs/dihard_2018/v1/run.sh new file mode 100755 index 00000000000..eb23ac500cd --- /dev/null +++ b/egs/dihard_2018/v1/run.sh @@ -0,0 +1,240 @@ +#!/bin/bash +# Copyright 2017 Johns Hopkins University (Author: Daniel Garcia-Romero) +# 2017 Johns Hopkins University (Author: Daniel Povey) +# 2017-2018 David Snyder +# 2018 Ewald Enzinger +# 2018 Zili Huang +# Apache 2.0. +# +# See ../README.txt for more info on data required. +# Results (diarization error rate) are inline in comments below. + +. ./cmd.sh +. ./path.sh +set -e +mfccdir=`pwd`/mfcc +vaddir=`pwd`/mfcc + +voxceleb1_root=/export/corpora/VoxCeleb1 +voxceleb2_root=/export/corpora/VoxCeleb2 +dihard_2018_dev=/export/corpora/LDC/LDC2018E31 +dihard_2018_eval=/export/corpora/LDC/LDC2018E32v1.1 +num_components=2048 +ivector_dim=400 +ivec_dir=exp/extractor_c${num_components}_i${ivector_dim} + +stage=0 + +if [ $stage -le 0 ]; then + local/make_voxceleb2.pl $voxceleb2_root dev data/voxceleb2_train + local/make_voxceleb2.pl $voxceleb2_root test data/voxceleb2_test + + # Now prepare the VoxCeleb1 train and test data. 
If you downloaded the corpus soon + # after it was first released, you may need to use an older version of the script, which + # can be invoked as follows: + # local/make_voxceleb1.pl $voxceleb1_root data + local/make_voxceleb1_v2.pl $voxceleb1_root dev data/voxceleb1_train + local/make_voxceleb1_v2.pl $voxceleb1_root test data/voxceleb1_test + + # We'll train on all of VoxCeleb2, plus the training portion of VoxCeleb1. + # This should give 7,351 speakers and 1,277,503 utterances. + utils/combine_data.sh data/train data/voxceleb2_train data/voxceleb2_test data/voxceleb1_train + + # Prepare the development and evaluation set for DIHARD 2018. + local/make_dihard_2018_dev.sh $dihard_2018_dev data/dihard_2018_dev + local/make_dihard_2018_eval.sh $dihard_2018_eval data/dihard_2018_eval +fi + +if [ $stage -le 1 ]; then + # Make MFCCs for each dataset + for name in train dihard_2018_dev dihard_2018_eval; do + steps/make_mfcc.sh --write-utt2num-frames true \ + --mfcc-config conf/mfcc.conf --nj 40 --cmd "$train_cmd --max-jobs-run 20" \ + data/${name} exp/make_mfcc $mfccdir + utils/fix_data_dir.sh data/${name} + done + + # Compute the energy-based VAD for train + sid/compute_vad_decision.sh --nj 40 --cmd "$train_cmd" \ + data/train exp/make_vad $vaddir + utils/fix_data_dir.sh data/train + + # This writes features to disk after adding deltas and applying the sliding window CMN. + # Although this is somewhat wasteful in terms of disk space, for diarization + # it ends up being preferable to performing the CMN in memory. If the CMN + # were performed in memory it would need to be performed after the subsegmentation, + # which leads to poorer results. + for name in train dihard_2018_dev dihard_2018_eval; do + local/prepare_feats.sh --nj 40 --cmd "$train_cmd" \ + data/$name data/${name}_cmn exp/${name}_cmn + if [ -f data/$name/vad.scp ]; then + cp data/$name/vad.scp data/${name}_cmn/ + fi + if [ -f data/$name/segments ]; then + cp data/$name/segments data/${name}_cmn/ + fi + utils/fix_data_dir.sh data/${name}_cmn + done + + echo "0.01" > data/train_cmn/frame_shift + # Create segments to extract i-vectors from for PLDA training data. + # The segments are created using an energy-based speech activity + # detection (SAD) system, but this is not necessary. You can replace + # this with segments computed from your favorite SAD. + diarization/vad_to_segments.sh --nj 40 --cmd "$train_cmd" \ + data/train_cmn data/train_cmn_segmented +fi + +if [ $stage -le 2 ]; then + # Train the UBM on VoxCeleb 1 and 2. + sid/train_diag_ubm.sh --cmd "$train_cmd --mem 4G" \ + --nj 40 --num-threads 8 \ + data/train $num_components \ + exp/diag_ubm + + sid/train_full_ubm.sh --cmd "$train_cmd --mem 25G" \ + --nj 40 --remove-low-count-gaussians false \ + data/train \ + exp/diag_ubm exp/full_ubm +fi + +if [ $stage -le 3 ]; then + # In this stage, we train the i-vector extractor on a subset of VoxCeleb 1 + # and 2. + # + # Note that there are well over 1 million utterances in our training set, + # and it takes an extremely long time to train the extractor on all of this. + # Also, most of those utterances are very short. Short utterances are + # harmful for training the i-vector extractor. Therefore, to reduce the + # training time and improve performance, we will only train on the 100k + # longest utterances. + utils/subset_data_dir.sh \ + --utt-list <(sort -n -k 2 data/train/utt2num_frames | tail -n 100000) \ + data/train data/train_100k + + # Train the i-vector extractor. 
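+  # Before launching the (comparatively expensive) extractor training below,
+  # it can be worth a quick sanity check of the subset, e.g.:
+  #   wc -l data/train_100k/utt2spk   # should be (about) 100000 utterances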
+ sid/train_ivector_extractor.sh --cmd "$train_cmd --mem 16G" \ + --ivector-dim $ivector_dim --num-iters 5 \ + exp/full_ubm/final.ubm data/train_100k \ + $ivec_dir +fi + +if [ $stage -le 4 ]; then + # Extract i-vectors for DIHARD 2018 development and evaluation set. + # We set apply-cmn false and apply-deltas false because we already add + # deltas and apply cmn in stage 1. + diarization/extract_ivectors.sh --cmd "$train_cmd --mem 20G" \ + --nj 40 --window 1.5 --period 0.75 --apply-cmn false --apply-deltas false \ + --min-segment 0.5 $ivec_dir \ + data/dihard_2018_dev_cmn $ivec_dir/ivectors_dihard_2018_dev + + diarization/extract_ivectors.sh --cmd "$train_cmd --mem 20G" \ + --nj 40 --window 1.5 --period 0.75 --apply-cmn false --apply-deltas false \ + --min-segment 0.5 $ivec_dir \ + data/dihard_2018_eval_cmn $ivec_dir/ivectors_dihard_2018_eval + + # Reduce the amount of training data for the PLDA training. + utils/subset_data_dir.sh data/train_cmn_segmented 128000 data/train_cmn_segmented_128k + # Extract i-vectors for the VoxCeleb, which is our PLDA training + # data. A long period is used here so that we don't compute too + # many i-vectors for each recording. + diarization/extract_ivectors.sh --cmd "$train_cmd --mem 25G" \ + --nj 40 --window 3.0 --period 10.0 --min-segment 1.5 --apply-cmn false --apply-deltas false \ + --hard-min true $ivec_dir \ + data/train_cmn_segmented_128k $ivec_dir/ivectors_train_segmented_128k +fi + +if [ $stage -le 5 ]; then + # Train a PLDA model on VoxCeleb, using DIHARD 2018 development set to whiten. + "$train_cmd" $ivec_dir/ivectors_dihard_2018_dev/log/plda.log \ + ivector-compute-plda ark:$ivec_dir/ivectors_train_segmented_128k/spk2utt \ + "ark:ivector-subtract-global-mean \ + scp:$ivec_dir/ivectors_train_segmented_128k/ivector.scp ark:- \ + | transform-vec $ivec_dir/ivectors_dihard_2018_dev/transform.mat ark:- ark:- \ + | ivector-normalize-length ark:- ark:- |" \ + $ivec_dir/ivectors_dihard_2018_dev/plda || exit 1; +fi + +# Perform PLDA scoring +if [ $stage -le 6 ]; then + # Perform PLDA scoring on all pairs of segments for each recording. + diarization/score_plda.sh --cmd "$train_cmd --mem 4G" \ + --nj 20 $ivec_dir/ivectors_dihard_2018_dev $ivec_dir/ivectors_dihard_2018_dev \ + $ivec_dir/ivectors_dihard_2018_dev/plda_scores + + diarization/score_plda.sh --cmd "$train_cmd --mem 4G" \ + --nj 20 $ivec_dir/ivectors_dihard_2018_dev $ivec_dir/ivectors_dihard_2018_eval \ + $ivec_dir/ivectors_dihard_2018_eval/plda_scores +fi + +# Cluster the PLDA scores using a stopping threshold. +if [ $stage -le 7 ]; then + # First, we find the threshold that minimizes the DER on DIHARD 2018 development set. + mkdir -p $ivec_dir/tuning + echo "Tuning clustering threshold for DIHARD 2018 development set" + best_der=100 + best_threshold=0 + + # The threshold is in terms of the log likelihood ratio provided by the + # PLDA scores. In a perfectly calibrated system, the threshold is 0. + # In the following loop, we evaluate DER performance on DIHARD 2018 development + # set using some reasonable thresholds for a well-calibrated system. 
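+  # Once the sweep below has finished, the full threshold-vs-DER table can be
+  # inspected with something like:
+  #   grep 'DIARIZATION ERROR' $ivec_dir/tuning/dihard_2018_dev_t*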
+ for threshold in -0.5 -0.4 -0.3 -0.2 -0.1 -0.05 0 0.05 0.1 0.2 0.3 0.4 0.5; do + diarization/cluster.sh --cmd "$train_cmd --mem 4G" --nj 20 \ + --threshold $threshold --rttm-channel 1 $ivec_dir/ivectors_dihard_2018_dev/plda_scores \ + $ivec_dir/ivectors_dihard_2018_dev/plda_scores_t$threshold + + md-eval.pl -r data/dihard_2018_dev/rttm \ + -s $ivec_dir/ivectors_dihard_2018_dev/plda_scores_t$threshold/rttm \ + 2> $ivec_dir/tuning/dihard_2018_dev_t${threshold}.log \ + > $ivec_dir/tuning/dihard_2018_dev_t${threshold} + + der=$(grep -oP 'DIARIZATION\ ERROR\ =\ \K[0-9]+([.][0-9]+)?' \ + $ivec_dir/tuning/dihard_2018_dev_t${threshold}) + if [ $(perl -e "print ($der < $best_der ? 1 : 0);") -eq 1 ]; then + best_der=$der + best_threshold=$threshold + fi + done + echo "$best_threshold" > $ivec_dir/tuning/dihard_2018_dev_best + + diarization/cluster.sh --cmd "$train_cmd --mem 4G" --nj 20 \ + --threshold $(cat $ivec_dir/tuning/dihard_2018_dev_best) --rttm-channel 1 \ + $ivec_dir/ivectors_dihard_2018_dev/plda_scores $ivec_dir/ivectors_dihard_2018_dev/plda_scores + + # Cluster DIHARD 2018 evaluation set using the best threshold found for the DIHARD + # 2018 development set. The DIHARD 2018 development set is used as the validation + # set to tune the parameters. + diarization/cluster.sh --cmd "$train_cmd --mem 4G" --nj 20 \ + --threshold $(cat $ivec_dir/tuning/dihard_2018_dev_best) --rttm-channel 1 \ + $ivec_dir/ivectors_dihard_2018_eval/plda_scores $ivec_dir/ivectors_dihard_2018_eval/plda_scores + + mkdir -p $ivec_dir/results + # Compute the DER on the DIHARD 2018 evaluation set. We use the official metrics of + # the DIHARD challenge. The DER is calculated with no unscored collars and including + # overlapping speech. + md-eval.pl -r data/dihard_2018_eval/rttm \ + -s $ivec_dir/ivectors_dihard_2018_eval/plda_scores/rttm 2> $ivec_dir/results/threshold.log \ + > $ivec_dir/results/DER_threshold.txt + der=$(grep -oP 'DIARIZATION\ ERROR\ =\ \K[0-9]+([.][0-9]+)?' \ + $ivec_dir/results/DER_threshold.txt) + # Using supervised calibration, DER: 28.51% + echo "Using supervised calibration, DER: $der%" +fi + +# Cluster the PLDA scores using the oracle number of speakers +if [ $stage -le 8 ]; then + # In this section, we show how to do the clustering if the number of speakers + # (and therefore, the number of clusters) per recording is known in advance. + diarization/cluster.sh --cmd "$train_cmd --mem 4G" --nj 20 \ + --reco2num-spk data/dihard_2018_eval/reco2num_spk --rttm-channel 1 \ + $ivec_dir/ivectors_dihard_2018_eval/plda_scores $ivec_dir/ivectors_dihard_2018_eval/plda_scores_num_spk + + md-eval.pl -r data/dihard_2018_eval/rttm \ + -s $ivec_dir/ivectors_dihard_2018_eval/plda_scores_num_spk/rttm 2> $ivec_dir/results/num_spk.log \ + > $ivec_dir/results/DER_num_spk.txt + der=$(grep -oP 'DIARIZATION\ ERROR\ =\ \K[0-9]+([.][0-9]+)?' 
\ + $ivec_dir/results/DER_num_spk.txt) + # Using the oracle number of speakers, DER: 24.42% + echo "Using the oracle number of speakers, DER: $der%" +fi diff --git a/egs/dihard_2018/v1/sid b/egs/dihard_2018/v1/sid new file mode 120000 index 00000000000..893a12f30c9 --- /dev/null +++ b/egs/dihard_2018/v1/sid @@ -0,0 +1 @@ +../../sre08/v1/sid \ No newline at end of file diff --git a/egs/dihard_2018/v1/steps b/egs/dihard_2018/v1/steps new file mode 120000 index 00000000000..6e99bf5b5ad --- /dev/null +++ b/egs/dihard_2018/v1/steps @@ -0,0 +1 @@ +../../wsj/s5/steps \ No newline at end of file diff --git a/egs/dihard_2018/v1/utils b/egs/dihard_2018/v1/utils new file mode 120000 index 00000000000..b240885218f --- /dev/null +++ b/egs/dihard_2018/v1/utils @@ -0,0 +1 @@ +../../wsj/s5/utils \ No newline at end of file diff --git a/egs/dihard_2018/v2/README.txt b/egs/dihard_2018/v2/README.txt new file mode 100644 index 00000000000..5487a911184 --- /dev/null +++ b/egs/dihard_2018/v2/README.txt @@ -0,0 +1,17 @@ + This recipe is the speaker diarization recipe for The First DIHARD Speech + Diarization Challenge (DIHARD 2018). There are two tracks in the DIHARD 2018 + competition , one uses oracle SAD (track1) and the other required that SAD + was performed from scratch (track2). This script is for track1. + + The recipe is closely based on the following paper: + http://www.danielpovey.com/files/2018_interspeech_dihard.pdf but doesn't + contain the VB refinement. The whole system mainly contains training and + extract x-vectors, PLDA scoring and agglomerative hierarchical clustering. + The VoxCeleb datasets are used for training x-vectors and PLDA. The + development set of the DIHARD 2018 competition is used as validation set to + tune parameters. The system is tested on the DIHARD 2018 evaluation set. + + We also use the following datasets for augmentation. + + MUSAN http://www.openslr.org/17 + RIR_NOISES http://www.openslr.org/28 diff --git a/egs/dihard_2018/v2/cmd.sh b/egs/dihard_2018/v2/cmd.sh new file mode 100755 index 00000000000..c35cd18f287 --- /dev/null +++ b/egs/dihard_2018/v2/cmd.sh @@ -0,0 +1,15 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl" + + diff --git a/egs/dihard_2018/v2/conf/mfcc.conf b/egs/dihard_2018/v2/conf/mfcc.conf new file mode 100755 index 00000000000..9e125706aae --- /dev/null +++ b/egs/dihard_2018/v2/conf/mfcc.conf @@ -0,0 +1,7 @@ +--sample-frequency=16000 +--frame-length=25 # the default is 25 +--low-freq=20 # the default. +--high-freq=7600 # the default is zero meaning use the Nyquist (8k in this case). 
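+# With --num-mel-bins=30 and --num-ceps=30 below, all cepstra are kept, so
+# there is no dimensionality reduction relative to the filterbank.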
+--num-mel-bins=30 +--num-ceps=30 +--snip-edges=false diff --git a/egs/dihard_2018/v2/conf/vad.conf b/egs/dihard_2018/v2/conf/vad.conf new file mode 100755 index 00000000000..c9f5e8b3072 --- /dev/null +++ b/egs/dihard_2018/v2/conf/vad.conf @@ -0,0 +1,4 @@ +--vad-energy-threshold=5.5 +--vad-energy-mean-scale=0.5 +--vad-proportion-threshold=0.12 +--vad-frames-context=2 diff --git a/egs/dihard_2018/v2/diarization b/egs/dihard_2018/v2/diarization new file mode 120000 index 00000000000..bad937c1444 --- /dev/null +++ b/egs/dihard_2018/v2/diarization @@ -0,0 +1 @@ +../../callhome_diarization/v1/diarization \ No newline at end of file diff --git a/egs/dihard_2018/v2/local/make_dihard_2018_dev.py b/egs/dihard_2018/v2/local/make_dihard_2018_dev.py new file mode 120000 index 00000000000..3c69bc08240 --- /dev/null +++ b/egs/dihard_2018/v2/local/make_dihard_2018_dev.py @@ -0,0 +1 @@ +../../v1/local/make_dihard_2018_dev.py \ No newline at end of file diff --git a/egs/dihard_2018/v2/local/make_dihard_2018_dev.sh b/egs/dihard_2018/v2/local/make_dihard_2018_dev.sh new file mode 120000 index 00000000000..6fe340e9df2 --- /dev/null +++ b/egs/dihard_2018/v2/local/make_dihard_2018_dev.sh @@ -0,0 +1 @@ +../../v1/local/make_dihard_2018_dev.sh \ No newline at end of file diff --git a/egs/dihard_2018/v2/local/make_dihard_2018_eval.py b/egs/dihard_2018/v2/local/make_dihard_2018_eval.py new file mode 120000 index 00000000000..d107a5446ca --- /dev/null +++ b/egs/dihard_2018/v2/local/make_dihard_2018_eval.py @@ -0,0 +1 @@ +../../v1/local/make_dihard_2018_eval.py \ No newline at end of file diff --git a/egs/dihard_2018/v2/local/make_dihard_2018_eval.sh b/egs/dihard_2018/v2/local/make_dihard_2018_eval.sh new file mode 120000 index 00000000000..0c01aee4fa7 --- /dev/null +++ b/egs/dihard_2018/v2/local/make_dihard_2018_eval.sh @@ -0,0 +1 @@ +../../v1/local/make_dihard_2018_eval.sh \ No newline at end of file diff --git a/egs/dihard_2018/v2/local/make_voxceleb1.pl b/egs/dihard_2018/v2/local/make_voxceleb1.pl new file mode 120000 index 00000000000..c54d69af919 --- /dev/null +++ b/egs/dihard_2018/v2/local/make_voxceleb1.pl @@ -0,0 +1 @@ +../../v1/local/make_voxceleb1.pl \ No newline at end of file diff --git a/egs/dihard_2018/v2/local/make_voxceleb1_v2.pl b/egs/dihard_2018/v2/local/make_voxceleb1_v2.pl new file mode 120000 index 00000000000..2e7a22eaadc --- /dev/null +++ b/egs/dihard_2018/v2/local/make_voxceleb1_v2.pl @@ -0,0 +1 @@ +../../v1/local/make_voxceleb1_v2.pl \ No newline at end of file diff --git a/egs/dihard_2018/v2/local/make_voxceleb2.pl b/egs/dihard_2018/v2/local/make_voxceleb2.pl new file mode 120000 index 00000000000..701225dfa57 --- /dev/null +++ b/egs/dihard_2018/v2/local/make_voxceleb2.pl @@ -0,0 +1 @@ +../../v1/local/make_voxceleb2.pl \ No newline at end of file diff --git a/egs/dihard_2018/v2/local/nnet3/xvector/prepare_feats.sh b/egs/dihard_2018/v2/local/nnet3/xvector/prepare_feats.sh new file mode 100755 index 00000000000..4ad2c42d8b9 --- /dev/null +++ b/egs/dihard_2018/v2/local/nnet3/xvector/prepare_feats.sh @@ -0,0 +1,86 @@ +#!/bin/bash +# +# Apache 2.0. + +# This script applies sliding window CMVN and writes the features to disk. +# +# Although this kind of script isn't necessary in speaker recognition recipes, +# it can be helpful in the diarization recipes. The script +# diarization/nnet3/xvector/extract_xvectors.sh extracts x-vectors from very +# short (e.g., 1-2 seconds) segments. 
Therefore, in order to apply the sliding
+# window CMVN in a meaningful way, it must be performed prior to performing
+# the subsegmentation.
+
+nj=40
+cmd="run.pl"
+stage=0
+norm_vars=false
+center=true
+compress=true
+cmn_window=300
+
+echo "$0 $@" # Print the command line for logging
+
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
+if [ $# != 3 ]; then
+  echo "Usage: $0 <in-data-dir> <out-data-dir> <feat-dir>"
+  echo "e.g.: $0 data/train data/train_no_sil exp/make_xvector_features"
+  echo "Options: "
+  echo "  --nj <nj>                                        # number of parallel jobs"
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+ echo " --norm-vars # If true, normalize variances in the sliding window cmvn" + exit 1; +fi + +data_in=$1 +data_out=$2 +dir=$3 + +name=`basename $data_in` + +for f in $data_in/feats.scp $data_in/vad.scp ; do + [ ! -f $f ] && echo "$0: No such file $f" && exit 1; +done + +# Set various variables. +mkdir -p $dir/log +mkdir -p $data_out +featdir=$(utils/make_absolute.sh $dir) + +if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $featdir/storage ]; then + utils/create_split_dir.pl \ + /export/b{14,15,16,17}/$USER/kaldi-data/egs/dihard_2018/v2/xvector-$(date +'%m_%d_%H_%M')/xvector_feats/storage $featdir/storage +fi + +for n in $(seq $nj); do + # the next command does nothing unless $featdir/storage/ exists, see + # utils/create_data_link.pl for more info. + utils/create_data_link.pl $featdir/xvector_feats_${name}.${n}.ark +done + +cp $data_in/utt2spk $data_out/utt2spk +cp $data_in/spk2utt $data_out/spk2utt +cp $data_in/wav.scp $data_out/wav.scp + +write_num_frames_opt="--write-num-frames=ark,t:$featdir/log/utt2num_frames.JOB" + +sdata_in=$data_in/split$nj; +utils/split_data.sh $data_in $nj || exit 1; + +$cmd JOB=1:$nj $dir/log/create_xvector_feats_${name}.JOB.log \ + apply-cmvn-sliding --norm-vars=false --center=true --cmn-window=$cmn_window \ + scp:${sdata_in}/JOB/feats.scp ark:- \| \ + select-voiced-frames ark:- scp,s,cs:${sdata_in}/JOB/vad.scp ark:- \| \ + copy-feats --compress=$compress $write_num_frames_opt ark:- \ + ark,scp:$featdir/xvector_feats_${name}.JOB.ark,$featdir/xvector_feats_${name}.JOB.scp || exit 1; + +for n in $(seq $nj); do + cat $featdir/xvector_feats_${name}.$n.scp || exit 1; +done > ${data_out}/feats.scp || exit 1 + +for n in $(seq $nj); do + cat $featdir/log/utt2num_frames.$n || exit 1; +done > $data_out/utt2num_frames || exit 1 +rm $featdir/log/utt2num_frames.* + +echo "$0: Succeeded creating xvector features for $name" diff --git a/egs/dihard_2018/v2/local/nnet3/xvector/run_xvector.sh b/egs/dihard_2018/v2/local/nnet3/xvector/run_xvector.sh new file mode 120000 index 00000000000..585b63fd2dd --- /dev/null +++ b/egs/dihard_2018/v2/local/nnet3/xvector/run_xvector.sh @@ -0,0 +1 @@ +tuning/run_xvector_1a.sh \ No newline at end of file diff --git a/egs/dihard_2018/v2/local/nnet3/xvector/tuning/run_xvector_1a.sh b/egs/dihard_2018/v2/local/nnet3/xvector/tuning/run_xvector_1a.sh new file mode 100755 index 00000000000..4ee472b1c71 --- /dev/null +++ b/egs/dihard_2018/v2/local/nnet3/xvector/tuning/run_xvector_1a.sh @@ -0,0 +1,155 @@ +#!/bin/bash +# Copyright 2017 David Snyder +# 2017 Johns Hopkins University (Author: Daniel Garcia-Romero) +# 2017 Johns Hopkins University (Author: Daniel Povey) +# +# Copied from egs/sre16/v1/local/nnet3/xvector/tuning/run_xvector_1a.sh (commit e082c17d4a8f8a791428ae4d9f7ceb776aef3f0b). +# +# Apache 2.0. + +# This script trains a DNN similar to the recipe described in +# http://www.danielpovey.com/files/2018_icassp_xvectors.pdf + +. ./cmd.sh +set -e + +stage=1 +train_stage=0 +use_gpu=true +remove_egs=false + +data=data/train +nnet_dir=exp/xvector_nnet_1a/ +egs_dir=exp/xvector_nnet_1a/egs + +. ./path.sh +. ./cmd.sh +. ./utils/parse_options.sh + +num_pdfs=$(awk '{print $2}' $data/utt2spk | sort | uniq -c | wc -l) + +# Now we create the nnet examples using sid/nnet3/xvector/get_egs.sh. +# The argument --num-repeats is related to the number of times a speaker +# repeats per archive. If it seems like you're getting too many archives +# (e.g., more than 200) try increasing the --frames-per-iter option. 
The +# arguments --min-frames-per-chunk and --max-frames-per-chunk specify the +# minimum and maximum length (in terms of number of frames) of the features +# in the examples. +# +# To make sense of the egs script, it may be necessary to put an "exit 1" +# command immediately after stage 3. Then, inspect +# exp//egs/temp/ranges.* . The ranges files specify the examples that +# will be created, and which archives they will be stored in. Each line of +# ranges.* has the following form: +# +# For example: +# 100304-f-sre2006-kacg-A 1 2 4079 881 23 + +# If you're satisfied with the number of archives (e.g., 50-150 archives is +# reasonable) and with the number of examples per speaker (e.g., 1000-5000 +# is reasonable) then you can let the script continue to the later stages. +# Otherwise, try increasing or decreasing the --num-repeats option. You might +# need to fiddle with --frames-per-iter. Increasing this value decreases the +# the number of archives and increases the number of examples per archive. +# Decreasing this value increases the number of archives, while decreasing the +# number of examples per archive. +if [ $stage -le 6 ]; then + echo "$0: Getting neural network training egs"; + # dump egs. + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b{03,04,05,06}/$USER/kaldi-data/egs/dihard_2018/v2/xvector-$(date +'%m_%d_%H_%M')/$egs_dir/storage $egs_dir/storage + fi + sid/nnet3/xvector/get_egs.sh --cmd "$train_cmd" \ + --nj 8 \ + --stage 0 \ + --frames-per-iter 1000000000 \ + --frames-per-iter-diagnostic 100000 \ + --min-frames-per-chunk 200 \ + --max-frames-per-chunk 400 \ + --num-diagnostic-archives 3 \ + --num-repeats 50 \ + "$data" $egs_dir +fi + +if [ $stage -le 7 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + num_targets=$(wc -w $egs_dir/pdf2num | awk '{print $1}') + feat_dim=$(cat $egs_dir/info/feat_dim) + + # This chunk-size corresponds to the maximum number of frames the + # stats layer is able to pool over. In this script, it corresponds + # to 100 seconds. If the input recording is greater than 100 seconds, + # we will compute multiple xvectors from the same recording and average + # to produce the final xvector. + max_chunk_size=10000 + + # The smallest number of frames we're comfortable computing an xvector from. + # Note that the hard minimum is given by the left and right context of the + # frame-level layers. + min_chunk_size=25 + mkdir -p $nnet_dir/configs + cat < $nnet_dir/configs/network.xconfig + # please note that it is important to have input layer with the name=input + + # The frame-level layers + input dim=${feat_dim} name=input + relu-batchnorm-layer name=tdnn1 input=Append(-2,-1,0,1,2) dim=512 + relu-batchnorm-layer name=tdnn2 input=Append(-2,0,2) dim=512 + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=512 + relu-batchnorm-layer name=tdnn4 dim=512 + relu-batchnorm-layer name=tdnn5 dim=1500 + + # The stats pooling layer. Layers after this are segment-level. + # In the config below, the first and last argument (0, and ${max_chunk_size}) + # means that we pool over an input segment starting at frame 0 + # and ending at frame ${max_chunk_size} or earlier. The other arguments (1:1) + # mean that no subsampling is performed. + stats-layer name=stats config=mean+stddev(0:1:1:${max_chunk_size}) + + # This is where we usually extract the embedding (aka xvector) from. 
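+  # (Specifically, the extract.config written further below takes the output
+  #  of the affine part of the "tdnn6" layer defined next:
+  #  output-node name=output input=tdnn6.affine)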
+ relu-batchnorm-layer name=tdnn6 dim=512 input=stats + + # This is where another layer the embedding could be extracted + # from, but usually the previous one works better. + relu-batchnorm-layer name=tdnn7 dim=512 + output-layer name=output include-log-softmax=true dim=${num_targets} +EOF + + steps/nnet3/xconfig_to_configs.py \ + --xconfig-file $nnet_dir/configs/network.xconfig \ + --config-dir $nnet_dir/configs/ + cp $nnet_dir/configs/final.config $nnet_dir/nnet.config + + # These three files will be used by sid/nnet3/xvector/extract_xvectors.sh + echo "output-node name=output input=tdnn6.affine" > $nnet_dir/extract.config + echo "$max_chunk_size" > $nnet_dir/max_chunk_size + echo "$min_chunk_size" > $nnet_dir/min_chunk_size +fi + +dropout_schedule='0,0@0.20,0.1@0.50,0' +srand=123 +if [ $stage -le 8 ]; then + steps/nnet3/train_raw_dnn.py --stage=$train_stage \ + --cmd="$train_cmd" \ + --trainer.optimization.proportional-shrink 10 \ + --trainer.optimization.momentum=0.5 \ + --trainer.optimization.num-jobs-initial=3 \ + --trainer.optimization.num-jobs-final=8 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.minibatch-size=64 \ + --trainer.srand=$srand \ + --trainer.max-param-change=2 \ + --trainer.num-epochs=3 \ + --trainer.dropout-schedule="$dropout_schedule" \ + --trainer.shuffle-buffer-size=1000 \ + --egs.frames-per-eg=1 \ + --egs.dir="$egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --cleanup.preserve-model-interval=10 \ + --use-gpu=true \ + --dir=$nnet_dir || exit 1; +fi + +exit 0; diff --git a/egs/dihard_2018/v2/path.sh b/egs/dihard_2018/v2/path.sh new file mode 100755 index 00000000000..851c14e27c3 --- /dev/null +++ b/egs/dihard_2018/v2/path.sh @@ -0,0 +1,5 @@ +export KALDI_ROOT=`pwd`/../../.. +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sph2pipe_v2.5:$KALDI_ROOT/tools/sctk/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C diff --git a/egs/dihard_2018/v2/run.sh b/egs/dihard_2018/v2/run.sh new file mode 100755 index 00000000000..6cd6630a838 --- /dev/null +++ b/egs/dihard_2018/v2/run.sh @@ -0,0 +1,316 @@ +#!/bin/bash +# Copyright 2017 Johns Hopkins University (Author: Daniel Garcia-Romero) +# 2017 Johns Hopkins University (Author: Daniel Povey) +# 2017-2018 David Snyder +# 2018 Ewald Enzinger +# 2018 Zili Huang +# Apache 2.0. +# +# See ../README.txt for more info on data required. +# Results (diarization error rate) are inline in comments below. + +. ./cmd.sh +. ./path.sh +set -e +mfccdir=`pwd`/mfcc +vaddir=`pwd`/mfcc + +voxceleb1_root=/export/corpora/VoxCeleb1 +voxceleb2_root=/export/corpora/VoxCeleb2 +nnet_dir=exp/xvector_nnet_1a +musan_root=/export/corpora/JHU/musan +dihard_2018_dev=/export/corpora/LDC/LDC2018E31 +dihard_2018_eval=/export/corpora/LDC/LDC2018E32v1.1 + +stage=0 + +if [ $stage -le 0 ]; then + local/make_voxceleb2.pl $voxceleb2_root dev data/voxceleb2_train + local/make_voxceleb2.pl $voxceleb2_root test data/voxceleb2_test + + # Now prepare the VoxCeleb1 train and test data. 
If you downloaded the corpus soon + # after it was first released, you may need to use an older version of the script, which + # can be invoked as follows: + # local/make_voxceleb1.pl $voxceleb1_root data + local/make_voxceleb1_v2.pl $voxceleb1_root dev data/voxceleb1_train + local/make_voxceleb1_v2.pl $voxceleb1_root test data/voxceleb1_test + + # We'll train on all of VoxCeleb2, plus the training portion of VoxCeleb1. + # This should give 7,351 speakers and 1,277,503 utterances. + utils/combine_data.sh data/train data/voxceleb2_train data/voxceleb2_test data/voxceleb1_train + + # Prepare the development and evaluation set for DIHARD 2018. + local/make_dihard_2018_dev.sh $dihard_2018_dev data/dihard_2018_dev + local/make_dihard_2018_eval.sh $dihard_2018_eval data/dihard_2018_eval +fi + +if [ $stage -le 1 ]; then + # Make MFCCs for each dataset. + for name in train dihard_2018_dev dihard_2018_eval; do + steps/make_mfcc.sh --write-utt2num-frames true --mfcc-config conf/mfcc.conf --nj 40 --cmd "$train_cmd --max-jobs-run 20" \ + data/${name} exp/make_mfcc $mfccdir + utils/fix_data_dir.sh data/${name} + done + + # Compute the energy-based VAD for training set. + sid/compute_vad_decision.sh --nj 40 --cmd "$train_cmd" \ + data/train exp/make_vad $vaddir + utils/fix_data_dir.sh data/train + + # This writes features to disk after applying the sliding window CMN. + # Although this is somewhat wasteful in terms of disk space, for diarization + # it ends up being preferable to performing the CMN in memory. If the CMN + # were performed in memory (e.g., we used --apply-cmn true in + # diarization/nnet3/xvector/extract_xvectors.sh) it would need to be + # performed after the subsegmentation, which leads to poorer results. + for name in train dihard_2018_dev dihard_2018_eval; do + local/nnet3/xvector/prepare_feats.sh --nj 40 --cmd "$train_cmd" \ + data/$name data/${name}_cmn exp/${name}_cmn + if [ -f data/$name/vad.scp ]; then + cp data/$name/vad.scp data/${name}_cmn/ + fi + if [ -f data/$name/segments ]; then + cp data/$name/segments data/${name}_cmn/ + fi + utils/fix_data_dir.sh data/${name}_cmn + done + + echo "0.01" > data/dihard_2018_dev_cmn/frame_shift + echo "0.01" > data/dihard_2018_eval_cmn/frame_shift + echo "0.01" > data/train_cmn/frame_shift + # Create segments to extract x-vectors from for PLDA training data. + # The segments are created using an energy-based speech activity + # detection (SAD) system, but this is not necessary. You can replace + # this with segments computed from your favorite SAD. + diarization/vad_to_segments.sh --nj 40 --cmd "$train_cmd" \ + data/train_cmn data/train_cmn_segmented +fi + +# In this section, we augment the training data with reverberation, +# noise, music, and babble, and combine it with the clean data. +if [ $stage -le 2 ]; then + frame_shift=0.01 + awk -v frame_shift=$frame_shift '{print $1, $2*frame_shift;}' data/train/utt2num_frames > data/train/reco2dur + + if [ ! -d "RIRS_NOISES" ]; then + # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises + wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip + unzip rirs_noises.zip + fi + + # Make a version with reverberated speech + rvb_opts=() + rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/smallroom/rir_list") + rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/mediumroom/rir_list") + + # Make a reverberated version of the training data. Note that we don't add any + # additive noise here. 
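+  # With --speech-rvb-probability 1 and --num-replications 1, every training
+  # utterance receives exactly one reverberated copy, using RIRs drawn from
+  # the small-room/medium-room lists configured above (weight 0.5 each).
+  # The copy_data_dir.sh call below then appends "-reverb" to the utterance
+  # ids, e.g. a hypothetical "id00012-rec1-00001" becomes
+  # "id00012-rec1-00001-reverb".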
+ steps/data/reverberate_data_dir.py \ + "${rvb_opts[@]}" \ + --speech-rvb-probability 1 \ + --pointsource-noise-addition-probability 0 \ + --isotropic-noise-addition-probability 0 \ + --num-replications 1 \ + --source-sampling-rate 16000 \ + data/train data/train_reverb + cp data/train/vad.scp data/train_reverb/ + utils/copy_data_dir.sh --utt-suffix "-reverb" data/train_reverb data/train_reverb.new + rm -rf data/train_reverb + mv data/train_reverb.new data/train_reverb + + # Prepare the MUSAN corpus, which consists of music, speech, and noise + # suitable for augmentation. + steps/data/make_musan.sh --sampling-rate 16000 $musan_root data + + # Get the duration of the MUSAN recordings. This will be used by the + # script augment_data_dir.py. + for name in speech noise music; do + utils/data/get_utt2dur.sh data/musan_${name} + mv data/musan_${name}/utt2dur data/musan_${name}/reco2dur + done + + # Augment with musan_noise + steps/data/augment_data_dir.py --utt-suffix "noise" --fg-interval 1 --fg-snrs "15:10:5:0" --fg-noise-dir "data/musan_noise" data/train data/train_noise + # Augment with musan_music + steps/data/augment_data_dir.py --utt-suffix "music" --bg-snrs "15:10:8:5" --num-bg-noises "1" --bg-noise-dir "data/musan_music" data/train data/train_music + # Augment with musan_speech + steps/data/augment_data_dir.py --utt-suffix "babble" --bg-snrs "20:17:15:13" --num-bg-noises "3:4:5:6:7" --bg-noise-dir "data/musan_speech" data/train data/train_babble + + # Combine reverb, noise, music, and babble into one directory. + utils/combine_data.sh data/train_aug data/train_reverb data/train_noise data/train_music data/train_babble +fi + +if [ $stage -le 3 ]; then + # Take a random subset of the augmentations + utils/subset_data_dir.sh data/train_aug 1000000 data/train_aug_1m + utils/fix_data_dir.sh data/train_aug_1m + + # Make MFCCs for the augmented data. Note that we do not compute a new + # vad.scp file here. Instead, we use the vad.scp from the clean version of + # the list. + steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 40 --cmd "$train_cmd --max-jobs-run 20" \ + data/train_aug_1m exp/make_mfcc $mfccdir + + # Combine the clean and augmented training data. This is now roughly + # double the size of the original clean list. + utils/combine_data.sh data/train_combined data/train_aug_1m data/train +fi + +# Now we prepare the features to generate examples for xvector training. +if [ $stage -le 4 ]; then + # This script applies CMVN and removes nonspeech frames. Note that this is somewhat + # wasteful, as it roughly doubles the amount of training data on disk. After + # creating training examples, this can be removed. + local/nnet3/xvector/prepare_feats_for_egs.sh --nj 40 --cmd "$train_cmd" \ + data/train_combined data/train_combined_no_sil exp/train_combined_no_sil + utils/fix_data_dir.sh data/train_combined_no_sil +fi + +if [ $stage -le 5 ]; then + # Now, we need to remove features that are too short after removing silence + # frames. We want at least 4s (400 frames) per utterance. 
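+  # To preview how many utterances the length filter will keep, one can run
+  # something like:
+  #   awk '$2 > 400' data/train_combined_no_sil/utt2num_frames | wc -l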
+ min_len=400 + mv data/train_combined_no_sil/utt2num_frames data/train_combined_no_sil/utt2num_frames.bak + awk -v min_len=${min_len} '$2 > min_len {print $1, $2}' data/train_combined_no_sil/utt2num_frames.bak > data/train_combined_no_sil/utt2num_frames + utils/filter_scp.pl data/train_combined_no_sil/utt2num_frames data/train_combined_no_sil/utt2spk > data/train_combined_no_sil/utt2spk.new + mv data/train_combined_no_sil/utt2spk.new data/train_combined_no_sil/utt2spk + utils/fix_data_dir.sh data/train_combined_no_sil + + # We also want several utterances per speaker. Now we'll throw out speakers + # with fewer than 8 utterances. + min_num_utts=8 + awk '{print $1, NF-1}' data/train_combined_no_sil/spk2utt > data/train_combined_no_sil/spk2num + awk -v min_num_utts=${min_num_utts} '$2 >= min_num_utts {print $1, $2}' data/train_combined_no_sil/spk2num | utils/filter_scp.pl - data/train_combined_no_sil/spk2utt > data/train_combined_no_sil/spk2utt.new + mv data/train_combined_no_sil/spk2utt.new data/train_combined_no_sil/spk2utt + utils/spk2utt_to_utt2spk.pl data/train_combined_no_sil/spk2utt > data/train_combined_no_sil/utt2spk + + utils/filter_scp.pl data/train_combined_no_sil/utt2spk data/train_combined_no_sil/utt2num_frames > data/train_combined_no_sil/utt2num_frames.new + mv data/train_combined_no_sil/utt2num_frames.new data/train_combined_no_sil/utt2num_frames + + # Now we're ready to create training examples. + utils/fix_data_dir.sh data/train_combined_no_sil +fi + +# Stages 6 through 8 are handled in run_xvector.sh, a TDNN embedding extractor is trained. +local/nnet3/xvector/run_xvector.sh --stage $stage --train-stage -1 \ + --data data/train_combined_no_sil --nnet-dir $nnet_dir \ + --egs-dir $nnet_dir/egs + +if [ $stage -le 9 ]; then + # Extract x-vectors for DIHARD 2018 development and evaluation set. + diarization/nnet3/xvector/extract_xvectors.sh --cmd "$train_cmd --mem 5G" \ + --nj 40 --window 1.5 --period 0.75 --apply-cmn false \ + --min-segment 0.5 $nnet_dir \ + data/dihard_2018_dev_cmn $nnet_dir/xvectors_dihard_2018_dev + + diarization/nnet3/xvector/extract_xvectors.sh --cmd "$train_cmd --mem 5G" \ + --nj 40 --window 1.5 --period 0.75 --apply-cmn false \ + --min-segment 0.5 $nnet_dir \ + data/dihard_2018_eval_cmn $nnet_dir/xvectors_dihard_2018_eval + + # Reduce the amount of training data for the PLDA training. + utils/subset_data_dir.sh data/train_cmn_segmented 128000 data/train_cmn_segmented_128k + # Extract x-vectors for the VoxCeleb, which is our PLDA training + # data. A long period is used here so that we don't compute too + # many x-vectors for each recording. + diarization/nnet3/xvector/extract_xvectors.sh --cmd "$train_cmd --mem 10G" \ + --nj 40 --window 3.0 --period 10.0 --min-segment 1.5 --apply-cmn false \ + --hard-min true $nnet_dir \ + data/train_cmn_segmented_128k $nnet_dir/xvectors_train_segmented_128k +fi + +# Train PLDA models +if [ $stage -le 10 ]; then + # Train a PLDA model on VoxCeleb, using DIHARD 2018 development set to whiten. 
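+  # In the pipeline below, the x-vectors are mean-subtracted, whitened with the
+  # transform estimated on the DIHARD 2018 dev x-vectors (transform.mat), and
+  # length-normalized before PLDA estimation. If needed, the trained model can
+  # be dumped in text form for inspection, e.g.:
+  #   ivector-copy-plda --binary=false $nnet_dir/xvectors_dihard_2018_dev/plda - | head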
+ "$train_cmd" $nnet_dir/xvectors_dihard_2018_dev/log/plda.log \ + ivector-compute-plda ark:$nnet_dir/xvectors_train_segmented_128k/spk2utt \ + "ark:ivector-subtract-global-mean \ + scp:$nnet_dir/xvectors_train_segmented_128k/xvector.scp ark:- \ + | transform-vec $nnet_dir/xvectors_dihard_2018_dev/transform.mat ark:- ark:- \ + | ivector-normalize-length ark:- ark:- |" \ + $nnet_dir/xvectors_dihard_2018_dev/plda || exit 1; +fi + +# Perform PLDA scoring +if [ $stage -le 11 ]; then + # Perform PLDA scoring on all pairs of segments for each recording. + diarization/nnet3/xvector/score_plda.sh --cmd "$train_cmd --mem 4G" \ + --nj 20 $nnet_dir/xvectors_dihard_2018_dev $nnet_dir/xvectors_dihard_2018_dev \ + $nnet_dir/xvectors_dihard_2018_dev/plda_scores + + diarization/nnet3/xvector/score_plda.sh --cmd "$train_cmd --mem 4G" \ + --nj 20 $nnet_dir/xvectors_dihard_2018_dev $nnet_dir/xvectors_dihard_2018_eval \ + $nnet_dir/xvectors_dihard_2018_eval/plda_scores +fi + +# Cluster the PLDA scores using a stopping threshold. +if [ $stage -le 12 ]; then + # First, we find the threshold that minimizes the DER on DIHARD 2018 development set. + mkdir -p $nnet_dir/tuning + echo "Tuning clustering threshold for DIHARD 2018 development set" + best_der=100 + best_threshold=0 + + # The threshold is in terms of the log likelihood ratio provided by the + # PLDA scores. In a perfectly calibrated system, the threshold is 0. + # In the following loop, we evaluate DER performance on DIHARD 2018 development + # set using some reasonable thresholds for a well-calibrated system. + for threshold in -0.5 -0.4 -0.3 -0.2 -0.1 -0.05 0 0.05 0.1 0.2 0.3 0.4 0.5; do + diarization/cluster.sh --cmd "$train_cmd --mem 4G" --nj 20 \ + --threshold $threshold --rttm-channel 1 $nnet_dir/xvectors_dihard_2018_dev/plda_scores \ + $nnet_dir/xvectors_dihard_2018_dev/plda_scores_t$threshold + + md-eval.pl -r data/dihard_2018_dev/rttm \ + -s $nnet_dir/xvectors_dihard_2018_dev/plda_scores_t$threshold/rttm \ + 2> $nnet_dir/tuning/dihard_2018_dev_t${threshold}.log \ + > $nnet_dir/tuning/dihard_2018_dev_t${threshold} + + der=$(grep -oP 'DIARIZATION\ ERROR\ =\ \K[0-9]+([.][0-9]+)?' \ + $nnet_dir/tuning/dihard_2018_dev_t${threshold}) + if [ $(perl -e "print ($der < $best_der ? 1 : 0);") -eq 1 ]; then + best_der=$der + best_threshold=$threshold + fi + done + echo "$best_threshold" > $nnet_dir/tuning/dihard_2018_dev_best + + diarization/cluster.sh --cmd "$train_cmd --mem 4G" --nj 20 \ + --threshold $(cat $nnet_dir/tuning/dihard_2018_dev_best) --rttm-channel 1 \ + $nnet_dir/xvectors_dihard_2018_dev/plda_scores $nnet_dir/xvectors_dihard_2018_dev/plda_scores + + # Cluster DIHARD 2018 evaluation set using the best threshold found for the DIHARD + # 2018 development set. The DIHARD 2018 development set is used as the validation + # set to tune the parameters. + diarization/cluster.sh --cmd "$train_cmd --mem 4G" --nj 20 \ + --threshold $(cat $nnet_dir/tuning/dihard_2018_dev_best) --rttm-channel 1 \ + $nnet_dir/xvectors_dihard_2018_eval/plda_scores $nnet_dir/xvectors_dihard_2018_eval/plda_scores + + mkdir -p $nnet_dir/results + # Compute the DER on the DIHARD 2018 evaluation set. We use the official metrics of + # the DIHARD challenge. The DER is calculated with no unscored collars and including + # overlapping speech. + md-eval.pl -r data/dihard_2018_eval/rttm \ + -s $nnet_dir/xvectors_dihard_2018_eval/plda_scores/rttm 2> $nnet_dir/results/threshold.log \ + > $nnet_dir/results/DER_threshold.txt + der=$(grep -oP 'DIARIZATION\ ERROR\ =\ \K[0-9]+([.][0-9]+)?' 
\ + $nnet_dir/results/DER_threshold.txt) + # Using supervised calibration, DER: 26.30% + echo "Using supervised calibration, DER: $der%" +fi + +# Cluster the PLDA scores using the oracle number of speakers +if [ $stage -le 13 ]; then + # In this section, we show how to do the clustering if the number of speakers + # (and therefore, the number of clusters) per recording is known in advance. + diarization/cluster.sh --cmd "$train_cmd --mem 4G" --nj 20 \ + --reco2num-spk data/dihard_2018_eval/reco2num_spk --rttm-channel 1 \ + $nnet_dir/xvectors_dihard_2018_eval/plda_scores $nnet_dir/xvectors_dihard_2018_eval/plda_scores_num_spk + + md-eval.pl -r data/dihard_2018_eval/rttm \ + -s $nnet_dir/xvectors_dihard_2018_eval/plda_scores_num_spk/rttm 2> $nnet_dir/results/num_spk.log \ + > $nnet_dir/results/DER_num_spk.txt + der=$(grep -oP 'DIARIZATION\ ERROR\ =\ \K[0-9]+([.][0-9]+)?' \ + $nnet_dir/results/DER_num_spk.txt) + # Using the oracle number of speakers, DER: 23.42% + echo "Using the oracle number of speakers, DER: $der%" +fi diff --git a/egs/dihard_2018/v2/sid b/egs/dihard_2018/v2/sid new file mode 120000 index 00000000000..893a12f30c9 --- /dev/null +++ b/egs/dihard_2018/v2/sid @@ -0,0 +1 @@ +../../sre08/v1/sid \ No newline at end of file diff --git a/egs/dihard_2018/v2/steps b/egs/dihard_2018/v2/steps new file mode 120000 index 00000000000..6e99bf5b5ad --- /dev/null +++ b/egs/dihard_2018/v2/steps @@ -0,0 +1 @@ +../../wsj/s5/steps \ No newline at end of file diff --git a/egs/dihard_2018/v2/utils b/egs/dihard_2018/v2/utils new file mode 120000 index 00000000000..b240885218f --- /dev/null +++ b/egs/dihard_2018/v2/utils @@ -0,0 +1 @@ +../../wsj/s5/utils \ No newline at end of file diff --git a/egs/fame/s5/run.sh b/egs/fame/s5/run.sh index 26a8485ff7d..de6fe46b7c4 100755 --- a/egs/fame/s5/run.sh +++ b/egs/fame/s5/run.sh @@ -106,8 +106,8 @@ fi if [ $stage -le 7 ]; then echo "Starting SGMM training." steps/align_fmllr.sh --nj $train_nj --cmd "$train_cmd" data/train data/lang exp/tri3 exp/tri3_ali || exit 1; - steps/train_ubm.sh --cmd "$train_cmd" $numGaussUBM data/train data/lang exp/tri3_ali exp/ubm || exit 1; - steps/train_sgmm2.sh --cmd "$train_cmd" $numLeavesSGMM $numGaussSGMM data/train data/lang exp/tri3_ali exp/ubm/final.ubm exp/sgmm2 || exit 1; + steps/train_ubm.sh --cmd "$train_cmd" $numGaussUBM data/train data/lang exp/tri3_ali exp/ubm || exit 1; + steps/train_sgmm2.sh --cmd "$train_cmd" $numLeavesSGMM $numGaussSGMM data/train data/lang exp/tri3_ali exp/ubm/final.ubm exp/sgmm2 || exit 1; echo "SGMM training done." echo "Decoding the development and test sets using SGMM models" diff --git a/egs/fame/v1/local/prepare_for_eer.py b/egs/fame/v1/local/prepare_for_eer.py index 59d2985e7c2..f1dbcfa9ab6 100755 --- a/egs/fame/v1/local/prepare_for_eer.py +++ b/egs/fame/v1/local/prepare_for_eer.py @@ -1,3 +1,4 @@ +from __future__ import print_function # Copyright 2015 David Snyder # Apache 2.0. # @@ -12,4 +13,4 @@ spkrutt2target[spkr+utt]=target for line in scores: spkr, utt, score = line.strip().split() - print score, spkrutt2target[spkr+utt] + print(score, spkrutt2target[spkr+utt]) diff --git a/egs/farsdat/s5/local/nnet/run_dnn.sh b/egs/farsdat/s5/local/nnet/run_dnn.sh index fbb3db72e3e..a02894a7322 100755 --- a/egs/farsdat/s5/local/nnet/run_dnn.sh +++ b/egs/farsdat/s5/local/nnet/run_dnn.sh @@ -53,7 +53,7 @@ if [ $stage -le 1 ]; then # Pre-train DBN, i.e. 
a stack of RBMs (small database, smaller DNN) dir=exp/dnn4_pretrain-dbn (tail --pid=$$ -F $dir/log/pretrain_dbn.log 2>/dev/null)& # forward log - $cuda_cmd $dir/log/pretrain_dbn.log \ + "$train_cmd" --gpu 1 $dir/log/pretrain_dbn.log \ steps/nnet/pretrain_dbn.sh --hid-dim 1024 --rbm-iter 20 $data_fmllr/train $dir || exit 1; fi @@ -65,7 +65,7 @@ if [ $stage -le 2 ]; then dbn=exp/dnn4_pretrain-dbn/6.dbn (tail --pid=$$ -F $dir/log/train_nnet.log 2>/dev/null)& # forward log # Train - $cuda_cmd $dir/log/train_nnet.log \ + "$train_cmd" --gpu 1 $dir/log/train_nnet.log \ steps/nnet/train.sh --feature-transform $feature_transform --dbn $dbn --hid-layers 0 --learn-rate 0.008 \ $data_fmllr/train_tr90 $data_fmllr/train_cv10 data/lang $ali $ali $dir || exit 1; # Decode (reuse HCLG graph) @@ -93,7 +93,7 @@ fi if [ $stage -le 4 ]; then # Re-train the DNN by 6 iterations of sMBR - steps/nnet/train_mpe.sh --cmd "$cuda_cmd" --num-iters 6 --acwt $acwt --do-smbr true \ + steps/nnet/train_mpe.sh --cmd ""$train_cmd" --gpu 1" --num-iters 6 --acwt $acwt --do-smbr true \ $data_fmllr/train data/lang $srcdir ${srcdir}_ali ${srcdir}_denlats $dir || exit 1 # Decode for ITER in 1 6; do diff --git a/egs/farsdat/s5/run.sh b/egs/farsdat/s5/run.sh index 81f353c301c..4c3d3c5882b 100755 --- a/egs/farsdat/s5/run.sh +++ b/egs/farsdat/s5/run.sh @@ -8,7 +8,7 @@ # farsdat, description of the database: # http://www.assta.org/sst/SST-94-Vol-ll/cache/SST-94-VOL2-Chapter15-p20.pdf -. ./cmd.sh +. ./cmd.sh [ -f path.sh ] && . ./path.sh set -e @@ -54,7 +54,7 @@ echo =========================================================================== # Now make MFCC features. mfccdir=mfcc -for x in train dev test; do +for x in train dev test; do steps/make_mfcc.sh --cmd "$train_cmd" --nj $feats_nj data/$x exp/make_mfcc/$x $mfccdir steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir done diff --git a/egs/fisher_callhome_spanish/s5/conf/mfcc_hires.conf b/egs/fisher_callhome_spanish/s5/conf/mfcc_hires.conf new file mode 100644 index 00000000000..d870ab04c38 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5/conf/mfcc_hires.conf @@ -0,0 +1,10 @@ +# config for high-resolution MFCC features, intended for neural network training. +# Note: we keep all cepstra, so it has the same info as filterbank features, +# but MFCC is more easily compressible (because less correlated) which is why +# we prefer this method. +--use-energy=false # use average of log energy, not energy. +--sample-frequency=8000 # Switchboard is sampled at 8kHz +--num-mel-bins=40 # similar to Google's setup. +--num-ceps=40 # there is no dimensionality reduction. 
+--low-freq=40 # low cutoff frequency for mel bins +--high-freq=-200 # high cutoff frequently, relative to Nyquist of 4000 (=3800) diff --git a/egs/fisher_callhome_spanish/s5/conf/online_cmvn.conf b/egs/fisher_callhome_spanish/s5/conf/online_cmvn.conf new file mode 100644 index 00000000000..7748a4a4dd3 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5/conf/online_cmvn.conf @@ -0,0 +1 @@ +# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh diff --git a/egs/fisher_callhome_spanish/s5/local/callhome_get_lattices.py b/egs/fisher_callhome_spanish/s5/local/callhome_get_lattices.py index 9112d868c25..4c96e01ce7e 100755 --- a/egs/fisher_callhome_spanish/s5/local/callhome_get_lattices.py +++ b/egs/fisher_callhome_spanish/s5/local/callhome_get_lattices.py @@ -5,6 +5,7 @@ # The list of files in the conversations for which 1 best output has to be extracted # words.txt +from __future__ import print_function import os import sys import subprocess @@ -76,7 +77,7 @@ def findLattice(timeDetail): # Concatenate lattices mergedTranslation = latticeConcatenate(mergedTranslation, tmp) - print mergedTranslation + print(mergedTranslation) if mergedTranslation != "": # Sanjeev's Recipe : Remove epsilons and topo sort @@ -95,16 +96,16 @@ def findLattice(timeDetail): # file so it can be checked later proc = subprocess.Popen("/export/a04/gkumar/moses/mosesdecoder/checkplf < " + finalPLFFile + " 2>&1 | awk 'FNR == 2 {print}'", stdout=subprocess.PIPE, shell=True) line = proc.stdout.readline() - print line + " " + str(lineNo) + print("{} {}".format(line, lineNo)) if line.strip() != "PLF format appears to be correct.": os.system("cp " + finalFST + " " + invalidplfdir + "/" + timeInfo[0]) invalidPLF.write(invalidplfdir + "/" + timeInfo[0] + "\n") - rmLines.write(str(lineNo) + "\n") + rmLines.write("{}\n".format(lineNo)) else: provFile.write(PLFline) else: blankPLF.write(timeInfo[0] + "\n") - rmLines.write(str(lineNo) + "\n") + rmLines.write("{}\n".format(lineNo)) # Now convert to PLF lineNo += 1 diff --git a/egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh b/egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh new file mode 100755 index 00000000000..7f407552c2e --- /dev/null +++ b/egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh @@ -0,0 +1,288 @@ +#!/bin/bash + +# 1g is like 1f but upgrading to a "resnet-style TDNN-F model", i.e. +# with bypass resnet connections, and re-tuned. +# compute-wer --text --mode=present ark:exp/chain/multipsplice_tdnn/decode_fsp_train_test/scoring_kaldi/test_filt.txt ark,p:- +# %WER 22.21 [ 8847 / 39831, 1965 ins, 2127 del, 4755 sub ] +# %SER 56.98 [ 3577 / 6278 ] +# Scored 6278 sentences, 0 not present in hyp. + +# steps/info/chain_dir_info.pl exp/chain/multipsplice_tdnn +# exp/chain/multipsplice_tdnn: num-iters=296 nj=1..2 num-params=8.2M dim=40+100->2489 combine=-0.170->-0.165 (over 8) xent:train/valid[196,295,final]=(-2.30,-1.93,-1.83/-2.24,-1.96,-1.86) logprob:train/valid[196,295,final]=(-0.208,-0.169,-0.164/-0.189,-0.161,-0.158) + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +train_set=train +test_sets="test dev" +gmm=tri5a # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +num_threads_ubm=32 +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. 
+ +# Options which are not passed through to run_ivector_common.sh +affix=1g #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +common_egs_dir= +reporting_email= + +# LSTM/chain options +train_stage=-10 +xent_regularize=0.1 +dropout_schedule='0,0@0.20,0.3@0.50,0' + +# training chunk-options +chunk_width=140,100,160 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 + +# training options +srand=0 +remove_egs=true + +#decode options +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 17 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 18 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 19 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + tdnn_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim-continuous=true" + tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.01" + output_opts="l2-regularize=0.005" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $tdnn_opts dim=1024 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1024 
bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + linear-component name=prefinal-l dim=192 $linear_opts + + + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1024 small-dim=192 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1024 small-dim=192 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 20 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/wsj-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.srand $srand \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.frames-per-iter 5000000 \ + --trainer.optimization.num-jobs-initial 1 \ + --trainer.optimization.num-jobs-final=2 \ + --trainer.optimization.initial-effective-lrate 0.0005 \ + --trainer.optimization.final-effective-lrate 0.00005 \ + --trainer.num-chunk-per-minibatch 128,64 \ + --trainer.optimization.momentum 0.0 \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context 0 \ + --egs.chunk-right-context 0 \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --cleanup.remove-egs $remove_egs \ + --use-gpu true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir exp/tri5a_lats_nodup_sp \ + --dir $dir || exit 1; +fi + +if [ $stage -le 21 ]; then + # The reason we are using data/lang_test here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + #LM was trained only on Fisher Spanish train subset. 
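Editor's note on the dropout_schedule='0,0@0.20,0.3@0.50,0' option set earlier in this script and passed to steps/nnet3/chain/train.py above: such a schedule is a list of (dropout proportion)@(fraction of training) points, interpolated linearly in between, so dropout stays at 0 for the first 20% of training, ramps up to 0.3 at 50%, and decays back to 0 by the end. The sketch below only illustrates that interpretation; it is not the parser used by the training scripts.

    # Sketch of how a dropout schedule string can be read; assumes the
    # piecewise-linear interpretation described above, not Kaldi's parser.
    def parse_schedule(schedule):
        points = []
        pieces = schedule.split(',')
        for i, piece in enumerate(pieces):
            if '@' in piece:
                value, frac = piece.split('@')
            else:
                value, frac = piece, 0.0 if i == 0 else 1.0
            points.append((float(frac), float(value)))
        return sorted(points)

    def dropout_at(progress, points):
        for (f0, v0), (f1, v1) in zip(points, points[1:]):
            if f0 <= progress <= f1:
                t = 0.0 if f1 == f0 else (progress - f0) / (f1 - f0)
                return v0 + t * (v1 - v0)
        return points[-1][1]

    sched = parse_schedule('0,0@0.20,0.3@0.50,0')
    print(dropout_at(0.35, sched))   # 0.15, halfway up the ramp between 20% and 50%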
+ + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test \ + $tree_dir $tree_dir/graph_fsp_train || exit 1; + +fi + +rnnlmdir=exp/rnnlm_lstm_tdnn_1b +if [ $stage -le 22 ]; then + local/rnnlm/train_rnnlm.sh --dir $rnnlmdir || exit 1; +fi + +if [ $stage -le 23 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l &1 | awk 'FNR == 2 {print}'", stdout=subprocess.PIPE, shell=True) line = proc.stdout.readline() - print line + " " + str(lineNo) + print("{} {}".format(line, lineNo)) if line.strip() != "PLF format appears to be correct.": os.system("cp " + finalFST + " " + invalidplfdir + "/" + timeInfo[0]) invalidPLF.write(invalidplfdir + "/" + timeInfo[0] + "\n") - rmLines.write(str(lineNo) + "\n") + rmLines.write("{}\n".format(lineNo)) else: provFile.write(PLFline) else: blankPLF.write(timeInfo[0] + "\n") - rmLines.write(str(lineNo) + "\n") + rmLines.write("{}\n".format(lineNo)) # Now convert to PLF lineNo += 1 diff --git a/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py b/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py index 5c09f09bc35..c7aa6affb11 100755 --- a/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py +++ b/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py @@ -1,10 +1,11 @@ #!/usr/bin/env python -# Copyright 2014 Gaurav Kumar. Apache 2.0 # -*- coding: utf-8 -*- # +# Copyright 2014 Gaurav Kumar. Apache 2.0 +# 2018 Nagendra Kumar Goel, Saikiran Valluri, GoVivace inc., Avaaya # Merges unique words from Spanish Fisher, Gigaword and the LDC spanish lexicon - -import sys +from __future__ import print_function +import sys, re import json import codecs import operator @@ -16,6 +17,7 @@ uw_gigaword = tmpdir + "/es_wordlist.json" uw_LDC = ldc_lexicon + "/callhome_spanish_lexicon_970908/preferences" +filtered_letters = re.compile(u'[¡¥ª°º¿àçèëìîôö0123456789]') merged_lexicon = [] # All three lexicons are in different formats # First add the data from lexicon_fisher (A) into the dictionary @@ -24,8 +26,7 @@ merged_lexicon.append(line.strip()) fisher.close() -print "After adding the fisher data, the lexicon contains " \ - + str(len(merged_lexicon)) + " entries." +print("After adding the fisher data, the lexicon contains {} entries".format(len(merged_lexicon))) # Now add data from the LDC lexicon ldc = codecs.open(uw_LDC, encoding='iso-8859-1') @@ -34,12 +35,11 @@ if entries[0].lower() not in merged_lexicon: merged_lexicon.append(entries[0].lower()) -print "After adding the LDC data, the lexicon contains " \ - + str(len(merged_lexicon)) + " entries." +print("After adding the LDC data, the lexicon contains {} entries".format(len(merged_lexicon))) # Finally add the gigaword data gigaword = json.load(open(uw_gigaword)) -gigaword = reversed(sorted(gigaword.iteritems(), key=operator.itemgetter(1))) +gigaword = reversed(sorted(gigaword.items(), key=operator.itemgetter(1))) for item in gigaword: # We need a maximum of wordlimit words in the lexicon @@ -49,16 +49,16 @@ if item[0].lower() not in merged_lexicon: merged_lexicon.append(item[0].lower()) -print "After adding the Gigaword data, the lexicon contains " \ - + str(len(merged_lexicon)) + " entries." 
+print("After adding the Gigaword data, the lexicon contains {} entries".format(len(merged_lexicon))) # Now write the uniquewords to a file lf = codecs.open(tmpdir + '/uniquewords64k', encoding='utf-8', mode='w+') ltuples = sorted(merged_lexicon) for item in ltuples: - lf.write(item + "\n") + if not item==u'ñ' and not re.search(filtered_letters, item): + lf.write(item + "\n") lf.close() -print "Finshed writing unique words" +print("Finshed writing unique words") diff --git a/egs/fisher_callhome_spanish/s5/local/nnet3/run_ivector_common.sh b/egs/fisher_callhome_spanish/s5/local/nnet3/run_ivector_common.sh new file mode 100755 index 00000000000..cc9de4d26c5 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5/local/nnet3/run_ivector_common.sh @@ -0,0 +1,187 @@ +#!/bin/bash + +set -e -o pipefail + +# This script is called from scripts like local/nnet3/run_tdnn.sh and +# local/chain/run_tdnn.sh (and may eventually be called by more scripts). It +# contains the common feature preparation and iVector-related parts of the +# script. See those scripts for examples of usage. + + +stage=7 +nj=30 +train_set=train # you might set this to e.g. train. +test_sets="test dev" +gmm=tri5a # This specifies a GMM-dir from the features of the type you're training the system on; + # it should contain alignments for 'train_set'. + +num_threads_ubm=32 +nnet3_affix= # affix for exp/nnet3 directory to put iVector stuff in (e.g. + # in the tedlium recip it's _cleaned). + +. ./cmd.sh +. ./path.sh +. utils/parse_options.sh + + +gmm_dir=exp/${gmm} +ali_dir=exp/${gmm}_ali_${train_set}_sp + +for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do + if [ ! -f $f ]; then + echo "$0: expected file $f to exist" + exit 1 + fi +done + + + +if [ $stage -le 7 ] && [ -f data/${train_set}_sp_hires/feats.scp ]; then + echo "$0: data/${train_set}_sp_hires/feats.scp already exists." + echo " ... Please either remove it, or rerun this script with stage > 7." + exit 1 +fi + + +if [ $stage -le 8 ]; then + echo "$0: preparing directory for speed-perturbed data" + utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp +fi + +if [ $stage -le 9 ]; then + echo "$0: creating high-resolution MFCC features" + + # this shows how you can split across multiple file-systems. we'll split the + # MFCC dir across multiple locations. You might want to be careful here, if you + # have multiple copies of Kaldi checked out and run the same recipe, not to let + # them overwrite each other. + mfccdir=data/${train_set}_sp_hires/data + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + utils/create_split_dir.pl /export/b0{5,6,7,8}/$USER/kaldi-data/mfcc/wsj-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage + fi + + for datadir in ${train_set}_sp ${test_sets}; do + utils/copy_data_dir.sh data/$datadir data/${datadir}_hires + done + + # do volume-perturbation on the training data prior to extracting hires + # features; this helps make trained nnets more invariant to test data volume. + utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires + + for datadir in ${train_set}_sp ${test_sets}; do + steps/make_mfcc.sh --nj $nj --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${datadir}_hires + steps/compute_cmvn_stats.sh data/${datadir}_hires + utils/fix_data_dir.sh data/${datadir}_hires + done +fi + +if [ $stage -le 10 ]; then + echo "$0: computing a subset of data to train the diagonal UBM." 
+ + mkdir -p exp/nnet3${nnet3_affix}/diag_ubm + temp_data_root=exp/nnet3${nnet3_affix}/diag_ubm + + # train a diagonal UBM using a subset of about a quarter of the data + num_utts_total=$(wc -l $text_dir/ami.txt + cat $dev | cut -d ' ' -f2- > $text_dir/dev.txt +fi + +if [ $stage -le 1 ]; then + cp $wordlist $dir/config/ + n=`cat $dir/config/words.txt | wc -l` + echo " $n" >> $dir/config/words.txt + + # words that are not present in words.txt but are in the training or dev data, will be + # mapped to during training. + echo "" >$dir/config/oov.txt + + cat > $dir/config/data_weights.txt <$dir/config/unigram_probs.txt + + # choose features + rnnlm/choose_features.py --unigram-probs=$dir/config/unigram_probs.txt \ + --use-constant-feature=true \ + --top-word-features 10000 \ + --min-frequency 1.0e-03 \ + --special-words=',,,,[noise],[laughter]' \ + $dir/config/words.txt > $dir/config/features.txt + +lstm_opts="l2-regularize=$comp_l2" +tdnn_opts="l2-regularize=$comp_l2" +output_opts="l2-regularize=$output_l2" + + cat >$dir/config/xconfig <&1 | awk 'FNR == 2 {print}'", stdout=subprocess.PIPE, shell=True) line = proc.stdout.readline() - print line + " " + str(lineNo) + print("{} {}".format(line, lineNo)) if line.strip() != "PLF format appears to be correct.": os.system("cp " + finalFST + " " + invalidplfdir + "/" + timeInfo[0]) invalidPLF.write(invalidplfdir + "/" + timeInfo[0] + "\n") - rmLines.write(str(lineNo) + "\n") + rmLines.write("{}\n".format(lineNo)) else: provFile.write(PLFline) else: blankPLF.write(timeInfo[0] + "\n") - rmLines.write(str(lineNo) + "\n") + rmLines.write("{}\n".format(lineNo)) # Now convert to PLF lineNo += 1 diff --git a/egs/fisher_callhome_spanish/s5/path.sh b/egs/fisher_callhome_spanish/s5/path.sh index 1a6fb5f891b..17ffb0369f8 100755 --- a/egs/fisher_callhome_spanish/s5/path.sh +++ b/egs/fisher_callhome_spanish/s5/path.sh @@ -3,3 +3,4 @@ export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 . $KALDI_ROOT/tools/config/common_path.sh export LC_ALL=C +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/dpovey/libs diff --git a/egs/fisher_callhome_spanish/s5/rnnlm b/egs/fisher_callhome_spanish/s5/rnnlm new file mode 120000 index 00000000000..fb754622d5e --- /dev/null +++ b/egs/fisher_callhome_spanish/s5/rnnlm @@ -0,0 +1 @@ +../../wsj/s5/rnnlm \ No newline at end of file diff --git a/egs/fisher_callhome_spanish/s5/run.sh b/egs/fisher_callhome_spanish/s5/run.sh index 57902a98fed..6e2752a7b68 100755 --- a/egs/fisher_callhome_spanish/s5/run.sh +++ b/egs/fisher_callhome_spanish/s5/run.sh @@ -1,20 +1,22 @@ #!/bin/bash # +# Copyright 2018 Nagendra Goel, Saikiran Valluri Apache 2.0 # Copyright 2014 Gaurav Kumar. Apache 2.0 # Recipe for Fisher/Callhome-Spanish -# Made to integrate KALDI with JOSHUA for end-to-end ASR and SMT stage=0 +train_stage=-20 +train_sgmm2=false # call the next line with the directory where the Spanish Fisher data is # (the values below are just an example). 
-sfisher_speech=/veu4/jadrian/data/LDC/LDC2010S01 -sfisher_transcripts=/veu4/jadrian/data/LDC/LDC2010T04 -spanish_lexicon=/veu4/jadrian/data/LDC/LDC96L16 +sfisher_speech=/export/corpora/LDC/LDC2010S01 +sfisher_transcripts=/export/corpora/LDC/LDC2010T04 +spanish_lexicon=/export/corpora/LDC/LDC96L16 split=local/splits/split_fisher -callhome_speech=/veu4/jadrian/data/LDC/LDC96S35 -callhome_transcripts=/veu4/jadrian/data/LDC/LDC96T17 +callhome_speech=/export/corpora/LDC/LDC96S35 +callhome_transcripts=/export/corpora/LDC/LDC96T17 split_callhome=local/splits/split_callhome mfccdir=`pwd`/mfcc @@ -25,7 +27,7 @@ if [ -f path.sh ]; then . ./path.sh; fi set -e -if [ $stage -lt 1 ]; then +if [ $stage -le 1 ]; then local/fsp_data_prep.sh $sfisher_speech $sfisher_transcripts local/callhome_data_prep.sh $callhome_speech $callhome_transcripts @@ -95,7 +97,7 @@ if [ $stage -lt 1 ]; then local/callhome_create_splits.sh $split_callhome fi -if [ $stage -lt 2 ]; then +if [ $stage -le 2 ]; then # Now compute CMVN stats for the train, dev and test subsets steps/compute_cmvn_stats.sh data/dev exp/make_mfcc/dev $mfccdir steps/compute_cmvn_stats.sh data/test exp/make_mfcc/test $mfccdir @@ -124,90 +126,95 @@ if [ $stage -lt 2 ]; then utils/subset_data_dir.sh --speakers data/train 90000 data/train_100k fi +if [ $stage -le 3 ]; then + steps/train_mono.sh --nj 10 --cmd "$train_cmd" \ + data/train_10k_nodup data/lang exp/mono0a -steps/train_mono.sh --nj 10 --cmd "$train_cmd" \ - data/train_10k_nodup data/lang exp/mono0a + steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_30k data/lang exp/mono0a exp/mono0a_ali || exit 1; -steps/align_si.sh --nj 30 --cmd "$train_cmd" \ - data/train_30k data/lang exp/mono0a exp/mono0a_ali || exit 1; - -steps/train_deltas.sh --cmd "$train_cmd" \ + steps/train_deltas.sh --cmd "$train_cmd" \ 2500 20000 data/train_30k data/lang exp/mono0a_ali exp/tri1 || exit 1; -(utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph - steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ - exp/tri1/graph data/dev exp/tri1/decode_dev)& + (utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph + steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + exp/tri1/graph data/dev exp/tri1/decode_dev)& -steps/align_si.sh --nj 30 --cmd "$train_cmd" \ - data/train_30k data/lang exp/tri1 exp/tri1_ali || exit 1; + steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_30k data/lang exp/tri1 exp/tri1_ali || exit 1; -steps/train_deltas.sh --cmd "$train_cmd" \ + steps/train_deltas.sh --cmd "$train_cmd" \ 2500 20000 data/train_30k data/lang exp/tri1_ali exp/tri2 || exit 1; -( - utils/mkgraph.sh data/lang_test exp/tri2 exp/tri2/graph || exit 1; - steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ - exp/tri2/graph data/dev exp/tri2/decode_dev || exit 1; -)& - + ( + utils/mkgraph.sh data/lang_test exp/tri2 exp/tri2/graph || exit 1; + steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + exp/tri2/graph data/dev exp/tri2/decode_dev || exit 1; + )& +fi -steps/align_si.sh --nj 30 --cmd "$train_cmd" \ - data/train_100k data/lang exp/tri2 exp/tri2_ali || exit 1; +if [ $stage -le 4 ]; then + steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_100k data/lang exp/tri2 exp/tri2_ali || exit 1; # Train tri3a, which is LDA+MLLT, on 100k data. 
-steps/train_lda_mllt.sh --cmd "$train_cmd" \ + steps/train_lda_mllt.sh --cmd "$train_cmd" \ --splice-opts "--left-context=3 --right-context=3" \ 3000 40000 data/train_100k data/lang exp/tri2_ali exp/tri3a || exit 1; -( - utils/mkgraph.sh data/lang_test exp/tri3a exp/tri3a/graph || exit 1; - steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ - exp/tri3a/graph data/dev exp/tri3a/decode_dev || exit 1; -)& - + ( + utils/mkgraph.sh data/lang_test exp/tri3a exp/tri3a/graph || exit 1; + steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + exp/tri3a/graph data/dev exp/tri3a/decode_dev || exit 1; + )& +fi +if [ $stage -le 5 ]; then # Next we'll use fMLLR and train with SAT (i.e. on # fMLLR features) -steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ - data/train_100k data/lang exp/tri3a exp/tri3a_ali || exit 1; + steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ + data/train_100k data/lang exp/tri3a exp/tri3a_ali || exit 1; -steps/train_sat.sh --cmd "$train_cmd" \ - 4000 60000 data/train_100k data/lang exp/tri3a_ali exp/tri4a || exit 1; + steps/train_sat.sh --cmd "$train_cmd" \ + 4000 60000 data/train_100k data/lang exp/tri3a_ali exp/tri4a || exit 1; -( - utils/mkgraph.sh data/lang_test exp/tri4a exp/tri4a/graph - steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ - exp/tri4a/graph data/dev exp/tri4a/decode_dev + ( + utils/mkgraph.sh data/lang_test exp/tri4a exp/tri4a/graph + steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + exp/tri4a/graph data/dev exp/tri4a/decode_dev )& -steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ - data/train data/lang exp/tri4a exp/tri4a_ali || exit 1; + steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ + data/train data/lang exp/tri4a exp/tri4a_ali || exit 1; # Reduce the number of gaussians -steps/train_sat.sh --cmd "$train_cmd" \ - 5000 120000 data/train data/lang exp/tri4a_ali exp/tri5a || exit 1; + steps/train_sat.sh --cmd "$train_cmd" \ + 5000 120000 data/train data/lang exp/tri4a_ali exp/tri5a || exit 1; -( - utils/mkgraph.sh data/lang_test exp/tri5a exp/tri5a/graph - steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ - exp/tri5a/graph data/dev exp/tri5a/decode_dev - steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ - exp/tri5a/graph data/test exp/tri5a/decode_test + ( + utils/mkgraph.sh data/lang_test exp/tri5a exp/tri5a/graph + steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + exp/tri5a/graph data/dev exp/tri5a/decode_dev + steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + exp/tri5a/graph data/test exp/tri5a/decode_test # Decode CALLHOME - steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ - exp/tri5a/graph data/callhome_test exp/tri5a/decode_callhome_test - steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ - exp/tri5a/graph data/callhome_dev exp/tri5a/decode_callhome_dev - steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ - exp/tri5a/graph data/callhome_train exp/tri5a/decode_callhome_train -) & - + steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + exp/tri5a/graph data/callhome_test exp/tri5a/decode_callhome_test + steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + exp/tri5a/graph data/callhome_dev exp/tri5a/decode_callhome_dev + steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" 
--config conf/decode.config \ + exp/tri5a/graph data/callhome_train exp/tri5a/decode_callhome_train + ) & + + + steps/align_fmllr.sh \ + --boost-silence 0.5 --nj 32 --cmd "$train_cmd" \ + data/train data/lang exp/tri5a exp/tri5a_ali +fi -steps/align_fmllr.sh \ - --boost-silence 0.5 --nj 32 --cmd "$train_cmd" \ - data/train data/lang exp/tri5a exp/tri5a_ali +if $train_sgmm2; then steps/train_ubm.sh \ --cmd "$train_cmd" 750 \ @@ -258,22 +265,7 @@ for iter in 1 2 3 4; do done ) & -dnn_cpu_parallel_opts=(--minibatch-size 128 --max-change 10 --num-jobs-nnet 8 --num-threads 16 \ - --parallel-opts "--num-threads 16") -dnn_gpu_parallel_opts=(--minibatch-size 512 --max-change 40 --num-jobs-nnet 4 --num-threads 1 \ - --parallel-opts "--gpu 1") - -steps/nnet2/train_pnorm_ensemble.sh \ - --mix-up 5000 --initial-learning-rate 0.008 --final-learning-rate 0.0008\ - --num-hidden-layers 4 --pnorm-input-dim 2000 --pnorm-output-dim 200\ - --cmd "$train_cmd" \ - "${dnn_gpu_parallel_opts[@]}" \ - --ensemble-size 4 --initial-beta 0.1 --final-beta 5 \ - data/train data/lang exp/tri5a_ali exp/tri6a_dnn +fi -( - steps/nnet2/decode.sh --nj 13 --cmd "$decode_cmd" --num-threads 4 \ - --scoring-opts "--min-lmwt 8 --max-lmwt 16" --transform-dir exp/tri5a/decode_dev exp/tri5a/graph data/dev exp/tri6a_dnn/decode_dev -) & -wait +local/chain/run_tdnn_1g.sh --stage $stage --train-stage $train_stage || exit 1; exit 0; diff --git a/egs/fisher_english/s5/local/chain/run_tdnn.sh b/egs/fisher_english/s5/local/chain/run_tdnn.sh index 14174e617c4..1fd0f1fdf3a 100755 --- a/egs/fisher_english/s5/local/chain/run_tdnn.sh +++ b/egs/fisher_english/s5/local/chain/run_tdnn.sh @@ -112,7 +112,7 @@ if [ $stage -le 12 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_1a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_1a.sh index e95de232304..07636a8b3c8 100644 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_1a.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_1a.sh @@ -209,11 +209,11 @@ diff $sup_tree_dir/tree $sup_chain_dir/tree || { echo "$0: $sup_tree_dir/tree an # steps/nnet3/chain/build_tree_multiple_sources.sh \ # --use-fmllr false --context-opts "--context-width=2 --central-position=1" \ # --frame-subsampling-factor $frame_subsampling_factor \ -# 7000 $lang \ +# 7000 $unsup_decode_lang \ # data/${supervised_set_perturbed} \ # ${sup_tree_dir} \ # data/${unsupervised_set_perturbed} \ -# $chaindir/best_path_${unsupervised_set_perturbed} \ +# ${sup_chain_dir}/best_path_${unsupervised_set_perturbed} \ # $treedir || exit 1 # fi # @@ -231,7 +231,7 @@ if [ $stage -le 11 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $sup_tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_1a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_1a.sh index 
e76df666e8a..b1c133942ef 100755 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_1a.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_1a.sh @@ -142,7 +142,7 @@ if [ $stage -le 13 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_1a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_1a.sh index 2d5b2f8480e..04244014502 100755 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_1a.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_1a.sh @@ -228,11 +228,11 @@ diff $sup_tree_dir/tree $sup_chain_dir/tree || { echo "$0: $sup_tree_dir/tree an # steps/nnet3/chain/build_tree_multiple_sources.sh \ # --use-fmllr false --context-opts "--context-width=2 --central-position=1" \ # --frame-subsampling-factor $frame_subsampling_factor \ -# 7000 $lang \ +# 7000 $unsup_decode_lang \ # data/${supervised_set_perturbed} \ # ${sup_tree_dir} \ # data/${unsupervised_set_perturbed} \ -# $chaindir/best_path_${unsupervised_set_perturbed} \ +# ${sup_chain_dir}/best_path_${unsupervised_set_perturbed}_big \ # $treedir || exit 1 # fi # @@ -250,7 +250,7 @@ if [ $stage -le 11 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $sup_tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/fisher_swbd/s5/local/chain/run_blstm_6j.sh b/egs/fisher_swbd/s5/local/chain/run_blstm_6j.sh index cbf0ef6cb6c..c12f604f26b 100755 --- a/egs/fisher_swbd/s5/local/chain/run_blstm_6j.sh +++ b/egs/fisher_swbd/s5/local/chain/run_blstm_6j.sh @@ -133,7 +133,7 @@ if [ $stage -le 12 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_7c.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_7c.sh index 12b3187a5fa..efcd1eced4a 100644 --- a/egs/fisher_swbd/s5/local/chain/run_tdnn_7c.sh +++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_7c.sh @@ -129,7 +129,7 @@ if [ $stage -le 12 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_7d.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_7d.sh index 7d640c3262a..e4a555abfdd 100644 --- a/egs/fisher_swbd/s5/local/chain/run_tdnn_7d.sh +++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_7d.sh @@ -134,7 +134,7 @@ if [ $stage -le 12 ]; then echo "$0: creating neural net configs using the xconfig parser"; 
num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) opts="l2-regularize=0.002" linear_opts="orthonormal-constraint=1.0" output_opts="l2-regularize=0.0005 bottleneck-dim=256" diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a.sh index 07e88b59ddc..5650cedca28 100755 --- a/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a.sh +++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a.sh @@ -142,7 +142,7 @@ if [ $stage -le 12 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=20" mkdir -p $dir/configs diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a_svd.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a_svd.sh new file mode 100644 index 00000000000..5beb2e74a9a --- /dev/null +++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a_svd.sh @@ -0,0 +1,448 @@ +#!/bin/bash +# +# Copyright 2018 Nagendra Kumar Goel, +# Saikiran Valluri, Govivace.Inc - Apache 2.0 + +# The script is organized as follows. +# First we train the baseline LSTMP-TDNN chain model for a few epochs on the (Fisher + SWBD) English data. +# Then we perform SVD-based refactoring of all the affine components in this baseline final.mdl, +# in order to reduce the overall model size, +# as determined by the bottleneck dim value or by the energy and shrinkage threshold values. +# Finally, we fine-tune the weight parameters of the refactored model on the entire Fisher + Switchboard data for a single epoch. + +# Command used for comparing the WERs of the pre-SVD and SVD models on different test sets: +# ./local/chain/compare_wer_general.sh --looped tdnn_lstm_1a_sp tdnn_lstm_1a_svd_sp +# +# Please run this entire script to the end before running the above WER comparison command.
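Editor's note: the energy and shrinkage thresholds mentioned above (set below as energy_threshold=0.81 and shrinkage_threshold=0.64) control the SVD step roughly as follows: for each affine weight matrix, keep the smallest number of singular values whose squared sum reaches the energy threshold, and accept the resulting two-factor replacement only if it shrinks the parameter count below the shrinkage threshold. The NumPy sketch below illustrates that idea; it is not the nnet3 apply-svd implementation, and the exact definitions used there may differ.

    # Hedged sketch of SVD compression of one weight matrix under an energy
    # threshold and a shrinkage check; not the nnet3 apply-svd code.
    import numpy as np

    def compress(W, energy_threshold=0.81, shrinkage_threshold=0.64):
        U, s, Vt = np.linalg.svd(W, full_matrices=False)
        energy = np.cumsum(s ** 2) / np.sum(s ** 2)
        k = int(np.searchsorted(energy, energy_threshold)) + 1   # smallest rank reaching the threshold
        A, B = U[:, :k] * s[:k], Vt[:k, :]                       # W is approximated by A @ B
        shrinkage = (A.size + B.size) / W.size                   # params after / params before
        if shrinkage > shrinkage_threshold:
            return W, 1.0        # not enough savings, leave this component alone
        return (A, B), shrinkage

    W = np.random.randn(1024, 100) @ np.random.randn(100, 1024)  # a roughly low-rank weight matrix
    factors, shrinkage = compress(W)
    print(shrinkage)   # well under 0.64, so the factorization would be kept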
+ + +# System tdnn_lstm_1a_sp +# WER on eval2000(tg) 12.3 +# [looped:] 12.2 +# WER on eval2000(fg) 12.1 +# [looped:] 12.1 +# WER on eval2000(fg) +# [SVD retrained + looped] 12.1 +# WER on rt03(tg) 11.6 +# [looped:] 11.6 +# WER on rt03(tg) +# [SVD retrained] 12 +# WER on rt03(fg) 11.3 +# [looped:] 11.3 +# Final train prob -0.074 +# Final valid prob -0.084 +# Final train prob (xent) -0.882 +# Final valid prob (xent) -0.9393 + +# WER stats for eval2000 using tdnn_lstm_1a_sp +# | #Snt #Wrd | Corr Sub Del Ins Err S.Err | +# %WER 16.0 | 2628 21594 | 86.3 9.0 4.7 2.3 16.0 54.4 | exp/chain/tdnn_lstm_1a_sp/decode_eval2000_fsh_sw1_tg/score_7_0.0/eval2000_hires.ctm.callhm.filt.sys +# %WER 12.3 | 4459 42989 | 89.4 7.1 3.5 1.7 12.3 49.8 | exp/chain/tdnn_lstm_1a_sp/decode_eval2000_fsh_sw1_tg/score_8_0.0/eval2000_hires.ctm.filt.sys +# %WER 8.4 | 1831 21395 | 92.7 5.1 2.2 1.1 8.4 42.3 | exp/chain/tdnn_lstm_1a_sp/decode_eval2000_fsh_sw1_tg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +# %WER 15.9 | 2628 21594 | 86.4 8.9 4.7 2.3 15.9 54.3 | exp/chain/tdnn_lstm_1a_sp/decode_eval2000_fsh_sw1_fg/score_7_0.0/eval2000_hires.ctm.callhm.filt.sys +# %WER 12.1 | 4459 42989 | 89.6 6.9 3.5 1.7 12.1 49.2 | exp/chain/tdnn_lstm_1a_sp/decode_eval2000_fsh_sw1_fg/score_8_0.0/eval2000_hires.ctm.filt.sys +# %WER 8.2 | 1831 21395 | 93.1 5.1 1.8 1.3 8.2 41.7 | exp/chain/tdnn_lstm_1a_sp/decode_eval2000_fsh_sw1_fg/score_8_0.0/eval2000_hires.ctm.swbd.filt.sys + +# WER stats for rt03 using tdnn_lstm_1a_sp +# %WER 9.6 | 3970 36721 | 91.5 5.5 3.0 1.1 9.6 41.2 | exp/chain/tdnn_lstm_1a_sp/decode_rt03_fsh_sw1_tg/score_7_0.0/rt03_hires.ctm.fsh.filt.sys +# %WER 11.6 | 8420 76157 | 89.7 6.8 3.4 1.4 11.6 43.0 | exp/chain/tdnn_lstm_1a_sp/decode_rt03_fsh_sw1_tg/score_7_0.0/rt03_hires.ctm.filt.sys +# %WER 13.3 | 4450 39436 | 88.0 7.4 4.6 1.3 13.3 44.5 | exp/chain/tdnn_lstm_1a_sp/decode_rt03_fsh_sw1_tg/score_9_0.0/rt03_hires.ctm.swbd.filt.sys +# %WER 9.4 | 3970 36721 | 91.8 5.3 2.9 1.1 9.4 40.3 | exp/chain/tdnn_lstm_1a_sp/decode_rt03_fsh_sw1_fg/score_7_0.0/rt03_hires.ctm.fsh.filt.sys +# %WER 11.3 | 8420 76157 | 89.9 6.4 3.7 1.2 11.3 42.4 | exp/chain/tdnn_lstm_1a_sp/decode_rt03_fsh_sw1_fg/score_8_0.0/rt03_hires.ctm.filt.sys +# %WER 13.1 | 4450 39436 | 88.3 7.5 4.2 1.4 13.1 44.0 | exp/chain/tdnn_lstm_1a_sp/decode_rt03_fsh_sw1_fg/score_8_0.0/rt03_hires.ctm.swbd.filt.sys + +# WER stats for rt03 using tdnn_lstm_1a_svd_sp +# %WER 9.7 | 3970 36721 | 91.3 5.9 2.8 1.0 9.7 40.0 | exp/chain/tdnn_lstm_1a_svd_sp/decode_rt03_fsh_sw1_tg/score_8_0.0/rt03_hires.ctm.fsh.filt.sys +# %WER 12 | 8420 76157 | 89.3 7.3 3.4 1.3 12.0 42.0 | exp/chain/tdnn_lstm_1a_svd_sp/decode_rt03_fsh_sw1_tg/score_8_0.0/rt03_hires.ctm.filt.sys +# %WER 14.1 | 4450 39436 | 87.4 8.2 4.3 1.5 14.1 44.6 | exp/chain/tdnn_lstm_1a_svd_sp/decode_rt03_fsh_sw1_tg/score_9_0.0/rt03_hires.ctm.swbd.filt.sys + + +set -e + +# configs for 'chain' +stage=0 +train_stage=-20 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_lstm_1a # Note: _sp will get added to this if $speed_perturb == true. +svd_dir=${dir}_svd # Note: _sp will get added to this if $speed_perturb == true. 
+decode_iter= +decode_dir_affix= + +# training options +leftmost_questions_truncate=-1 +chunk_width=150 +chunk_left_context=40 +chunk_right_context=0 +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 +# decode options +extra_left_context=50 +extra_right_context=0 +frames_per_chunk= + +remove_egs=false +common_egs_dir= + +affix= + +# config for svd +apply_svd=true +energy_threshold=0.81 +shrinkage_threshold=0.64 +primary_lr_factor=0.25 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 11000 data/$build_tree_train_set $lang $build_tree_ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + lstm_opts="decay-time=20" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2, ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=1024 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 + lstmp-layer name=lstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 + lstmp-layer name=lstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
+ output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 1200000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri5a_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +src_mdl=${dir}/final.mdl +if $apply_svd && [ $stage -le 14 ]; then + # model compression using SVD + + # threshold configs for tdnn layers + mkdir -p $svd_dir/configs + edits_config=$svd_dir/configs/final.config + common_egs_dir=$dir/egs + cat < ${edits_config} + set-learning-rate-factor learning-rate-factor=$primary_lr_factor + apply-svd name=* energy-threshold=$energy_threshold shrinkage-threshold=$shrinkage_threshold +EOF + + # Copy files / directories from source directory + cp ${dir}/{cmvn_opts,tree,frame_subsampling_factor,0.trans_mdl,normalization.fst,den.fst} $svd_dir/. 
+ + # Generate initial model from trained model + $train_cmd $svd_dir/log/generate_input_mdl.log \ + nnet3-am-copy --edits-config=$edits_config $src_mdl $svd_dir/input.raw + + # Retrain the model for 1 epoch + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --trainer.input-model $svd_dir/input.raw \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 1200000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 1 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri5a_lats_nodup$suffix \ + --dir ${svd_dir} || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg +fi + +decode_suff=fsh_sw1_tg +graph_dir=$dir/graph_fsh_sw1_tg + +if [ $stage -le 16 ]; then + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $extra_right_context ] && extra_right_context=$chunk_right_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in rt03 eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_fsh_sw1_{tg,fg} || exit 1; + fi + ) & + done +fi + +test_online_decoding=true +lang=data/lang_fsh_sw1_tg +if $test_online_decoding && [ $stage -le 17 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. 
+ steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3/extractor $dir ${dir}_online + + rm $dir/.error 2>/dev/null || true + for decode_set in rt03 eval2000; do + ( + # note: we just give it "$decode_set" as it only uses the wav.scp, the + # feature type does not matter. + + steps/online/nnet3/decode.sh --nj 50 --cmd "$decode_cmd" $iter_opts \ + --acwt 1.0 --post-decode-acwt 10.0 \ + $graph_dir data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_fsh_sw1_{tg,fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in online decoding" + exit 1 + fi +fi + +if $apply_svd; then + # Decoding the svd retrained model. + dir=$svd_dir +fi + +if [ $stage -le 18 ]; then + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $extra_right_context ] && extra_right_context=$chunk_right_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in rt03 eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_fsh_sw1_{tg,fg} || exit 1; + fi + ) & + done +fi + +test_online_decoding=true +lang=data/lang_fsh_sw1_tg +if $test_online_decoding && [ $stage -le 19 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3/extractor $dir ${dir}_online + + rm $dir/.error 2>/dev/null || true + for decode_set in rt03 eval2000; do + ( + # note: we just give it "$decode_set" as it only uses the wav.scp, the + # feature type does not matter. 
+ + steps/online/nnet3/decode.sh --nj 50 --cmd "$decode_cmd" $iter_opts \ + --acwt 1.0 --post-decode-acwt 10.0 \ + $graph_dir data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_fsh_sw1_{tg,fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in online decoding" + exit 1 + fi +fi + +exit 0; diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1b.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1b.sh index c9d50d1f7bd..f3cc869e6de 100755 --- a/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1b.sh +++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1b.sh @@ -151,7 +151,7 @@ if [ $stage -le 12 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=20 dropout-proportion=0.0" mkdir -p $dir/configs diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1a.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1a.sh index 1cce08abeee..059a81e15fc 100755 --- a/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1a.sh +++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1a.sh @@ -148,7 +148,7 @@ if [ $stage -le 12 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) gru_opts="dropout-per-frame=true dropout-proportion=0.0 " mkdir -p $dir/configs diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1b.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1b.sh index 2334c6a1bc1..d86b699d6f6 100755 --- a/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1b.sh +++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1b.sh @@ -149,7 +149,7 @@ if [ $stage -le 12 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) gru_opts="dropout-per-frame=true dropout-proportion=0.0 " mkdir -p $dir/configs diff --git a/egs/fisher_swbd/s5/local/fisher_data_prep.sh b/egs/fisher_swbd/s5/local/fisher_data_prep.sh index 909e53aaf30..186f7d7e122 100755 --- a/egs/fisher_swbd/s5/local/fisher_data_prep.sh +++ b/egs/fisher_swbd/s5/local/fisher_data_prep.sh @@ -118,7 +118,7 @@ if [ $stage -le 1 ]; then $line1 =~ m/# (.+)\.sph/ || die "Bad first line $line1 in file $file"; $call_id eq $1 || die "Mismatch call-id $call_id vs $1\n"; while () { - if (m/([0-9.]+)\s+([0-9.]+) ([AB]):\s*(\S.+\S|\S)\s*$/) { + if (m/([0-9.]+)\s+([0-9.]+) ([AB]):\s*(\S.*\S|\S)\s*$/) { $start = sprintf("%06d", $1 * 100.0); $end = sprintf("%06d", $2 * 100.0); length($end) > 6 && die "Time too long $end in file $file"; diff --git a/egs/fisher_swbd/s5/local/format_acronyms_ctm_eval2000.py b/egs/fisher_swbd/s5/local/format_acronyms_ctm_eval2000.py index 3c447c5976a..75cc4458d85 100755 --- 
a/egs/fisher_swbd/s5/local/format_acronyms_ctm_eval2000.py +++ b/egs/fisher_swbd/s5/local/format_acronyms_ctm_eval2000.py @@ -10,6 +10,7 @@ # en_4156 B 414.58 0.16 l # en_4156 B 414.74 0.17 a +from __future__ import division import argparse,re __author__ = 'Minhua Wu' @@ -27,7 +28,7 @@ if items[4].find(".") != -1: letters = items[4].split("._") acronym_period = round(float(items[3]), 2) - letter_slot = round(acronym_period / len(letters), 2) + letter_slot = round(acronym_period/len(letters), 2) time_start = round(float(items[2]), 2) for l in letters[:-1]: time = " %.2f %.2f " % (time_start, letter_slot) diff --git a/egs/fisher_swbd/s5/local/format_acronyms_ctm_rt03.py b/egs/fisher_swbd/s5/local/format_acronyms_ctm_rt03.py index 59814beb4ea..c3f9af09c99 100755 --- a/egs/fisher_swbd/s5/local/format_acronyms_ctm_rt03.py +++ b/egs/fisher_swbd/s5/local/format_acronyms_ctm_rt03.py @@ -10,6 +10,7 @@ # en_4156 B 414.58 0.16 l # en_4156 B 414.74 0.17 a +from __future__ import division import argparse,re __author__ = 'Minhua Wu' @@ -27,7 +28,7 @@ if items[4].find(".") != -1: letters = items[4].split("._") acronym_period = round(float(items[3]), 2) - letter_slot = round(acronym_period / len(letters), 2) + letter_slot = round(acronym_period/ len(letters), 2) time_start = round(float(items[2]), 2) for l in letters[:-1]: time = " %.2f %.2f " % (time_start, letter_slot) diff --git a/egs/formosa/README.txt b/egs/formosa/README.txt new file mode 100644 index 00000000000..3b9d78dad92 --- /dev/null +++ b/egs/formosa/README.txt @@ -0,0 +1,22 @@ +### Welcome to the demo recipe of the Formosa Speech in the Wild (FSW) Project ### + +The language habits of Taiwanese people are different from those of other Mandarin speakers (both accents and cultures) [1]. In particular, Taiwanese use traditional Chinese characters (i.e., 繁體中文). To address this issue, a Taiwanese speech corpus collection project "Formosa Speech in the Wild (FSW)" was initiated in 2017 to improve the development of Taiwanese-specific speech recognition techniques. + +The FSW corpus will be a large-scale database of real-life, multi-genre Taiwanese spontaneous speech collected and transcribed from various sources (radio, TV, open courses, etc.). To demonstrate that this database is a reasonable data resource for Taiwanese spontaneous speech recognition research, a baseline recipe is provided here for everybody, especially students, to develop their own systems easily and quickly. + +This recipe is based on the "NER-Trs-Vol1" corpus (about 150 hours of broadcast radio speech selected from FSW). For more details, please visit: +* Formosa Speech in the Wild (FSW) project (https://sites.google.com/speech.ntut.edu.tw/fsw) + +If you want to apply for the NER-Trs-Vol1 corpus, please contact Yuan-Fu Liao (廖元甫) via "yfliao@mail.ntut.edu.tw". This corpus is only for non-commercial research/education use and will be distributed via our GitLab server at https://speech.nchc.org.tw. + +Any bugs, errors, comments or suggestions are very welcome. + +Yuan-Fu Liao (廖元甫) +Associate Professor +Department of Electronic Engineering, +National Taipei University of Technology +http://www.ntut.edu.tw/~yfliao +yfliao@mail.ntut.edu.tw + +............ +[1] The languages of Taiwan consist of several varieties of languages under the families of the Austronesian languages and the Sino-Tibetan languages. Taiwanese Mandarin, Hokkien, Hakka and Formosan languages are used by 83.5%, 81.9%, 6.6% and 1.4% of the population respectively (2010).
Given the prevalent use of Taiwanese Hokkien, the Mandarin spoken in Taiwan has been to a great extent influenced by it. diff --git a/egs/formosa/s5/RESULTS b/egs/formosa/s5/RESULTS new file mode 100644 index 00000000000..b047e5cefe4 --- /dev/null +++ b/egs/formosa/s5/RESULTS @@ -0,0 +1,43 @@ +# +# Reference results +# +# Experimental settings: +# +# training set: show CS, BG, DA, QG, SR, SY and WK, in total 18977 utt., 1,088,948 words +# test set: show JZ, GJ, KX and YX, in total 2112 utt., 135,972 words +# eval set: show JX, TD and WJ, in total 2222 utt., 104,648 words +# +# lexicon: 274,036 words +# phones (IPA): 196 (tonal) +# + +# WER: test + +%WER 61.32 [ 83373 / 135972, 5458 ins, 19156 del, 58759 sub ] exp/mono/decode_test/wer_11_0.0 +%WER 41.00 [ 55742 / 135972, 6725 ins, 12763 del, 36254 sub ] exp/tri1/decode_test/wer_15_0.0 +%WER 40.41 [ 54948 / 135972, 7366 ins, 11505 del, 36077 sub ] exp/tri2/decode_test/wer_14_0.0 +%WER 38.67 [ 52574 / 135972, 6855 ins, 11250 del, 34469 sub ] exp/tri3a/decode_test/wer_15_0.0 +%WER 35.70 [ 48546 / 135972, 7197 ins, 9717 del, 31632 sub ] exp/tri4a/decode_test/wer_17_0.0 +%WER 32.11 [ 43661 / 135972, 6112 ins, 10185 del, 27364 sub ] exp/tri5a/decode_test/wer_17_0.5 +%WER 31.36 [ 42639 / 135972, 6846 ins, 8860 del, 26933 sub ] exp/tri5a_cleaned/decode_test/wer_17_0.5 +%WER 24.43 [ 33218 / 135972, 5524 ins, 7583 del, 20111 sub ] exp/nnet3/tdnn_sp/decode_test/wer_12_0.0 +%WER 23.95 [ 32568 / 135972, 4457 ins, 10271 del, 17840 sub ] exp/chain/tdnn_1a_sp/decode_test/wer_10_0.0 +%WER 23.54 [ 32006 / 135972, 4717 ins, 8644 del, 18645 sub ] exp/chain/tdnn_1b_sp/decode_test/wer_10_0.0 +%WER 20.64 [ 28067 / 135972, 4434 ins, 7946 del, 15687 sub ] exp/chain/tdnn_1c_sp/decode_test/wer_11_0.0 +%WER 20.98 [ 28527 / 135972, 4706 ins, 7816 del, 16005 sub ] exp/chain/tdnn_1d_sp/decode_test/wer_10_0.0 + +# CER: test + +%WER 54.09 [ 116688 / 215718, 4747 ins, 24510 del, 87431 sub ] exp/mono/decode_test/cer_10_0.0 +%WER 32.61 [ 70336 / 215718, 5866 ins, 16282 del, 48188 sub ] exp/tri1/decode_test/cer_13_0.0 +%WER 32.10 [ 69238 / 215718, 6186 ins, 15772 del, 47280 sub ] exp/tri2/decode_test/cer_13_0.0 +%WER 30.40 [ 65583 / 215718, 6729 ins, 13115 del, 45739 sub ] exp/tri3a/decode_test/cer_12_0.0 +%WER 27.53 [ 59389 / 215718, 6311 ins, 13008 del, 40070 sub ] exp/tri4a/decode_test/cer_15_0.0 +%WER 24.21 [ 52232 / 215718, 6425 ins, 11543 del, 34264 sub ] exp/tri5a/decode_test/cer_15_0.0 +%WER 23.41 [ 50492 / 215718, 6645 ins, 10997 del, 32850 sub ] exp/tri5a_cleaned/decode_test/cer_17_0.0 +%WER 17.07 [ 36829 / 215718, 4734 ins, 9938 del, 22157 sub ] exp/nnet3/tdnn_sp/decode_test/cer_12_0.0 +%WER 16.83 [ 36305 / 215718, 4772 ins, 10810 del, 20723 sub ] exp/chain/tdnn_1a_sp/decode_test/cer_9_0.0 +%WER 16.44 [ 35459 / 215718, 4216 ins, 11278 del, 19965 sub ] exp/chain/tdnn_1b_sp/decode_test/cer_10_0.0 +%WER 13.72 [ 29605 / 215718, 4678 ins, 8066 del, 16861 sub ] exp/chain/tdnn_1c_sp/decode_test/cer_10_0.0 +%WER 14.08 [ 30364 / 215718, 5182 ins, 7588 del, 17594 sub ] exp/chain/tdnn_1d_sp/decode_test/cer_9_0.0 + diff --git a/egs/formosa/s5/cmd.sh b/egs/formosa/s5/cmd.sh new file mode 100755 index 00000000000..66ae9090820 --- /dev/null +++ b/egs/formosa/s5/cmd.sh @@ -0,0 +1,27 @@ +# "queue.pl" uses qsub. The options to it are +# options to qsub. If you have GridEngine installed, +# change this to a queue you have access to. +# Otherwise, use "run.pl", which will run jobs locally +# (make sure your --num-jobs options are no more than +# the number of cpus on your machine. 
+ +# Run locally: +#export train_cmd=run.pl +#export decode_cmd=run.pl + +# JHU cluster (or most clusters using GridEngine, with a suitable +# conf/queue.conf). +export train_cmd="queue.pl" +export decode_cmd="queue.pl --mem 4G" + +host=$(hostname -f) +if [ ${host#*.} == "fit.vutbr.cz" ]; then + queue_conf=$HOME/queue_conf/default.conf # see example /homes/kazi/iveselyk/queue_conf/default.conf, + export train_cmd="queue.pl --config $queue_conf --mem 2G --matylda 0.2" + export decode_cmd="queue.pl --config $queue_conf --mem 3G --matylda 0.1" + export cuda_cmd="queue.pl --config $queue_conf --gpu 1 --mem 10G --tmp 40G" +elif [ ${host#*.} == "cm.cluster" ]; then + # MARCC bluecrab cluster: + export train_cmd="slurm.pl --time 4:00:00 " + export decode_cmd="slurm.pl --mem 4G --time 4:00:00 " +fi diff --git a/egs/formosa/s5/conf/decode.config b/egs/formosa/s5/conf/decode.config new file mode 100644 index 00000000000..d91f86183af --- /dev/null +++ b/egs/formosa/s5/conf/decode.config @@ -0,0 +1,5 @@ +beam=11.0 # beam for decoding. Was 13.0 in the scripts. +first_beam=8.0 # beam for 1st-pass decoding in SAT. + + + diff --git a/egs/formosa/s5/conf/mfcc.conf b/egs/formosa/s5/conf/mfcc.conf new file mode 100644 index 00000000000..a1aa3d6c158 --- /dev/null +++ b/egs/formosa/s5/conf/mfcc.conf @@ -0,0 +1,2 @@ +--use-energy=false # only non-default option. +--sample-frequency=16000 diff --git a/egs/formosa/s5/conf/mfcc_hires.conf b/egs/formosa/s5/conf/mfcc_hires.conf new file mode 100644 index 00000000000..ca067e77b37 --- /dev/null +++ b/egs/formosa/s5/conf/mfcc_hires.conf @@ -0,0 +1,10 @@ +# config for high-resolution MFCC features, intended for neural network training. +# Note: we keep all cepstra, so it has the same info as filterbank features, +# but MFCC is more easily compressible (because less correlated) which is why +# we prefer this method. +--use-energy=false # use average of log energy, not energy. +--sample-frequency=16000 # this corpus is sampled at 16kHz +--num-mel-bins=40 # similar to Google's setup. +--num-ceps=40 # there is no dimensionality reduction. +--low-freq=40 # low cutoff frequency for mel bins +--high-freq=-200 # high cutoff frequency, relative to Nyquist of 8000 (=7800) diff --git a/egs/formosa/s5/conf/online_cmvn.conf b/egs/formosa/s5/conf/online_cmvn.conf new file mode 100644 index 00000000000..591367e7ae9 --- /dev/null +++ b/egs/formosa/s5/conf/online_cmvn.conf @@ -0,0 +1 @@ +# configuration file for apply-cmvn-online, used when invoking online2-wav-nnet3-latgen-faster. diff --git a/egs/formosa/s5/conf/pitch.conf b/egs/formosa/s5/conf/pitch.conf new file mode 100644 index 00000000000..e959a19d5b8 --- /dev/null +++ b/egs/formosa/s5/conf/pitch.conf @@ -0,0 +1 @@ +--sample-frequency=16000 diff --git a/egs/formosa/s5/local/chain/run_tdnn.sh b/egs/formosa/s5/local/chain/run_tdnn.sh new file mode 120000 index 00000000000..e1adaa9346d --- /dev/null +++ b/egs/formosa/s5/local/chain/run_tdnn.sh @@ -0,0 +1 @@ +tuning/run_tdnn_1d.sh \ No newline at end of file diff --git a/egs/formosa/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/formosa/s5/local/chain/tuning/run_tdnn_1a.sh new file mode 100755 index 00000000000..66c5ad3335f --- /dev/null +++ b/egs/formosa/s5/local/chain/tuning/run_tdnn_1a.sh @@ -0,0 +1,181 @@ +#!/bin/bash + +# This script is based on run_tdnn_7h.sh in the swbd chain recipe.
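Editor's note: the mfcc_hires.conf above keeps num-ceps equal to num-mel-bins (40), which is why its comment can claim the features carry the same information as filterbanks while being less correlated: with all cepstra kept, the MFCC step is just an orthonormal DCT of the log-mel energies, an invertible rotation. The sketch below demonstrates the point with scipy's DCT; it is a conceptual illustration, not Kaldi's feature extraction code (which adds details such as dithering and liftering).

    # With num-ceps == num-mel-bins the cepstra are an orthonormal DCT of the
    # log-mel energies, so the filterbank values can be recovered exactly.
    # Conceptual sketch only, using fake data.
    import numpy as np
    from scipy.fftpack import dct, idct

    rng = np.random.default_rng(0)
    log_mel = rng.standard_normal((10, 40))               # 10 frames of 40 log-mel energies
    mfcc = dct(log_mel, type=2, norm='ortho', axis=1)     # 40 cepstra per frame
    recovered = idct(mfcc, type=2, norm='ortho', axis=1)  # exact inverse transform
    print(np.allclose(recovered, log_mel))                # True: no information lost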
+ +set -e + +# configs for 'chain' +affix=1a +stage=0 +train_stage=-10 +get_egs_stage=-10 +dir=exp/chain/tdnn # Note: _sp will get added to this +decode_iter= + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=2 +num_jobs_final=12 +minibatch_size=128 +frames_per_eg=150,110,90 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 9 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 5000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 10 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=43 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=625 + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=625 + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=625 + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=625 + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=625 + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=625 + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=625 target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
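+  # For example, with the default xent_regularize=0.1 set in this script,
+  # learning_rate_factor comes out to 0.5 / 0.1 = 5.0.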
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=625 target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 11 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri5a_sp_lats \ + --use-gpu wait \ + --dir $dir || exit 1; +fi + +if [ $stage -le 12 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph +fi + +graph_dir=$dir/graph +if [ $stage -le 13 ]; then + for test_set in test eval; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 10 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_$test_set \ + $graph_dir data/${test_set}_hires $dir/decode_${test_set} || exit 1; + done + wait; +fi + +exit 0; diff --git a/egs/formosa/s5/local/chain/tuning/run_tdnn_1b.sh b/egs/formosa/s5/local/chain/tuning/run_tdnn_1b.sh new file mode 100755 index 00000000000..1981bb0530d --- /dev/null +++ b/egs/formosa/s5/local/chain/tuning/run_tdnn_1b.sh @@ -0,0 +1,188 @@ +#!/bin/bash + +# This script shows improvement arising from data cleaning. + +# CER: +# %WER 16.83 [ 36305 / 215718, 4772 ins, 10810 del, 20723 sub ] exp/chain/tdnn_1a_sp/decode_test/cer_9_0.0 +# %WER 16.44 [ 35459 / 215718, 4216 ins, 11278 del, 19965 sub ] exp/chain/tdnn_1b_sp/decode_test/cer_10_0.0 + +# steps/info/chain_dir_info.pl exp/chain/tdnn_1b_sp +# exp/chain/tdnn_1b_sp: num-iters=133 nj=2..12 num-params=12.5M dim=43+100->4528 combine=-0.073->-0.073 (over 2) xent:train/valid[87,132,final]=(-1.05,-0.964,-0.963/-1.10,-1.06,-1.05) logprob:train/valid[87,132,final]=(-0.079,-0.065,-0.065/-0.094,-0.092,-0.092) + +set -e + +# configs for 'chain' +affix=1b +nnet3_affix=_1b +stage=0 +train_stage=-10 +get_egs_stage=-10 +dir=exp/chain/tdnn # Note: _sp will get added to this +decode_iter= + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=2 +num_jobs_final=12 +minibatch_size=128 +frames_per_eg=150,110,90 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + +# End configuration section. 
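+#
+# Once both 1a and 1b have been decoded, the two systems can be compared with
+# the same pattern the result-collection stage of run.sh uses, e.g.:
+#   for d in exp/chain/tdnn_1a_sp exp/chain/tdnn_1b_sp; do
+#     grep WER $d/decode_test/cer_* | utils/best_wer.sh
+#   done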
+echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 9 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 5000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 10 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=43 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=625 + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=625 + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=625 + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=625 + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=625 + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=625 + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=625 target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=625 target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 11 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri5a_sp_lats \ + --use-gpu wait \ + --dir $dir || exit 1; +fi + +if [ $stage -le 12 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph +fi + +graph_dir=$dir/graph +if [ $stage -le 13 ]; then + for test_set in test eval; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 10 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_$test_set \ + $graph_dir data/${test_set}_hires $dir/decode_${test_set} || exit 1; + done + wait; +fi +exit 0; diff --git a/egs/formosa/s5/local/chain/tuning/run_tdnn_1c.sh b/egs/formosa/s5/local/chain/tuning/run_tdnn_1c.sh new file mode 100755 index 00000000000..6fa10344cfc --- /dev/null +++ b/egs/formosa/s5/local/chain/tuning/run_tdnn_1c.sh @@ -0,0 +1,191 @@ +#!/bin/bash + +# CER: +# %WER 16.44 [ 35459 / 215718, 4216 ins, 11278 del, 19965 sub ] exp/chain/tdnn_1b_sp/decode_test/cer_10_0.0 +# %WER 13.72 [ 29605 / 215718, 4678 ins, 8066 del, 16861 sub ] exp/chain/tdnn_1c_sp/decode_test/cer_10_0.0 + +# steps/info/chain_dir_info.pl exp/chain/tdnn_1c_sp +# exp/chain/tdnn_1c_sp: num-iters=147 nj=3..16 num-params=17.9M dim=43+100->4528 combine=-0.041->-0.041 (over 2) xent:train/valid[97,146,final]=(-0.845,-0.625,-0.618/-0.901,-0.710,-0.703) logprob:train/valid[97,146,final]=(-0.064,-0.040,-0.039/-0.072,-0.058,-0.057) + +set -e + +# configs for 'chain' +affix=1c +nnet3_affix=_1b +stage=0 +train_stage=-10 +get_egs_stage=-10 +dir=exp/chain/tdnn # Note: _sp will get added to this +decode_iter= + +# training options +num_epochs=6 +initial_effective_lrate=0.00025 +final_effective_lrate=0.000025 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=64 +frames_per_eg=150,110,90 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 +dropout_schedule='0,0@0.20,0.5@0.50,0' + +# End configuration section. 
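+# Note on dropout_schedule above: it is read as a piecewise-linear function of
+# the fraction of training completed, i.e. dropout proportion 0 until 20% of
+# training, rising to 0.5 at the 50% point and falling back to 0 by the end.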
+echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 9 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 5000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 10 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + affine_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true" + tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.01" + output_opts="l2-regularize=0.002" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=43 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $affine_opts dim=1536 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf14 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf15 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + linear-component name=prefinal-l dim=256 $linear_opts + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 11 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3$nnet3_affix/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + 
--chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --constrained false" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri5a_sp_lats \ + --use-gpu wait \ + --dir $dir || exit 1; +fi + +if [ $stage -le 12 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph +fi + +graph_dir=$dir/graph +if [ $stage -le 13 ]; then + for test_set in test eval; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 10 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3${nnet3_affix:+_$nnet3_affix}/ivectors_$test_set \ + $graph_dir data/${test_set}_hires $dir/decode_${test_set} || exit 1; + done + wait; +fi + +exit 0; diff --git a/egs/formosa/s5/local/chain/tuning/run_tdnn_1d.sh b/egs/formosa/s5/local/chain/tuning/run_tdnn_1d.sh new file mode 100755 index 00000000000..1f4b7e12850 --- /dev/null +++ b/egs/formosa/s5/local/chain/tuning/run_tdnn_1d.sh @@ -0,0 +1,190 @@ +#!/bin/bash + +# CER: +# 1a: %WER 16.83 [ 36305 / 215718, 4772 ins, 10810 del, 20723 sub ] exp/chain/tdnn_1a_sp/decode_test/cer_9_0.0 +# 1d: %WER 14.08 [ 30364 / 215718, 5182 ins, 7588 del, 17594 sub ] exp/chain/tdnn_1d_sp/decode_test/cer_9_0.0 + +# steps/info/chain_dir_info.pl exp/chain/tdnn_1d_sp +# exp/chain/tdnn_1d_sp: num-iters=157 nj=3..16 num-params=18.6M dim=43+100->5792 combine=-0.050->-0.050 (over 1) xent:train/valid[103,156,final]=(-0.977,-0.735,-0.725/-0.953,-0.772,-0.768) logprob:train/valid[103,156,final]=(-0.077,-0.052,-0.052/-0.079,-0.065,-0.066) + +set -e + +# configs for 'chain' +affix=1d +stage=0 +train_stage=-10 +get_egs_stage=-10 +dir=exp/chain/tdnn # Note: _sp will get added to this +decode_iter= + +# training options +num_epochs=6 +initial_effective_lrate=0.00025 +final_effective_lrate=0.000025 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=64 +frames_per_eg=150,110,90 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 +dropout_schedule='0,0@0.20,0.5@0.50,0' + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 9 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. 
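+  # (The 1a-1c tuning scripts build this tree with 5000 leaves; 1d uses 7000
+  # in the command below.)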
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 10 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + affine_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true" + tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.01" + output_opts="l2-regularize=0.002" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=43 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $affine_opts dim=1536 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf14 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf15 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + linear-component name=prefinal-l dim=256 $linear_opts + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 11 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3$nnet3_affix/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + 
--trainer.add-option="--optimization.memory-compression-level=2" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --constrained false" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri5a_sp_lats \ + --use-gpu wait \ + --dir $dir || exit 1; +fi + +if [ $stage -le 12 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph +fi + +graph_dir=$dir/graph +if [ $stage -le 13 ]; then + for test_set in test eval; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 10 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3${nnet3_affix:+_$nnet3_affix}/ivectors_$test_set \ + $graph_dir data/${test_set}_hires $dir/decode_${test_set} || exit 1; + done + wait; +fi + +exit 0; diff --git a/egs/formosa/s5/local/nnet3/run_ivector_common.sh b/egs/formosa/s5/local/nnet3/run_ivector_common.sh new file mode 100755 index 00000000000..723589ddd2e --- /dev/null +++ b/egs/formosa/s5/local/nnet3/run_ivector_common.sh @@ -0,0 +1,145 @@ +#!/bin/bash + +set -euo pipefail + +# This script is modified based on mini_librispeech/s5/local/nnet3/run_ivector_common.sh + +# This script is called from local/nnet3/run_tdnn.sh and +# local/chain/run_tdnn.sh (and may eventually be called by more +# scripts). It contains the common feature preparation and +# iVector-related parts of the script. See those scripts for examples +# of usage. + +stage=0 +train_set=train +test_sets="test eval" +gmm=tri5a + +nnet3_affix= + +. ./cmd.sh +. ./path.sh +. utils/parse_options.sh + +gmm_dir=exp/${gmm} +ali_dir=exp/${gmm}_sp_ali + +for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do + if [ ! -f $f ]; then + echo "$0: expected file $f to exist" + exit 1 + fi +done + +if [ $stage -le 1 ]; then + # Although the nnet will be trained by high resolution data, we still have to + # perturb the normal data to get the alignment _sp stands for speed-perturbed + echo "$0: preparing directory for low-resolution speed-perturbed data (for alignment)" + utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp + echo "$0: making MFCC features for low-resolution speed-perturbed data" + steps/make_mfcc_pitch.sh --cmd "$train_cmd" --nj 70 data/${train_set}_sp \ + exp/make_mfcc/${train_set}_sp mfcc_perturbed || exit 1; + steps/compute_cmvn_stats.sh data/${train_set}_sp \ + exp/make_mfcc/${train_set}_sp mfcc_perturbed || exit 1; + utils/fix_data_dir.sh data/${train_set}_sp +fi + +if [ $stage -le 2 ]; then + echo "$0: aligning with the perturbed low-resolution data" + steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ + data/${train_set}_sp data/lang $gmm_dir $ali_dir || exit 1 +fi + +if [ $stage -le 3 ]; then + # Create high-resolution MFCC features (with 40 cepstra instead of 13). 
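+  # (make_mfcc_pitch.sh below appends 3 pitch-related features to the 40
+  # cepstra, so the hires feature dimension ends up as 43, which is what the
+  # "input dim=43" lines in the nnet3/chain configs expect.)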
+ # this shows how you can split across multiple file-systems. + echo "$0: creating high-resolution MFCC features" + mfccdir=mfcc_perturbed_hires + + for datadir in ${train_set}_sp ${test_sets}; do + utils/copy_data_dir.sh data/$datadir data/${datadir}_hires + done + + # do volume-perturbation on the training data prior to extracting hires + # features; this helps make trained nnets more invariant to test data volume. + utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires || exit 1; + + for datadir in ${train_set}_sp ${test_sets}; do + steps/make_mfcc_pitch.sh --nj 10 --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${datadir}_hires exp/make_hires/$datadir $mfccdir || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_hires exp/make_hires/$datadir $mfccdir || exit 1; + utils/fix_data_dir.sh data/${datadir}_hires || exit 1; + # create MFCC data dir without pitch to extract iVector + utils/data/limit_feature_dim.sh 0:39 data/${datadir}_hires data/${datadir}_hires_nopitch || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_hires_nopitch exp/make_hires/$datadir $mfccdir || exit 1; + done +fi + +if [ $stage -le 4 ]; then + echo "$0: computing a subset of data to train the diagonal UBM." + # We'll use about a quarter of the data. + mkdir -p exp/nnet3${nnet3_affix}/diag_ubm + temp_data_root=exp/nnet3${nnet3_affix}/diag_ubm + + num_utts_total=$(wc -l $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=43 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=850 + relu-batchnorm-layer name=tdnn2 dim=850 input=Append(-1,0,2) + relu-batchnorm-layer name=tdnn3 dim=850 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn4 dim=850 input=Append(-7,0,2) + relu-batchnorm-layer name=tdnn5 dim=850 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn6 dim=850 + output-layer name=output input=tdnn6 dim=$num_targets max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 8 ]; then + steps/nnet3/train_dnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --cleanup.preserve-model-interval 500 \ + --use-gpu wait \ + --feat-dir=data/${train_set}_hires \ + --ali-dir $ali_dir \ + --lang data/lang \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 9 ]; then + # this version of the decoding treats each utterance separately + # without carrying forward speaker information. 
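+  # (num_jobs below is set to the number of distinct speakers in the test set,
+  # since the decoding split is per speaker and cannot use more jobs than
+  # there are speakers.)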
+ + for decode_set in test eval; do + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + decode_dir=${dir}/decode_$decode_set + steps/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $decode_dir || exit 1; + done + wait; +fi + +exit 0; diff --git a/egs/formosa/s5/local/prepare_data.sh b/egs/formosa/s5/local/prepare_data.sh new file mode 100755 index 00000000000..68f342e1549 --- /dev/null +++ b/egs/formosa/s5/local/prepare_data.sh @@ -0,0 +1,60 @@ +#!/bin/bash +# Copyright 2015-2016 Sarah Flora Juan +# Copyright 2016 Johns Hopkins University (Author: Yenda Trmal) +# Copyright 2018 Yuan-Fu Liao, National Taipei University of Technology +# AsusTek Computer Inc. (Author: Alex Hung) + +# Apache 2.0 + +set -e -o pipefail + +train_dir=NER-Trs-Vol1/Train +eval_dir=NER-Trs-Vol1-Eval +eval_key_dir=NER-Trs-Vol1-Eval-Key + +. ./path.sh +. parse_options.sh + +for x in $train_dir $eval_dir; do + if [ ! -d "$x" ] ; then + echo >&2 "The directory $x does not exist" + fi +done + +if [ -z "$(command -v dos2unix 2>/dev/null)" ]; then + echo "dos2unix not found on PATH. Please install it manually." + exit 1; +fi + +# have to remvoe previous files to avoid filtering speakers according to cmvn.scp and feats.scp +rm -rf data/all data/train data/test data/eval data/local/train +mkdir -p data/all data/train data/test data/eval data/local/train + + +# make utt2spk, wav.scp and text +find $train_dir -name *.wav -exec sh -c 'x={}; y=$(basename -s .wav $x); printf "%s %s\n" $y $y' \; | dos2unix > data/all/utt2spk +find $train_dir -name *.wav -exec sh -c 'x={}; y=$(basename -s .wav $x); printf "%s %s\n" $y $x' \; | dos2unix > data/all/wav.scp +find $train_dir -name *.txt -exec sh -c 'x={}; y=$(basename -s .txt $x); printf "%s " $y; cat $x' \; | dos2unix > data/all/text + +# fix_data_dir.sh fixes common mistakes (unsorted entries in wav.scp, +# duplicate entries and so on). Also, it regenerates the spk2utt from +# utt2spk +utils/fix_data_dir.sh data/all + +echo "Preparing train and test data" +# test set: JZ, GJ, KX, YX +grep -E "(JZ|GJ|KX|YX)_" data/all/utt2spk | awk '{print $1}' > data/all/cv.spk +utils/subset_data_dir_tr_cv.sh --cv-spk-list data/all/cv.spk data/all data/train data/test + +# for LM training +echo "cp data/train/text data/local/train/text for language model training" +cat data/train/text | awk '{$1=""}1;' | awk '{$1=$1}1;' > data/local/train/text + +# preparing EVAL set. +find $eval_dir -name *.wav -exec sh -c 'x={}; y=$(basename -s .wav $x); printf "%s %s\n" $y $y' \; | dos2unix > data/eval/utt2spk +find $eval_dir -name *.wav -exec sh -c 'x={}; y=$(basename -s .wav $x); printf "%s %s\n" $y $x' \; | dos2unix > data/eval/wav.scp +find $eval_key_dir -name *.txt -exec sh -c 'x={}; y=$(basename -s .txt $x); printf "%s " $y; cat $x' \; | dos2unix > data/eval/text +utils/fix_data_dir.sh data/eval + +echo "Data preparation completed." 
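+# A sketch of what the prepared files look like (the utterance id and path are
+# hypothetical):
+#   data/all/wav.scp:  SPK001_0001 NER-Trs-Vol1/Train/.../SPK001_0001.wav
+#   data/all/text:     SPK001_0001 followed by the transcript words
+# Note that utt2spk maps every utterance to itself, i.e. each utterance is
+# treated as its own speaker.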
+exit 0; diff --git a/egs/formosa/s5/local/prepare_dict.sh b/egs/formosa/s5/local/prepare_dict.sh new file mode 100755 index 00000000000..4e580f5f6e8 --- /dev/null +++ b/egs/formosa/s5/local/prepare_dict.sh @@ -0,0 +1,55 @@ +#!/bin/bash +# Copyright 2015-2016 Sarah Flora Juan +# Copyright 2016 Johns Hopkins University (Author: Yenda Trmal) +# Copyright 2018 Yuan-Fu Liao, National Taipei University of Technology +# Apache 2.0 + +source_dir=NER-Trs-Vol1/Language +dict_dir=data/local/dict +rm -rf $dict_dir +mkdir -p $dict_dir + +# +# +# +rm -f $dict_dir/lexicon.txt +touch $dict_dir/lexicon.txt +cat $source_dir/lexicon.txt > $dict_dir/lexicon.txt +echo " SIL" >> $dict_dir/lexicon.txt + +# +# define silence phone +# +rm -f $dict_dir/silence_phones.txt +touch $dict_dir/silence_phones.txt + +echo "SIL" > $dict_dir/silence_phones.txt + +# +# find nonsilence phones +# +rm -f $dict_dir/nonsilence_phones.txt +touch $dict_dir/nonsilence_phones.txt + +cat $source_dir/lexicon.txt | grep -v -F -f $dict_dir/silence_phones.txt | \ + perl -ane 'print join("\n", @F[1..$#F]) . "\n"; ' | \ + sort -u > $dict_dir/nonsilence_phones.txt + +# +# add optional silence phones +# + +rm -f $dict_dir/optional_silence.txt +touch $dict_dir/optional_silence.txt +echo "SIL" > $dict_dir/optional_silence.txt + +# +# extra questions +# +rm -f $dict_dir/extra_questions.txt +touch $dict_dir/extra_questions.txt +cat $dict_dir/silence_phones.txt | awk '{printf("%s ", $1);} END{printf "\n";}' > $dict_dir/extra_questions.txt || exit 1; +cat $dict_dir/nonsilence_phones.txt | awk '{printf("%s ", $1);} END{printf "\n";}' >> $dict_dir/extra_questions.txt || exit 1; + +echo "Dictionary preparation succeeded" +exit 0; diff --git a/egs/formosa/s5/local/prepare_lm.sh b/egs/formosa/s5/local/prepare_lm.sh new file mode 100755 index 00000000000..59fe1529658 --- /dev/null +++ b/egs/formosa/s5/local/prepare_lm.sh @@ -0,0 +1,42 @@ +#!/bin/bash +# Copyright 2015-2016 Sarah Flora Juan +# Copyright 2016 Johns Hopkins University (Author: Yenda Trmal) +# Apache 2.0 + +set -e -o pipefail + +# To create G.fst from ARPA language model +. 
./path.sh || die "path.sh expected"; + +local/train_lms_srilm.sh --train-text data/train/text data/ data/srilm + +#nl -nrz -w10 corpus/LM/iban-bp-2012.txt | utils/shuffle_list.pl > data/local/external_text +local/train_lms_srilm.sh --train-text data/local/external_text data/ data/srilm_external + +# let's do ngram interpolation of the previous two LMs +# the lm.gz is always symlink to the model with the best perplexity, so we use that + +mkdir -p data/srilm_interp +for w in 0.9 0.8 0.7 0.6 0.5; do + ngram -lm data/srilm/lm.gz -mix-lm data/srilm_external/lm.gz \ + -lambda $w -write-lm data/srilm_interp/lm.${w}.gz + echo -n "data/srilm_interp/lm.${w}.gz " + ngram -lm data/srilm_interp/lm.${w}.gz -ppl data/srilm/dev.txt | paste -s - +done | sort -k15,15g > data/srilm_interp/perplexities.txt + +# for basic decoding, let's use only a trigram LM +[ -d data/lang_test/ ] && rm -rf data/lang_test +cp -R data/lang data/lang_test +lm=$(cat data/srilm/perplexities.txt | grep 3gram | head -n1 | awk '{print $1}') +local/arpa2G.sh $lm data/lang_test data/lang_test + +# for decoding using bigger LM let's find which interpolated gave the most improvement +[ -d data/lang_big ] && rm -rf data/lang_big +cp -R data/lang data/lang_big +lm=$(cat data/srilm_interp/perplexities.txt | head -n1 | awk '{print $1}') +local/arpa2G.sh $lm data/lang_big data/lang_big + +# for really big lm, we should only decode using small LM +# and resocre using the big lm +utils/build_const_arpa_lm.sh $lm data/lang_big data/lang_big +exit 0; diff --git a/egs/formosa/s5/local/run_cleanup_segmentation.sh b/egs/formosa/s5/local/run_cleanup_segmentation.sh new file mode 100755 index 00000000000..b72cd89b4d1 --- /dev/null +++ b/egs/formosa/s5/local/run_cleanup_segmentation.sh @@ -0,0 +1,66 @@ +#!/bin/bash + +# Copyright 2016 Vimal Manohar +# 2016 Johns Hopkins University (author: Daniel Povey) +# 2017 Nagendra Kumar Goel +# 2019 AsusTek Computer Inc. (author: Alex Hung) +# Apache 2.0 + +# This script demonstrates how to re-segment training data selecting only the +# "good" audio that matches the transcripts. +# The basic idea is to decode with an existing in-domain acoustic model, and a +# biased language model built from the reference, and then work out the +# segmentation from a ctm like file. + +# For nnet3 and chain results after cleanup, see the scripts in +# local/nnet3/run_tdnn.sh and local/chain/run_tdnn.sh + +# GMM Results for speaker-independent (SI) and speaker adaptive training (SAT) systems on dev and test sets +# [will add these later]. + +set -e +set -o pipefail +set -u + +stage=0 +cleanup_stage=0 +data=data/train +cleanup_affix=cleaned +srcdir=exp/tri5a +langdir=data/lang_test +nj=20 +decode_nj=20 +decode_num_threads=1 + +. ./cmd.sh +if [ -f ./path.sh ]; then . ./path.sh; fi +. utils/parse_options.sh + +cleaned_data=${data}_${cleanup_affix} + +dir=${srcdir}_${cleanup_affix}_work +cleaned_dir=${srcdir}_${cleanup_affix} + +if [ $stage -le 1 ]; then + # This does the actual data cleanup. 
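+  # It writes a segmentation work directory ($dir) and a cleaned-up copy of
+  # the training data ($cleaned_data); the exp/tri5a_cleaned system reported
+  # in RESULTS is trained in stage 3 below on that cleaned copy.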
+ steps/cleanup/clean_and_segment_data.sh --stage $cleanup_stage \ + --nj $nj --cmd "$train_cmd" \ + $data $langdir $srcdir $dir $cleaned_data +fi + +if [ $stage -le 2 ]; then + steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ + $cleaned_data $langdir $srcdir ${srcdir}_ali_${cleanup_affix} +fi + +if [ $stage -le 3 ]; then + steps/train_sat.sh --cmd "$train_cmd" \ + 3500 100000 $cleaned_data $langdir ${srcdir}_ali_${cleanup_affix} ${cleaned_dir} +fi + +utils/data/get_utt2dur.sh data/train_cleaned +ori_avg_dur=$(awk 'BEGIN{total=0}{total += $2}END{printf("%.2f", total/NR)}' ${data}/utt2dur) +new_avg_dur=$(awk 'BEGIN{total=0}{total += $2}END{printf("%.2f", total/NR)}' ${cleaned_data}/utt2dur) +echo "average duration was reduced from ${ori_avg_dur}s to ${new_avg_dur}s." +# average duration was reduced from 21.68s to 10.97s. +exit 0; diff --git a/egs/formosa/s5/local/score.sh b/egs/formosa/s5/local/score.sh new file mode 100755 index 00000000000..a9786169973 --- /dev/null +++ b/egs/formosa/s5/local/score.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +set -e -o pipefail +set -x +steps/score_kaldi.sh "$@" +steps/scoring/score_kaldi_cer.sh --stage 2 "$@" + +echo "$0: Done" diff --git a/egs/formosa/s5/local/train_lms.sh b/egs/formosa/s5/local/train_lms.sh new file mode 100755 index 00000000000..efc5b92c573 --- /dev/null +++ b/egs/formosa/s5/local/train_lms.sh @@ -0,0 +1,63 @@ +#!/bin/bash + + +# To be run from one directory above this script. +. ./path.sh + +text=data/local/train/text +lexicon=data/local/dict/lexicon.txt + +for f in "$text" "$lexicon"; do + [ ! -f $x ] && echo "$0: No such file $f" && exit 1; +done + +# This script takes no arguments. It assumes you have already run +# aishell_data_prep.sh. +# It takes as input the files +# data/local/train/text +# data/local/dict/lexicon.txt +dir=data/local/lm +mkdir -p $dir + +kaldi_lm=`which train_lm.sh` +if [ -z $kaldi_lm ]; then + echo "$0: train_lm.sh is not found. That might mean it's not installed" + echo "$0: or it is not added to PATH" + echo "$0: Use the script tools/extra/install_kaldi_lm.sh to install it" + exit 1 +fi + +cleantext=$dir/text.no_oov + +cat $text | awk -v lex=$lexicon 'BEGIN{while((getline0){ seen[$1]=1; } } + {for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf(" ");} } printf("\n");}' \ + > $cleantext || exit 1; + +cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \ + sort -nr > $dir/word.counts || exit 1; + +# Get counts from acoustic training transcripts, and add one-count +# for each word in the lexicon (but not silence, we don't want it +# in the LM-- we'll add it optionally later). +cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \ + cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \ + sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1; + +# note: we probably won't really make use of as there aren't any OOVs +cat $dir/unigram.counts | awk '{print $2}' | get_word_map.pl "" "" "" > $dir/word_map \ + || exit 1; + +# note: ignore 1st field of train.txt, it's the utterance-id. +cat $cleantext | awk -v wmap=$dir/word_map 'BEGIN{while((getline0)map[$1]=$2;} + { for(n=2;n<=NF;n++) { printf map[$n]; if(n$dir/train.gz \ + || exit 1; + +train_lm.sh --arpa --lmtype 3gram-mincount $dir || exit 1; + +# LM is small enough that we don't need to prune it (only about 0.7M N-grams). 
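+# (If a much larger text corpus were ever used, the LM could be pruned with
+# the prune_lm.sh tool from the same kaldi_lm package, roughly:
+#   prune_lm.sh --arpa 3.0 $dir/3gram-mincount/
+# where the threshold 3.0 is only an illustrative value.)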
+# Perplexity over 128254.000000 words is 90.446690 + +# note: output is +# data/local/lm/3gram-mincount/lm_unpruned.gz + +exit 0; diff --git a/egs/formosa/s5/local/wer_hyp_filter b/egs/formosa/s5/local/wer_hyp_filter new file mode 100755 index 00000000000..519d92ee80d --- /dev/null +++ b/egs/formosa/s5/local/wer_hyp_filter @@ -0,0 +1,19 @@ +#!/usr/bin/env perl + +@filters=(''); + +foreach $w (@filters) { + $bad{$w} = 1; +} + +while() { + @A = split(" ", $_); + $id = shift @A; + print "$id "; + foreach $a (@A) { + if (!defined $bad{$a}) { + print "$a "; + } + } + print "\n"; +} diff --git a/egs/formosa/s5/local/wer_output_filter b/egs/formosa/s5/local/wer_output_filter new file mode 100755 index 00000000000..06a99a43e34 --- /dev/null +++ b/egs/formosa/s5/local/wer_output_filter @@ -0,0 +1,25 @@ +#!/usr/bin/env perl +# Copyright 2012-2014 Johns Hopkins University (Author: Yenda Trmal) +# Apache 2.0 +use utf8; + +use open qw(:encoding(utf8)); +binmode STDIN, ":utf8"; +binmode STDOUT, ":utf8"; +binmode STDERR, ":utf8"; + +while (<>) { + @F = split " "; + print $F[0] . " "; + foreach $s (@F[1..$#F]) { + if (($s =~ /\[.*\]/) || ($s =~ /\<.*\>/) || ($s =~ "")) { + print ""; + } else { + print "$s" + } + print " "; + } + print "\n"; +} + + diff --git a/egs/formosa/s5/local/wer_ref_filter b/egs/formosa/s5/local/wer_ref_filter new file mode 100755 index 00000000000..519d92ee80d --- /dev/null +++ b/egs/formosa/s5/local/wer_ref_filter @@ -0,0 +1,19 @@ +#!/usr/bin/env perl + +@filters=(''); + +foreach $w (@filters) { + $bad{$w} = 1; +} + +while() { + @A = split(" ", $_); + $id = shift @A; + print "$id "; + foreach $a (@A) { + if (!defined $bad{$a}) { + print "$a "; + } + } + print "\n"; +} diff --git a/egs/formosa/s5/path.sh b/egs/formosa/s5/path.sh new file mode 100755 index 00000000000..2d17b17a84a --- /dev/null +++ b/egs/formosa/s5/path.sh @@ -0,0 +1,6 @@ +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C diff --git a/egs/formosa/s5/run.sh b/egs/formosa/s5/run.sh new file mode 100755 index 00000000000..a4d0f2dcd1d --- /dev/null +++ b/egs/formosa/s5/run.sh @@ -0,0 +1,217 @@ +#!/bin/bash +# +# Copyright 2018, Yuan-Fu Liao, National Taipei University of Technology, yfliao@mail.ntut.edu.tw +# +# Before you run this recipe, please apply, download and put or make a link of the corpus under this folder (folder name: "NER-Trs-Vol1"). +# For more detail, please check: +# 1. Formosa Speech in the Wild (FSW) project (https://sites.google.com/speech.ntut.edu.tw/fsw/home/corpus) +# 2. Formosa Speech Recognition Challenge (FSW) 2018 (https://sites.google.com/speech.ntut.edu.tw/fsw/home/challenge) +stage=-2 +num_jobs=20 + +train_dir=NER-Trs-Vol1/Train +eval_dir=NER-Trs-Vol1-Eval +eval_key_dir=NER-Trs-Vol1-Eval-Key + +# shell options +set -eo pipefail + +. ./cmd.sh +. 
./utils/parse_options.sh + +# configure number of jobs running in parallel, you should adjust these numbers according to your machines +# data preparation +if [ $stage -le -2 ]; then + # Lexicon Preparation, + echo "$0: Lexicon Preparation" + local/prepare_dict.sh || exit 1; + + # Data Preparation + echo "$0: Data Preparation" + local/prepare_data.sh --train-dir $train_dir --eval-dir $eval_dir --eval-key-dir $eval_key_dir || exit 1; + + # Phone Sets, questions, L compilation + echo "$0: Phone Sets, questions, L compilation Preparation" + rm -rf data/lang + utils/prepare_lang.sh --position-dependent-phones false data/local/dict \ + "" data/local/lang data/lang || exit 1; + + # LM training + echo "$0: LM training" + rm -rf data/local/lm/3gram-mincount + local/train_lms.sh || exit 1; + + # G compilation, check LG composition + echo "$0: G compilation, check LG composition" + utils/format_lm.sh data/lang data/local/lm/3gram-mincount/lm_unpruned.gz \ + data/local/dict/lexicon.txt data/lang_test || exit 1; + +fi + +# Now make MFCC plus pitch features. +# mfccdir should be some place with a largish disk where you +# want to store MFCC features. +mfccdir=mfcc + +# mfcc +if [ $stage -le -1 ]; then + echo "$0: making mfccs" + for x in train test eval; do + steps/make_mfcc_pitch.sh --cmd "$train_cmd" --nj $num_jobs data/$x exp/make_mfcc/$x $mfccdir || exit 1; + steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir || exit 1; + utils/fix_data_dir.sh data/$x || exit 1; + done +fi + +# mono +if [ $stage -le 0 ]; then + echo "$0: train mono model" + # Make some small data subsets for early system-build stages. + echo "$0: make training subsets" + utils/subset_data_dir.sh --shortest data/train 3000 data/train_mono + + # train mono + steps/train_mono.sh --boost-silence 1.25 --cmd "$train_cmd" --nj $num_jobs \ + data/train_mono data/lang exp/mono || exit 1; + + # Get alignments from monophone system. 
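+  # (Alignment runs on the full data/train, not on the 3000-utterance
+  # data/train_mono subset that was used for monophone training above.)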
+ steps/align_si.sh --boost-silence 1.25 --cmd "$train_cmd" --nj $num_jobs \ + data/train data/lang exp/mono exp/mono_ali || exit 1; + + # Monophone decoding + ( + utils/mkgraph.sh data/lang_test exp/mono exp/mono/graph || exit 1; + steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj $num_jobs \ + exp/mono/graph data/test exp/mono/decode_test + )& +fi + +# tri1 +if [ $stage -le 1 ]; then + echo "$0: train tri1 model" + # train tri1 [first triphone pass] + steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" \ + 2500 20000 data/train data/lang exp/mono_ali exp/tri1 || exit 1; + + # align tri1 + steps/align_si.sh --cmd "$train_cmd" --nj $num_jobs \ + data/train data/lang exp/tri1 exp/tri1_ali || exit 1; + + # decode tri1 + ( + utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph || exit 1; + steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj $num_jobs \ + exp/tri1/graph data/test exp/tri1/decode_test + )& +fi + +# tri2 +if [ $stage -le 2 ]; then + echo "$0: train tri2 model" + # train tri2 [delta+delta-deltas] + steps/train_deltas.sh --cmd "$train_cmd" \ + 2500 20000 data/train data/lang exp/tri1_ali exp/tri2 || exit 1; + + # align tri2b + steps/align_si.sh --cmd "$train_cmd" --nj $num_jobs \ + data/train data/lang exp/tri2 exp/tri2_ali || exit 1; + + # decode tri2 + ( + utils/mkgraph.sh data/lang_test exp/tri2 exp/tri2/graph + steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj $num_jobs \ + exp/tri2/graph data/test exp/tri2/decode_test + )& +fi + +# tri3a +if [ $stage -le 3 ]; then + echo "$-: train tri3 model" + # Train tri3a, which is LDA+MLLT, + steps/train_lda_mllt.sh --cmd "$train_cmd" \ + 2500 20000 data/train data/lang exp/tri2_ali exp/tri3a || exit 1; + + # decode tri3a + ( + utils/mkgraph.sh data/lang_test exp/tri3a exp/tri3a/graph || exit 1; + steps/decode.sh --cmd "$decode_cmd" --nj $num_jobs --config conf/decode.config \ + exp/tri3a/graph data/test exp/tri3a/decode_test + )& +fi + +# tri4 +if [ $stage -le 4 ]; then + echo "$0: train tri4 model" + # From now, we start building a more serious system (with SAT), and we'll + # do the alignment with fMLLR. + steps/align_fmllr.sh --cmd "$train_cmd" --nj $num_jobs \ + data/train data/lang exp/tri3a exp/tri3a_ali || exit 1; + + steps/train_sat.sh --cmd "$train_cmd" \ + 2500 20000 data/train data/lang exp/tri3a_ali exp/tri4a || exit 1; + + # align tri4a + steps/align_fmllr.sh --cmd "$train_cmd" --nj $num_jobs \ + data/train data/lang exp/tri4a exp/tri4a_ali + + # decode tri4a + ( + utils/mkgraph.sh data/lang_test exp/tri4a exp/tri4a/graph + steps/decode_fmllr.sh --cmd "$decode_cmd" --nj $num_jobs --config conf/decode.config \ + exp/tri4a/graph data/test exp/tri4a/decode_test + )& +fi + +# tri5 +if [ $stage -le 5 ]; then + echo "$0: train tri5 model" + # Building a larger SAT system. 
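+  # (3500 leaves / 100k Gaussians here, versus 2500 / 20000 for tri4a above;
+  # tri5a is also the GMM that the cleanup and chain recipes build on.)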
+ steps/train_sat.sh --cmd "$train_cmd" \ + 3500 100000 data/train data/lang exp/tri4a_ali exp/tri5a || exit 1; + + # align tri5a + steps/align_fmllr.sh --cmd "$train_cmd" --nj $num_jobs \ + data/train data/lang exp/tri5a exp/tri5a_ali || exit 1; + + # decode tri5 + ( + utils/mkgraph.sh data/lang_test exp/tri5a exp/tri5a/graph || exit 1; + steps/decode_fmllr.sh --cmd "$decode_cmd" --nj $num_jobs --config conf/decode.config \ + exp/tri5a/graph data/test exp/tri5a/decode_test || exit 1; + )& +fi + +# nnet3 tdnn models +# commented out by default, since the chain model is usually faster and better +#if [ $stage -le 6 ]; then + # echo "$0: train nnet3 model" + # local/nnet3/run_tdnn.sh +#fi + +# chain model +if [ $stage -le 7 ]; then + # The iVector-extraction and feature-dumping parts coulb be skipped by setting "--train_stage 7" + echo "$0: train chain model" + local/chain/run_tdnn.sh +fi + +# getting results (see RESULTS file) +if [ $stage -le 8 ]; then + echo "$0: extract the results" + for test_set in test eval; do + echo "WER: $test_set" + for x in exp/*/decode_${test_set}*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done 2>/dev/null + for x in exp/*/*/decode_${test_set}*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done 2>/dev/null + echo + + echo "CER: $test_set" + for x in exp/*/decode_${test_set}*; do [ -d $x ] && grep WER $x/cer_* | utils/best_wer.sh; done 2>/dev/null + for x in exp/*/*/decode_${test_set}*; do [ -d $x ] && grep WER $x/cer_* | utils/best_wer.sh; done 2>/dev/null + echo + done +fi + +# finish +echo "$0: all done" + +exit 0; diff --git a/egs/formosa/s5/steps b/egs/formosa/s5/steps new file mode 120000 index 00000000000..6e99bf5b5ad --- /dev/null +++ b/egs/formosa/s5/steps @@ -0,0 +1 @@ +../../wsj/s5/steps \ No newline at end of file diff --git a/egs/formosa/s5/utils b/egs/formosa/s5/utils new file mode 120000 index 00000000000..b240885218f --- /dev/null +++ b/egs/formosa/s5/utils @@ -0,0 +1 @@ +../../wsj/s5/utils \ No newline at end of file diff --git a/egs/gale_arabic/s5/local/gale_format_data.sh b/egs/gale_arabic/s5/local/gale_format_data.sh index 85a946a58d9..053323dc194 100755 --- a/egs/gale_arabic/s5/local/gale_format_data.sh +++ b/egs/gale_arabic/s5/local/gale_format_data.sh @@ -57,4 +57,4 @@ fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \ echo gale_format_data succeeded. -exit 0 \ No newline at end of file +exit 0 diff --git a/egs/gale_arabic/s5/local/gale_prep_dict.sh b/egs/gale_arabic/s5/local/gale_prep_dict.sh index 74ef789eda7..f6fd83378d0 100755 --- a/egs/gale_arabic/s5/local/gale_prep_dict.sh +++ b/egs/gale_arabic/s5/local/gale_prep_dict.sh @@ -25,9 +25,8 @@ echo SIL > $dir/optional_silence.txt cat $dir/lexicon.txt | cut -d ' ' -f2- | tr -s ' ' '\n' |\ sort -u > $dir/nonsilence_phones.txt || exit 1; +perl -i -pe 'print " SIL\n" if $.==1' $dir/lexicon.txt - sed -i '1i SIL' $dir/lexicon.txt - echo Dictionary preparation succeeded exit 0 diff --git a/egs/gale_arabic/s5/local/gale_train_lms.sh b/egs/gale_arabic/s5/local/gale_train_lms.sh index 1b5d4665a19..8f8e715390f 100755 --- a/egs/gale_arabic/s5/local/gale_train_lms.sh +++ b/egs/gale_arabic/s5/local/gale_train_lms.sh @@ -113,4 +113,4 @@ fi echo train lm succeeded -exit 0 \ No newline at end of file +exit 0 diff --git a/egs/gale_arabic/s5/local/run_sgmm.sh b/egs/gale_arabic/s5/local/run_sgmm.sh index f9ba9b193a8..a5d32d18038 100755 --- a/egs/gale_arabic/s5/local/run_sgmm.sh +++ b/egs/gale_arabic/s5/local/run_sgmm.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/bash . 
./path.sh @@ -10,17 +10,17 @@ nDecodeJobs=40 galeData=GALE mfccdir=mfcc - -if [[ ! -d exp/tri3b_ali ]]; then + +if [[ ! -d exp/tri3b_ali ]]; then echo "exp/tri3b_ali lattices are required for alignmnet" - exit 1 + exit 1 fi ## SGMM (subspace gaussian mixture model), excluding the "speaker-dependent weights" steps/train_ubm.sh --cmd "$train_cmd" 700 \ data/train data/lang exp/tri3b_ali exp/ubm5a || exit 1; - + steps/train_sgmm2.sh --cmd "$train_cmd" 5000 20000 data/train data/lang exp/tri3b_ali \ exp/ubm5a/final.ubm exp/sgmm_5a || exit 1; @@ -38,16 +38,16 @@ steps/align_sgmm2.sh --nj $nJobs --cmd "$train_cmd" --transform-dir exp/tri3b_al steps/make_denlats_sgmm2.sh --nj $nJobs --sub-split 30 --beam 9.0 --lattice-beam 6 \ --cmd "$decode_cmd" --transform-dir \ exp/tri3b_ali data/train data/lang exp/sgmm_5a_ali exp/sgmm_5a_denlats || exit 1; - + steps/train_mmi_sgmm2.sh --cmd "$train_cmd" --num-iters 8 --transform-dir exp/tri3b_ali --boost 0.1 \ data/train data/lang exp/sgmm_5a exp/sgmm_5a_denlats exp/sgmm_5a_mmi_b0.1 - + #decode SGMM MMI utils/mkgraph.sh data/lang_test exp/sgmm_5a_mmi_b0.1 exp/sgmm_5a_mmi_b0.1/graph steps/decode_sgmm2.sh --nj $nDecodeJobs --cmd "$decode_cmd" \ --config conf/decode.config --transform-dir exp/tri3b/decode \ exp/sgmm_5a_mmi_b0.1/graph data/test exp/sgmm_5a_mmi_b0.1/decode - + for n in 1 2 3 4; do steps/decode_sgmm2_rescore.sh --cmd "$decode_cmd" --iter $n \ --transform-dir exp/tri3b/decode data/lang_test data/test \ diff --git a/egs/gale_arabic/s5b/RESULTS b/egs/gale_arabic/s5b/RESULTS index 2260a106654..a485240ff6b 100644 --- a/egs/gale_arabic/s5b/RESULTS +++ b/egs/gale_arabic/s5b/RESULTS @@ -2,13 +2,7 @@ # This file is generated using local/split_wer.sh $galeData //galeData is a local folder to keep intermediate gale data # look at the end of run.sh in the same folder ## -##### RESULTS generated by amali at 2017-01-01-08-05-59 - Report Results WER: -%WER 9.50 [ 2124 / 22363, 160 ins, 275 del, 1689 sub ] exp/chain_cleaned/tdnn_lstm1a_sp_bi/decode/wer_report_9 -%WER 10.72 [ 2398 / 22363, 163 ins, 313 del, 1922 sub ] exp/chain_cleaned/tdnn1b_sp_bi/decode/wer_report_9 -%WER 12.04 [ 2693 / 22363, 226 ins, 271 del, 2196 sub ] exp/nnet3_cleaned/lstm_ld5_sp/decode/wer_report_9 -%WER 12.29 [ 2749 / 22363, 273 ins, 266 del, 2210 sub ] exp/nnet3_cleaned/tdnn_sp/decode/wer_report_10 %WER 17.82 [ 3986 / 22363, 315 ins, 618 del, 3053 sub ] exp/sgmm_5a_mmi_b0.1/decode/wer_report_12 %WER 18.15 [ 4059 / 22363, 335 ins, 589 del, 3135 sub ] exp/sgmm_5a_mmi_b0.1/decode4/wer_report_11 %WER 18.42 [ 4119 / 22363, 346 ins, 590 del, 3183 sub ] exp/sgmm_5a_mmi_b0.1/decode3/wer_report_11 @@ -27,10 +21,6 @@ Report Results WER: %WER 25.66 [ 5738 / 22363, 478 ins, 838 del, 4422 sub ] exp/tri2a/decode/wer_report_14 %WER 26.38 [ 5900 / 22363, 435 ins, 929 del, 4536 sub ] exp/tri1/decode/wer_report_15 Conversational Results WER: -%WER 21.59 [ 10213 / 47305, 944 ins, 3092 del, 6177 sub ] exp/chain_cleaned/tdnn_lstm1a_sp_bi/decode/wer_conversational_9 -%WER 24.77 [ 11716 / 47305, 1098 ins, 3579 del, 7039 sub ] exp/chain_cleaned/tdnn1b_sp_bi/decode/wer_conversational_9 -%WER 26.78 [ 12670 / 47305, 1741 ins, 2434 del, 8495 sub ] exp/nnet3_cleaned/lstm_ld5_sp/decode/wer_conversational_9 -%WER 27.55 [ 13032 / 47305, 1800 ins, 2666 del, 8566 sub ] exp/nnet3_cleaned/tdnn_sp/decode/wer_conversational_11 %WER 34.10 [ 16133 / 47305, 1903 ins, 3245 del, 10985 sub ] exp/sgmm_5a_mmi_b0.1/decode/wer_conversational_11 %WER 34.81 [ 16466 / 47305, 2077 ins, 3037 del, 11352 sub ] 
exp/sgmm_5a_mmi_b0.1/decode4/wer_conversational_10 %WER 35.19 [ 16648 / 47305, 1933 ins, 3264 del, 11451 sub ] exp/sgmm_5a_mmi_b0.1/decode3/wer_conversational_11 @@ -49,10 +39,6 @@ Conversational Results WER: %WER 45.92 [ 21724 / 47305, 1995 ins, 5213 del, 14516 sub ] exp/tri2a/decode/wer_conversational_14 %WER 46.86 [ 22166 / 47305, 2212 ins, 4819 del, 15135 sub ] exp/tri1/decode/wer_conversational_13 Combined Results for Reports and Conversational WER: -%WER 17.64 [ 12286 / 69668, 1310 ins, 2807 del, 8169 sub ] exp/chain_cleaned/tdnn_lstm1a_sp_bi/decode/wer_8 -%WER 20.26 [ 14114 / 69668, 1261 ins, 3892 del, 8961 sub ] exp/chain_cleaned/tdnn1b_sp_bi/decode/wer_9 -%WER 22.05 [ 15363 / 69668, 1967 ins, 2705 del, 10691 sub ] exp/nnet3_cleaned/lstm_ld5_sp/decode/wer_9 -%WER 22.66 [ 15786 / 69668, 2047 ins, 2955 del, 10784 sub ] exp/nnet3_cleaned/tdnn_sp/decode/wer_11 %WER 28.89 [ 20127 / 69668, 2244 ins, 3829 del, 14054 sub ] exp/sgmm_5a_mmi_b0.1/decode/wer_11 %WER 29.48 [ 20541 / 69668, 2243 ins, 3860 del, 14438 sub ] exp/sgmm_5a_mmi_b0.1/decode4/wer_11 %WER 29.81 [ 20767 / 69668, 2279 ins, 3854 del, 14634 sub ] exp/sgmm_5a_mmi_b0.1/decode3/wer_11 @@ -65,8 +51,30 @@ Combined Results for Reports and Conversational WER: %WER 32.36 [ 22542 / 69668, 2156 ins, 4184 del, 16202 sub ] exp/tri2b_mmi/decode_it4/wer_11 %WER 32.50 [ 22640 / 69668, 2393 ins, 3956 del, 16291 sub ] exp/tri2b_mmi/decode_it3/wer_11 %WER 32.79 [ 22847 / 69668, 2407 ins, 4760 del, 15680 sub ] exp/tri2b_mpe/decode_it3/wer_13 +# WER with train_sat_basis +%WER 33.35 [ 23233 / 69668, 2385 ins, 5274 del, 15574 sub ] exp/tri3b/decode/wer_16_0.5 +# WER with train_sat %WER 33.61 [ 23413 / 69668, 2817 ins, 4577 del, 16019 sub ] exp/tri3b/decode/wer_17 %WER 35.73 [ 24894 / 69668, 2630 ins, 4944 del, 17320 sub ] exp/tri3b/decode.si/wer_15 %WER 36.17 [ 25196 / 69668, 2429 ins, 5393 del, 17374 sub ] exp/tri2b/decode/wer_16 %WER 39.42 [ 27462 / 69668, 2473 ins, 6051 del, 18938 sub ] exp/tri2a/decode/wer_14 %WER 40.35 [ 28113 / 69668, 2713 ins, 5635 del, 19765 sub ] exp/tri1/decode/wer_13 + + +# Effect of GMM seed model (tri2b instead of tri3b). Using tri3b give a slightly better result +# as compared to using tri2b as seed. +%WER 16.66 [ 11610 / 69668, 1233 ins, 2747 del, 7630 sub ] exp/chain/tdnn_1a_3b_sp/decode_test/wer_10_0.0 +%WER 16.71 [ 11642 / 69668, 1145 ins, 2908 del, 7589 sub ] exp/chain/tdnn_1a_2b_sp/decode_test/wer_9_0.0 + +# Effect of Tree-size (3500, 4500, 7000, 11000) +%WER 16.66 [ 11610 / 69668, 1233 ins, 2747 del, 7630 sub ] exp/chain/tdnn_1a_3500_sp/decode_test/wer_10_0.0 +%WER 16.59 [ 11557 / 69668, 1234 ins, 2646 del, 7677 sub ] exp/chain/tdnn_1a_4500_sp/decode_test/wer_10_0.0 +%WER 16.47 [ 11474 / 69668, 1421 ins, 2207 del, 7846 sub ] exp/chain/tdnn_1a_7000_sp/decode_test/wer_9_0.0 +%WER 16.62 [ 11580 / 69668, 1164 ins, 2789 del, 7627 sub ] exp/chain/tdnn_1a_11000_sp/decode_test/wer_10_0.0 + +# Effect of l2-regularization on the output with tree-size=7000. 
l2 on the output (0.005,0.002) +%WER 16.54 [ 11522 / 69668, 1123 ins, 2739 del, 7660 sub ] exp/chain/tdnn_1a_7000_005_sp/decode_test/wer_9_0.5 +%WER 16.47 [ 11474 / 69668, 1421 ins, 2207 del, 7846 sub ] exp/chain/tdnn_1a_7000_002_sp/decode_test/wer_9_0.0 + +#current best 'chain' models (see local/chain/tuning/run_tdnn_1a.sh) +%WER 16.47 [ 11474 / 69668, 1421 ins, 2207 del, 7846 sub ] exp/chain/tdnn_1a_sp/decode_test/wer_9_0.0 diff --git a/egs/gale_arabic/s5b/cmd.sh b/egs/gale_arabic/s5b/cmd.sh index 71dd849a93b..ea341c98d4a 100755 --- a/egs/gale_arabic/s5b/cmd.sh +++ b/egs/gale_arabic/s5b/cmd.sh @@ -10,6 +10,6 @@ # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. -export train_cmd="queue.pl --mem 2G" -export decode_cmd="queue.pl --mem 4G" -export mkgraph_cmd="queue.pl --mem 8G" +export train_cmd="retry.pl queue.pl --mem 2G" +export decode_cmd="retry.pl queue.pl --mem 4G" +export mkgraph_cmd="retry.pl queue.pl --mem 8G" diff --git a/egs/gale_arabic/s5b/local/chain/compare_wer.sh b/egs/gale_arabic/s5b/local/chain/compare_wer.sh new file mode 100755 index 00000000000..1a40523355a --- /dev/null +++ b/egs/gale_arabic/s5b/local/chain/compare_wer.sh @@ -0,0 +1,72 @@ +#!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. local/chain/compare_wer.sh exp/chain/cnn{1a,1b} + +# ./local/chain/compare_wer.sh exp/chain/cnn1a +# System cnn1a +# WER 0.61 +# CER 0.15 +# Final train prob -0.0377 +# Final valid prob -0.0380 +# Final train prob (xent) -0.0830 +# Final valid prob (xent) -0.0838 + +if [ $# == 0 ]; then + echo "Usage: $0: [ ... ]" + echo "e.g.: $0 exp/chain/cnn{1a,1b}" + exit 1 +fi + +echo "# $0 $*" +used_epochs=false + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +echo -n "# WER " +for x in $*; do + wer=$(cat $x/decode_test/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "# CER " +for x in $*; do + cer=$(cat $x/decode_test/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. +fi + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train prob (xent) " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob (xent) " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo diff --git a/egs/gale_arabic/s5b/local/chain/run_chain_common.sh b/egs/gale_arabic/s5b/local/chain/run_chain_common.sh new file mode 100755 index 00000000000..da37e148441 --- /dev/null +++ b/egs/gale_arabic/s5b/local/chain/run_chain_common.sh @@ -0,0 +1,82 @@ +#!/bin/bash + +# this script has common stages shared across librispeech chain recipes. 
+# It generates a new topology in a new lang directory, gets the alignments as +# lattices, and builds a tree for the new topology +set -e + +stage=11 + +# input directory names. These options are actually compulsory, and they have +# been named for convenience +gmm_dir= +ali_dir= +lores_train_data_dir= + +num_leaves=6000 + +# output directory names. They are also compulsory. +lang= +lat_dir= +tree_dir= +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +[ -z $lang ] && echo "Set --lang, this specifies the new lang directory which will have the new topology" && exit 1; +[ -z $lat_dir ] && echo "Set --lat-dir, this specifies the experiment directory to store lattice" && exit 1; +[ -z $tree_dir ] && echo "Set --tree-dir, this specifies the directory to store new tree " && exit 1; + +for f in $gmm_dir/final.mdl $ali_dir/ali.1.gz $lores_train_data_dir/feats.scp; do + [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1 +done + +if [ $stage -le 11 ]; then + echo "$0: creating lang directory with one state per phone." + # Create a version of the lang/ directory that has one state per phone in the + # topo file. [note, it really has two states.. the first one is only repeated + # once, the second one has zero or more repeats.] + if [ -d $lang ]; then + if [ $lang/L.fst -nt data/lang/L.fst ]; then + echo "$0: $lang already exists, not overwriting it; continuing" + else + echo "$0: $lang already exists and seems to be older than data/lang..." + echo " ... not sure what to do. Exiting." + exit 1; + fi + else + cp -r data/lang $lang + silphonelist=$(cat $lang/phones/silence.csl) || exit 1; + nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1; + # Use our special topology... note that later on may have to tune this + # topology. + steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo + fi +fi + +if [ $stage -le 12 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + nj=$(cat ${ali_dir}/num_jobs) || exit 1; + steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" ${lores_train_data_dir} \ + $lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 13 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" $num_leaves ${lores_train_data_dir} $lang $ali_dir $tree_dir +fi + +exit 0; diff --git a/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1a.sh b/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1a.sh index 7afafb31ff6..bf2e45c9914 100755 --- a/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1a.sh @@ -1,31 +1,51 @@ #!/bin/bash -#started from tedlium recipe with few edits +# ./local/chain/compare_wer.sh exp/chain/tdnn_1a_sp +# System tdnn_1a_sp +# WER 16.47 +# CER 6.68 +# Final train prob -0.0652 +# Final valid prob -0.0831 +# Final train prob (xent) -0.8965 +# Final valid prob (xent) -0.9964 +# steps/info/chain_dir_info.pl exp/chain/tdnn_1a_sp/ +# exp/chain/tdnn_1a_sp/: num-iters=441 nj=3..16 num-params=18.6M dim=40+100->5816 combine=-0.063->-0.062 (over 6) xent:train/valid[293,440,final]=(-1.22,-0.912,-0.896/-1.29,-1.01,-0.996) logprob:train/valid[293,440,final]=(-0.097,-0.066,-0.065/-0.108,-0.084,-0.083) -set -e -o pipefail -# First the options that are passed through to run_ivector_common.sh -# (some of which are also used in this script directly). +set -e -o pipefail stage=0 nj=30 -decode_nj=30 -min_seg_len=1.55 -xent_regularize=0.1 train_set=train -gmm=tri2b # the gmm for the target data +test_set=test +gmm=tri3b # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. num_threads_ubm=32 -nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned - -# The rest are configs specific to this script. Most of the parameters -# are just hardcoded at this level, in the commands below. -train_stage=-10 #default -10 -tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. -tdnn_affix=1b #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. -common_egs_dir= # you can set this to use previously dumped egs. +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. + +# Options which are not passed through to run_ivector_common.sh +affix=_1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +common_egs_dir= +reporting_email= + +# LSTM/chain options +train_stage=-10 +xent_regularize=0.1 +dropout_schedule='0,0@0.20,0.5@0.50,0' + +# training chunk-options +chunk_width=150,110,100 +get_egs_stage=-10 + +# training options +srand=0 +remove_egs=true +run_ivector_common=true +run_chain_common=true # End configuration section. echo "$0 $@" # Print the command line for logging + . ./cmd.sh . ./path.sh . ./utils/parse_options.sh @@ -39,169 +59,162 @@ where "nvcc" is installed. 
EOF fi -local/nnet3/run_ivector_common.sh --stage $stage \ - --nj $nj \ - --min-seg-len $min_seg_len \ - --train-set $train_set \ - --gmm $gmm \ - --num-threads-ubm $num_threads_ubm \ - --nnet3-affix "$nnet3_affix" - - -gmm_dir=exp/$gmm -ali_dir=exp/${gmm}_ali_${train_set}_sp_comb -tree_dir=exp/chain${nnet3_affix}/tree_bi${tree_affix} -lat_dir=exp/chain${nnet3_affix}/${gmm}_${train_set}_sp_comb_lats -dir=exp/chain${nnet3_affix}/tdnn${tdnn_affix}_sp_bi -train_data_dir=data/${train_set}_sp_hires_comb -lores_train_data_dir=data/${train_set}_sp_comb -train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires_comb - +if $run_ivector_common; then + local/nnet3/run_ivector_common.sh \ + --stage $stage --nj $nj \ + --train-set $train_set --gmm $gmm \ + --num-threads-ubm $num_threads_ubm \ + --nnet3-affix "$nnet3_affix" +fi -for f in $gmm_dir/final.mdl $train_data_dir/feats.scp $train_ivector_dir/ivector_online.scp \ - $lores_train_data_dir/feats.scp $ali_dir/ali.1.gz $gmm_dir/final.mdl; do +gmm_dir=exp/${gmm} +ali_dir=exp/${gmm}_ali_${train_set}_sp +lat_dir=exp/chain${nnet3_affix}/${gmm}_${train_set}_sp_lats +dir=exp/chain${nnet3_affix}/tdnn${affix}_sp +train_data_dir=data/${train_set}_sp_hires +train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires +lores_train_data_dir=data/${train_set}_sp + +# note: you don't necessarily have to change the treedir name +# each time you do a new experiment-- only if you change the +# configuration in a way that affects the tree. +tree_dir=exp/chain${nnet3_affix}/tree_a_sp +# the 'lang' directory is created by this script. +# If you create such a directory with a non-standard topology +# you should probably name it differently. +lang=data/lang_chain + +for f in $train_data_dir/feats.scp $train_ivector_dir/ivector_online.scp \ + $lores_train_data_dir/feats.scp $gmm_dir/final.mdl \ + $ali_dir/ali.1.gz $gmm_dir/final.mdl; do [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1 done -if [ $stage -le 14 ]; then - echo "$0: creating lang directory with one state per phone." - # Create a version of the lang/ directory that has one state per phone in the - # topo file. [note, it really has two states.. the first one is only repeated - # once, the second one has zero or more repeats.] - if [ -d data/lang_chain ]; then - if [ data/lang_chain/L.fst -nt data/lang/L.fst ]; then - echo "$0: data/lang_chain already exists, not overwriting it; continuing" - else - echo "$0: data/lang_chain already exists and seems to be older than data/lang..." - echo " ... not sure what to do. Exiting." - exit 1; - fi - else - cp -r data/lang data/lang_chain - silphonelist=$(cat data/lang_chain/phones/silence.csl) || exit 1; - nonsilphonelist=$(cat data/lang_chain/phones/nonsilence.csl) || exit 1; - # Use our special topology... note that later on may have to tune this - # topology. - steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >data/lang_chain/topo - fi +# Please take this as a reference on how to specify all the options of +# local/chain/run_chain_common.sh +if $run_chain_common; then + local/chain/run_chain_common.sh --stage $stage \ + --gmm-dir $gmm_dir \ + --ali-dir $ali_dir \ + --lores-train-data-dir ${lores_train_data_dir} \ + --lang $lang \ + --lat-dir $lat_dir \ + --num-leaves 7000 \ + --tree-dir $tree_dir || exit 1; fi if [ $stage -le 15 ]; then - # Get the alignments as lattices (gives the chain training more freedom). 
- # use the same num-jobs as the alignments - steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ - data/lang $gmm_dir $lat_dir - rm $lat_dir/fsts.*.gz # save space -fi - -if [ $stage -le 16 ]; then - # Build a tree using our new topology. We know we have alignments for the - # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use - # those. - if [ -f $tree_dir/final.mdl ]; then - echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." - exit 1; - fi - steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ - --context-opts "--context-width=2 --central-position=1" \ - --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir -fi - -if [ $stage -le 17 ]; then mkdir -p $dir - echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + affine_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true" + tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.01" + output_opts="l2-regularize=0.002" mkdir -p $dir/configs + cat < $dir/configs/network.xconfig input dim=100 name=ivector input dim=40 name=input - # please note that it is important to have input layer with the name=input # as the layer immediately preceding the fixed-affine-layer to enable # the use of short notation for the descriptor fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat - # the first splicing is moved before the lda layer, so no splicing here - relu-renorm-layer name=tdnn1 dim=450 - relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=450 - relu-renorm-layer name=tdnn3 input=Append(-1,0,1,2) dim=450 - relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=450 - relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=450 - relu-renorm-layer name=tdnn6 input=Append(-6,-3,0) dim=450 - - ## adding the layers for chain branch - relu-renorm-layer name=prefinal-chain input=tdnn6 dim=450 target-rms=0.5 - output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 - - # adding the layers for xent branch - # This block prints the configs for a separate output that will be - # trained with a cross-entropy objective in the 'chain' models... this - # has the effect of regularizing the hidden parts of the model. we use - # 0.5 / args.xent_regularize as the learning rate factor- the factor of - # 0.5 / args.xent_regularize is suitable as it means the xent - # final-layer learns at a rate independent of the regularization - # constant; and the 0.5 was tuned so as to make the relative progress - # similar in the xent and regular final layers. 
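# A quick worked example of the factor described above, assuming the default
# xent_regularize=0.1 that this script sets: the xent output layer gets
# learning-rate-factor 0.5 / 0.1 = 5.0, which the script computes as
#   learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python)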
- relu-renorm-layer name=prefinal-xent input=tdnn6 dim=450 target-rms=0.5 - output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 - + relu-batchnorm-dropout-layer name=tdnn1 $affine_opts dim=1536 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf14 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf15 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + linear-component name=prefinal-l dim=256 $linear_opts + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts EOF steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ - fi -if [ $stage -le 18 ]; then + +if [ $stage -le 16 ]; then if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then utils/create_split_dir.pl \ - /export/b0{5,6,7,8}/$USER/kaldi-data/egs/gale_arabic-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/wsj-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage fi - steps/nnet3/chain/train.py --stage $train_stage \ + steps/nnet3/chain/train.py --stage $train_stage \ --cmd "$decode_cmd" \ --feat.online-ivector-dir $train_ivector_dir \ --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ --chain.xent-regularize $xent_regularize \ --chain.leaky-hmm-coefficient 0.1 \ - --chain.l2-regularize 0.00005 \ + --chain.l2-regularize 0.0 \ --chain.apply-deriv-weights false \ --chain.lm-opts="--num-extra-lm-states=2000" \ - --egs.dir "$common_egs_dir" \ - --egs.opts "--frames-overlap-per-eg 0" \ - --egs.chunk-width 150 \ - --trainer.num-chunk-per-minibatch 128 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs 6 \ --trainer.frames-per-iter 1500000 \ - --trainer.num-epochs 4 \ - --trainer.optimization.num-jobs-initial 2 \ - --trainer.optimization.num-jobs-final 2 \ - --trainer.optimization.initial-effective-lrate 0.001 \ - --trainer.optimization.final-effective-lrate 0.0001 \ - --trainer.max-param-change 2.0 \ - --cleanup.remove-egs true \ - --feat-dir $train_data_dir \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.00025 \ + --trainer.optimization.final-effective-lrate 0.000025 \ + --trainer.num-chunk-per-minibatch=64,32 \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0 --constrained false" \ + --egs.stage $get_egs_stage \ + --reporting.email="$reporting_email" \ + --cleanup.remove-egs=$remove_egs \ + --feat-dir=$train_data_dir \ --tree-dir $tree_dir \ - --lat-dir $lat_dir \ - --dir $dir -fi - + --lat-dir=$lat_dir \ + --dir $dir || exit 1; +fi -if [ $stage -le 19 ]; then - # Note: it might appear that this data/lang_chain directory is mismatched, and it is as - # far as the 'topo' is concerned, but this script doesn't read the 'topo' from - # the lang directory. - utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_test $dir $dir/graph +if [ $stage -le 17 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. 
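# As a hypothetical illustration of the point above (the directory name
# data/lang_test_big is made up for this sketch): any lang directory whose
# phones.txt matches the model's could be used to build a graph, e.g. one
# carrying a larger LM:
#   utils/lang/check_phones_compatible.sh data/lang_test_big/phones.txt $lang/phones.txt
#   utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test_big $tree_dir $tree_dir/graph_big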
+ + utils/lang/check_phones_compatible.sh \ + data/lang_test/phones.txt $lang/phones.txt + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test \ + $tree_dir $tree_dir/graph || exit 1; fi -if [ $stage -le 20 ]; then +if [ $stage -le 18 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) rm $dir/.error 2>/dev/null || true - steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ - --acwt 1.0 --post-decode-acwt 10.0 \ - --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_test_hires \ - --scoring-opts "--min-lmwt 5 " \ - $dir/graph data/test_hires $dir/decode || exit 1; + + steps/nnet3/decode.sh \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 0 --extra-right-context 0 \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$decode_cmd" --num-threads 4 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${test_set}_hires \ + $tree_dir/graph data/${test_set}_hires ${dir}/decode_${test_set} || exit 1 fi -exit 0 diff --git a/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh index 604f32a1de4..deebafc95e4 100755 --- a/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh +++ b/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh @@ -120,7 +120,7 @@ if [ $stage -le 17 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/gale_arabic/s5b/local/gale_data_prep_audio.sh b/egs/gale_arabic/s5b/local/gale_data_prep_audio.sh deleted file mode 100755 index 0125272d06c..00000000000 --- a/egs/gale_arabic/s5b/local/gale_data_prep_audio.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash - -# Copyright 2014 QCRI (author: Ahmed Ali) -# Apache 2.0 - - -galeData=$(utils/make_absolute.sh "${@: -1}" ); # last argumnet; the local folder -audio_dvds=${@:1:${#}-1} # all the audio dvds for GALE corpus; ; check audio=( in ../run.sh - -mkdir -p $galeData - -# check that sox is installed -which sox &>/dev/null -if [[ $? != 0 ]]; then - echo "sox is not installed"; exit 1 -fi - -for dvd in $audio_dvds; do - dvd_full_path=$(utils/make_absolute.sh $dvd) - if [[ ! 
-e $dvd_full_path ]]; then - echo missing $dvd_full_path; exit 1; - fi - find $dvd_full_path \( -name "*.wav" -o -name "*.flac" \) | while read file; do - id=$(basename $file | awk '{gsub(".wav","");gsub(".flac","");print}') - echo "$id sox $file -r 16000 -t wav - |" - done -done | sort -u > $galeData/wav.scp - -echo data prep audio succeded - -exit 0 - diff --git a/egs/gale_arabic/s5b/local/gale_data_prep_split.sh b/egs/gale_arabic/s5b/local/gale_data_prep_split.sh deleted file mode 100755 index b18a4e5b105..00000000000 --- a/egs/gale_arabic/s5b/local/gale_data_prep_split.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash - -# Copyright 2014 QCRI (author: Ahmed Ali) -# Apache 2.0 - -if [ $# -ne 1 ]; then - echo "Arguments should be the "; exit 1 -fi - - -#data will data/local - -galeData=$(utils/make_absolute.sh $1) -mkdir -p data/local -dir=$(utils/make_absolute.sh data/local) - - -grep -f local/test_list $galeData/all | grep -v -f local/bad_segments > $galeData/all.test -grep -v -f local/test_list $galeData/all | grep -v -f local/bad_segments > $galeData/all.train - -for x in test train; do - outdir=$dir/$x - file=$galeData/all.$x - mkdir -p $outdir - awk '{print $2 " " $2}' $file | sort -u > $outdir/utt2spk - cp -pr $outdir/utt2spk $outdir/spk2utt - awk '{print $2 " " $1 " " $3 " " $4}' $file | sort -u > $outdir/segments - awk '{printf $2 " "; for (i=5; i<=NF; i++) {printf $i " "} printf "\n"}' $file | sort -u > $outdir/text -done - - -grep -f local/test_list $galeData/wav.scp > $dir/test/wav.scp - -cat $galeData/wav.scp | awk -v seg=$dir/train/segments 'BEGIN{while((getline0) {seen[$2]=1;}} - {if (seen[$1]) { print $0}}' > $dir/train/wav.scp - -echo data prep split succeeded - -exit 0 diff --git a/egs/gale_arabic/s5b/local/gale_data_prep_txt.sh b/egs/gale_arabic/s5b/local/gale_data_prep_txt.sh deleted file mode 100755 index 04529d88ac0..00000000000 --- a/egs/gale_arabic/s5b/local/gale_data_prep_txt.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/bash - -# Copyright 2014 QCRI (author: Ahmed Ali) -# Apache 2.0 - -galeData=$(utils/make_absolute.sh "${@: -1}" ); # last argumnet; the local folder -txt_dvds=${@:1:${#}-1} # all the txt cds correspoding to the audio corpus; check text=( in ../run.sh - - -top_pwd=`pwd` -txtdir=$galeData/txt -mkdir -p $txtdir; cd $txtdir - -for cdx in $txt_dvds; do - echo "Preparing $cdx" - if [[ $cdx == *.tgz ]] ; then - tar -xvf $cdx - elif [ -d "$cdx" ]; then - ln -s $cdx `basename $cdx` - else - echo "I don't really know what I shall do with $cdx " >&2 - fi -done - -find -L . 
-type f -name "*.tdf" | while read file; do -sed '1,3d' $file # delete the first 3 lines -done > all.tmp$$ - -perl -e ' - ($inFile,$idFile,$txtFile)= split /\s+/, $ARGV[0]; - open(IN, "$inFile"); - open(ID, ">$idFile"); - open(TXT, ">$txtFile"); - while () { - @arr= split /\t/,$_; - $start=sprintf ("%0.3f",$arr[2]);$rStart=$start;$start=~s/\.//; $start=~s/^0+$/0/; $start=~s/^0+([^0])/$1/; # remove zeros at the beginning - $end=sprintf ("%0.3f",$arr[3]);$rEnd=$end;$end=~s/^0+([^0])/$1/;$end=~s/\.//; - if ( ($arr[11] !~ m/report/) && ($arr[11] !~ m/conversational/) ){$arr[11]="UNK";} - $id="$arr[11] $arr[0] $arr[0]_${start}_${end} $rStart $rEnd\n"; - next if ($rStart == $rEnd); - $id =~ s/.sph//g; - print ID $id; - print TXT "$arr[7]\n"; - }' "all.tmp$$ allid.tmp$$ contentall.tmp$$" - - -perl ${top_pwd}/local/normalize_transcript_BW.pl contentall.tmp$$ contentall.buck.tmp$$ - -paste allid.tmp$$ contentall.buck.tmp$$ | sed 's: $::' | awk '{if (NF>5) {print $0}}' > all_1.tmp$$ - -awk '{$1="";print $0}' all_1.tmp$$ | sed 's:^ ::' > $galeData/all -awk '{if ($1 == "report") {$1="";print $0}}' all_1.tmp$$ | sed 's:^ ::' > $galeData/report -awk '{if ($1 == "conversational") {$1="";print $0}}' all_1.tmp$$ | sed 's:^ ::' > $galeData/conversational - -cd ..; -rm -fr $txtdir -cd $top_pwd -echo data prep text succeeded - -exit 0 diff --git a/egs/gale_arabic/s5b/local/gale_prep_grapheme_dict.sh b/egs/gale_arabic/s5b/local/gale_prep_grapheme_dict.sh deleted file mode 100755 index 5f101f8245b..00000000000 --- a/egs/gale_arabic/s5b/local/gale_prep_grapheme_dict.sh +++ /dev/null @@ -1,41 +0,0 @@ -#!/bin/bash - -# Copyright 2017 QCRI (author: Ahmed Ali) -# Apache 2.0 - - -# run this from ../ -dir=$(utils/make_absolute.sh data/local/dict) -mkdir -p $dir - - -# (1) Get all avaialble dictionaries, since this is a grapheme model, so we mainly need the most frequent word lists -wget http://alt.qcri.org//resources/speech/dictionary/ar-ar_grapheme_lexicon_2016-02-09.bz2 || exit 1; -wget http://alt.qcri.org//resources/speech/dictionary/ar-ar_lexicon_2014-03-17.txt.bz2 || exit 1; -bzcat ar-ar_grapheme_lexicon_2016-02-09.bz2 | sed '1,3d' | awk '{print $1}' > tmp$$ -bzcat ar-ar_lexicon_2014-03-17.txt.bz2 | sed '1,3d' | awk '{print $1}' >> tmp$$ -# (2) Now we add all the words appeared in the training data -cat data/local/train/text | cut -d ' ' -f 2- | tr -s " " "\n" | sort -u >> tmp$$ -grep -v [0-9] tmp$$ | sed -e 's:[FNKaui\~o\`]::g' -e 's:{:}:g' | sort -u > tmp1.$$ # remove vowels and rare alef wasla -cat tmp1.$$ | sed 's:\(\):\1 :g' | sed -e 's: : :g' -e 's: : :g' -e 's:\s*: :g' -e 's:\*:V:g' > tmp2.$$ -paste -d ' ' tmp1.$$ tmp2.$$ > $dir/lexicon.txt - -#(2) Dictionary preparation: - -# silence phones, one per line. -echo SIL > $dir/silence_phones.txt -echo SIL > $dir/optional_silence.txt - -# nonsilence phones; on each line is a list of phones that correspond -# really to the same base phone. 
-cat tmp2.$$ | tr -s ' ' '\n' | grep -v ^$ | sort -u > $dir/nonsilence_phones.txt || exit 1; - -sed -i '1i SIL' $dir/lexicon.txt # insert word with phone sil at the begining of the dictionary - -rm -fr ar-ar_lexicon_2014-03-17.txt.bz2 ar-ar_grapheme_lexicon_2016-02-09.bz2 tmp$$ tmp1.$$ tmp2.$$ -echo Dictionary preparation succeeded - -# The script is still missing dates and numbers - -exit 0 - diff --git a/egs/gale_arabic/s5b/local/gale_train_lms.sh b/egs/gale_arabic/s5b/local/gale_train_lms.sh deleted file mode 100755 index 3988ec3818f..00000000000 --- a/egs/gale_arabic/s5b/local/gale_train_lms.sh +++ /dev/null @@ -1,81 +0,0 @@ -#!/bin/bash - - -# To be run from one directory above this script. - - -lexicon=data/local/dict/lexicon.txt -[ ! -f $lexicon ] && echo "$0: No such file $lexicon" && exit 1; - - -# This script takes no arguments. It assumes you have already run -# previus steps successfully -# It takes as input the files -#data/local/train.*/text -#data/local/dict/lexicon.txt - - -export LC_ALL=C # You'll get errors about things being not sorted, if you -# have a different locale. -export PATH=$PATH:./../../../tools/kaldi_lm -( # First make sure the kaldi_lm toolkit is installed. - cd $KALDI_ROOT/tools || exit 1; - if [ -d kaldi_lm ]; then - echo Not installing the kaldi_lm toolkit since it is already there. - else - echo Downloading and installing the kaldi_lm tools - if [ ! -f kaldi_lm.tar.gz ]; then - wget http://www.danielpovey.com/files/kaldi/kaldi_lm.tar.gz || exit 1; - fi - tar -xvzf kaldi_lm.tar.gz || exit 1; - cd kaldi_lm - make || exit 1; - echo Done making the kaldi_lm tools - fi -) || exit 1; - - -dir=data/local/lm - mkdir -p $dir - text=data/local/train/text - [ ! -f $text ] && echo "$0: No such file $text" && exit 1; - - cleantext=$dir/text.no_oov - - cat $text | awk -v lex=$lexicon 'BEGIN{while((getline0){ seen[$1]=1; } } - {for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf(" ",$n);} } printf("\n");}' \ - > $cleantext || exit 1; - - - cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \ - sort -nr > $dir/word.counts || exit 1; - - -# Get counts from acoustic training transcripts, and add one-count -# for each word in the lexicon (but not silence, we don't want it -# in the LM-- we'll add it optionally later). - cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \ - cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \ - sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1; - -# note: we probably won't really make use of as there aren't any OOVs - cat $dir/unigram.counts | awk '{print $2}' | get_word_map.pl "" "" "" > $dir/word_map \ - || exit 1; - -# note: ignore 1st field of train.txt, it's the utterance-id. - cat $cleantext | awk -v wmap=$dir/word_map 'BEGIN{while((getline0)map[$1]=$2;} - { for(n=2;n<=NF;n++) { printf map[$n]; if(n$dir/train.gz \ - || exit 1; - - train_lm.sh --arpa --lmtype 3gram-mincount $dir || exit 1; - -# LM is small enough that we don't need to prune it (only about 0.7M N-grams). 
-# Perplexity over 128254.000000 words is 90.446690 - -# note: output is -# data/local/lm/3gram-mincount/lm_unpruned.gz - - -echo train lm succeeded - -exit 0 diff --git a/egs/gale_arabic/s5b/local/nnet3/run_ivector_common.sh b/egs/gale_arabic/s5b/local/nnet3/run_ivector_common.sh index f14c8441869..a03cc5b2fa3 100755 --- a/egs/gale_arabic/s5b/local/nnet3/run_ivector_common.sh +++ b/egs/gale_arabic/s5b/local/nnet3/run_ivector_common.sh @@ -2,31 +2,29 @@ set -e -o pipefail -# This script is called from local/nnet3/run_tdnn.sh and local/chain/run_tdnn.sh (and may eventually -# be called by more scripts). It contains the common feature preparation and iVector-related parts -# of the script. See those scripts for examples of usage. +# This script is called from scripts like local/nnet3/run_tdnn.sh and +# local/chain/run_tdnn.sh (and may eventually be called by more scripts). It +# contains the common feature preparation and iVector-related parts of the +# script. See those scripts for examples of usage. stage=0 nj=100 -min_seg_len=1.55 # min length in seconds... we do this because chain training - # will discard segments shorter than 1.5 seconds. Must remain in sync - # with the same option given to prepare_lores_feats_and_alignments.sh train_set=train # you might set this to e.g. train. -gmm=tri2b # This specifies a GMM-dir from the features of the type you're training the system on; +test_sets="test" +gmm=tri3b # This specifies a GMM-dir from the features of the type you're training the system on; # it should contain alignments for 'train_set'. num_threads_ubm=32 -nnet3_affix=_cleaned # affix for exp/nnet3 directory to put iVector stuff in, so it - # becomes exp/nnet3_cleaned or whatever. +nnet3_affix= # affix for exp/nnet3 directory to put iVector stuff . ./cmd.sh . ./path.sh -. ./utils/parse_options.sh +. utils/parse_options.sh gmm_dir=exp/${gmm} -ali_dir=exp/${gmm}_ali_${train_set}_sp_comb +ali_dir=exp/${gmm}_ali_${train_set}_sp for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do if [ ! -f $f ]; then @@ -61,7 +59,7 @@ if [ $stage -le 2 ]; then utils/create_split_dir.pl /export/b0{5,6,7,8}/$USER/kaldi-data/mfcc/gale_arabic-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage fi - for datadir in ${train_set}_sp test; do + for datadir in ${train_set}_sp ${test_sets}; do utils/copy_data_dir.sh data/$datadir data/${datadir}_hires done @@ -69,7 +67,7 @@ if [ $stage -le 2 ]; then # features; this helps make trained nnets more invariant to test data volume. utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires - for datadir in ${train_set}_sp test; do + for datadir in ${train_set}_sp ${test_sets}; do steps/make_mfcc.sh --nj $nj --mfcc-config conf/mfcc_hires.conf \ --cmd "$train_cmd" data/${datadir}_hires steps/compute_cmvn_stats.sh data/${datadir}_hires @@ -78,75 +76,33 @@ if [ $stage -le 2 ]; then fi if [ $stage -le 3 ]; then - echo "$0: combining short segments of speed-perturbed high-resolution MFCC training data" - # we have to combine short segments or we won't be able to train chain models - # on those segments. - utils/data/combine_short_segments.sh \ - data/${train_set}_sp_hires $min_seg_len data/${train_set}_sp_hires_comb - - # just copy over the CMVN to avoid having to recompute it. - cp data/${train_set}_sp_hires/cmvn.scp data/${train_set}_sp_hires_comb/ - utils/fix_data_dir.sh data/${train_set}_sp_hires_comb/ -fi - -if [ $stage -le 4 ]; then - echo "$0: selecting segments of hires training data that were also present in the" - echo " ... 
original training data." - - # note, these data-dirs are temporary; we put them in a sub-directory - # of the place where we'll make the alignments. - temp_data_root=exp/nnet3${nnet3_affix}/tri5 - mkdir -p $temp_data_root - - utils/data/subset_data_dir.sh --utt-list data/${train_set}/feats.scp \ - data/${train_set}_sp_hires $temp_data_root/${train_set}_hires - - # note: essentially all the original segments should be in the hires data. - n1=$(wc -l /dev/null +if [[ $? != 0 ]]; then + echo "$0: sox is not installed"; exit 1 +fi + +for dvd in $dir1 $dir2 $dir3; do + dvd_full_path=$(utils/make_absolute.sh $dvd) + if [[ ! -e $dvd_full_path ]]; then + echo "$0: missing $dvd_full_path"; exit 1; + fi + find $dvd_full_path \( -name "*.wav" -o -name "*.flac" \) | while read file; do + id=$(basename $file | awk '{gsub(".wav","");gsub(".flac","");print}') + echo "$id sox $file -r 16000 -t wav - |" + done +done | sort -u > $gale_data/wav.scp +echo "$0:data prep audio succeded" + +gale_data=$(utils/make_absolute.sh "GALE" ); +top_pwd=`pwd` +txtdir=$gale_data/txt +mkdir -p $txtdir; cd $txtdir + +for cdx in $text1 $text2 $text3; do + echo "$0:Preparing $cdx" + if [[ $cdx == *.tgz ]] ; then + tar -xvf $cdx + elif [ -d "$cdx" ]; then + ln -s $cdx `basename $cdx` + else + echo "$0:I don't really know what I shall do with $cdx " >&2 + fi +done + +find -L . -type f -name "*.tdf" | while read file; do +sed '1,3d' $file # delete the first 3 lines +done > all.tmp$$ + +perl -e ' + ($inFile,$idFile,$txtFile)= split /\s+/, $ARGV[0]; + open(IN, "$inFile"); + open(ID, ">$idFile"); + open(TXT, ">$txtFile"); + while () { + @arr= split /\t/,$_; + $start=sprintf ("%0.3f",$arr[2]);$rStart=$start;$start=~s/\.//; $start=~s/^0+$/0/; $start=~s/^0+([^0])/$1/; # remove zeros at the beginning + $end=sprintf ("%0.3f",$arr[3]);$rEnd=$end;$end=~s/^0+([^0])/$1/;$end=~s/\.//; + if ( ($arr[11] !~ m/report/) && ($arr[11] !~ m/conversational/) ){$arr[11]="UNK";} + $id="$arr[11] $arr[0] $arr[0]_${start}_${end} $rStart $rEnd\n"; + next if ($rStart == $rEnd); + $id =~ s/.sph//g; + print ID $id; + print TXT "$arr[7]\n"; + }' "all.tmp$$ allid.tmp$$ contentall.tmp$$" + +perl ${top_pwd}/local/normalize_transcript_BW.pl contentall.tmp$$ contentall.buck.tmp$$ +paste allid.tmp$$ contentall.buck.tmp$$ | sed 's: $::' | awk '{if (NF>5) {print $0}}' > all_1.tmp$$ + + +awk '{$1="";print $0}' all_1.tmp$$ | sed 's:^ ::' > $gale_data/all +awk '{if ($1 == "report") {$1="";print $0}}' all_1.tmp$$ | sed 's:^ ::' > $gale_data/report +awk '{if ($1 == "conversational") {$1="";print $0}}' all_1.tmp$$ | sed 's:^ ::' > $gale_data/conversational + +cd ..; +rm -fr $txtdir +cd $top_pwd +echo "$0:dat a prep text succeeded" + +mkdir -p data +dir=$(utils/make_absolute.sh data/) +grep -f local/test_list $gale_data/all | grep -v -f local/bad_segments > $gale_data/all.test +grep -v -f local/test_list $gale_data/all | grep -v -f local/bad_segments > $gale_data/all.train + +for x in test train; do + outdir=data/$x + file=$gale_data/all.$x + mkdir -p $outdir + awk '{print $2 " " $2}' $file | sort -u > $outdir/utt2spk + cp -pr $outdir/utt2spk $outdir/spk2utt + awk '{print $2 " " $1 " " $3 " " $4}' $file | sort -u > $outdir/segments + awk '{printf $2 " "; for (i=5; i<=NF; i++) {printf $i " "} printf "\n"}' $file | sort -u > $outdir/text +done + +grep -f local/test_list $gale_data/wav.scp > $dir/test/wav.scp + +cat $gale_data/wav.scp | awk -v seg=$dir/train/segments 'BEGIN{while((getline0) {seen[$2]=1;}} + {if (seen[$1]) { print $0}}' > $dir/train/wav.scp + +echo "$0:data prep 
split succeeded" +exit 0 diff --git a/egs/gale_arabic/s5b/local/prepare_dict.sh b/egs/gale_arabic/s5b/local/prepare_dict.sh new file mode 100755 index 00000000000..47b5869fdf1 --- /dev/null +++ b/egs/gale_arabic/s5b/local/prepare_dict.sh @@ -0,0 +1,48 @@ +#!/usr/bin/env bash + +# Copyright 2017 QCRI (author: Ahmed Ali) +# Apache 2.0 +# This script prepares the dictionary. + +set -e +dir=data/local/dict +lexicon_url1="http://alt.qcri.org//resources/speech/dictionary/ar-ar_grapheme_lexicon_2016-02-09.bz2"; +lexicon_url2="http://alt.qcri.org//resources/speech/dictionary/ar-ar_lexicon_2014-03-17.txt.bz2"; +stage=0 +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh || exit 1; +mkdir -p $dir data/local/lexicon_data + +if [ $stage -le 0 ]; then + echo "$0: Downloading text for lexicon... $(date)." + wget -P data/local/lexicon_data $lexicon_url1 + wget -P data/local/lexicon_data $lexicon_url2 + bzcat data/local/lexicon_data/ar-ar_grapheme_lexicon_2016-02-09.bz2 | sed '1,3d' | awk '{print $1}' > data/local/lexicon_data/grapheme_lexicon + bzcat data/local/lexicon_data/ar-ar_lexicon_2014-03-17.txt.bz2 | sed '1,3d' | awk '{print $1}' >> data/local/lexicon_data/grapheme_lexicon + cat data/train/text | cut -d ' ' -f 2- | tr -s " " "\n" | sort -u >> data/local/lexicon_data/grapheme_lexicon +fi + + +if [ $stage -le 0 ]; then + echo "$0: processing lexicon text and creating lexicon... $(date)." + # remove vowels and rare alef wasla + grep -v [0-9] data/local/lexicon_data/grapheme_lexicon | sed -e 's:[FNKaui\~o\`]::g' -e 's:{:}:g' | sort -u > data/local/lexicon_data/processed_lexicon + local/prepare_lexicon.py +fi + +cut -d' ' -f2- $dir/lexicon.txt | sed 's/SIL//g' | tr ' ' '\n' | sort -u | sed '/^$/d' >$dir/nonsilence_phones.txt || exit 1; + +sed -i '1i UNK' $dir/lexicon.txt + +echo UNK >> $dir/nonsilence_phones.txt + +echo ' SIL' >> $dir/lexicon.txt + +echo SIL > $dir/silence_phones.txt + +echo SIL >$dir/optional_silence.txt + +echo -n "" >$dir/extra_questions.txt + +echo "$0: Dictionary preparation succeeded" diff --git a/egs/gale_arabic/s5b/local/prepare_lexicon.py b/egs/gale_arabic/s5b/local/prepare_lexicon.py new file mode 100755 index 00000000000..215541585eb --- /dev/null +++ b/egs/gale_arabic/s5b/local/prepare_lexicon.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python3 + +# Copyright 2018 Ashish Arora +# Apache 2.0 + +# This script prepares lexicon. + +import argparse +import os + +parser = argparse.ArgumentParser(description="""Creates the list of characters and words in lexicon""") +args = parser.parse_args() + +### main ### +lex = {} +text_path = os.path.join('data','local', 'lexicon_data', 'processed_lexicon') +with open(text_path, 'r', encoding='utf-8') as f: + for line in f: + line = line.strip() + characters = list(line) + characters = " ".join(['V' if char == '*' else char for char in characters]) + lex[line] = characters + +with open(os.path.join('data','local','dict', 'lexicon.txt'), 'w', encoding='utf-8') as fp: + for key in sorted(lex): + fp.write(key + " " + lex[key] + "\n") diff --git a/egs/gale_arabic/s5b/local/prepare_lm.sh b/egs/gale_arabic/s5b/local/prepare_lm.sh new file mode 100755 index 00000000000..6fdf35f471a --- /dev/null +++ b/egs/gale_arabic/s5b/local/prepare_lm.sh @@ -0,0 +1,51 @@ +#!/bin/bash + +# Copyright 2012 Vassil Panayotov +# 2017 Ewald Enzinger +# Apache 2.0 + +. ./path.sh || exit 1 + +echo "=== Building a language model ..." + +dir=data/local/lm/ +text=data/train/text +lexicon=data/local/dict/lexicon.txt +# Language model order +order=3 + +. 
utils/parse_options.sh + +# Prepare a LM training corpus from the transcripts +mkdir -p $dir + +for f in "$text" "$lexicon"; do + [ ! -f $f ] && echo "$0: No such file $f" && exit 1; +done + +loc=`which ngram-count`; +if [ -z $loc ]; then + if uname -a | grep 64 >/dev/null; then # some kind of 64 bit... + sdir=$KALDI_ROOT/tools/srilm/bin/i686-m64 + else + sdir=$KALDI_ROOT/tools/srilm/bin/i686 + fi + if [ -f $sdir/ngram-count ]; then + echo Using SRILM tools from $sdir + export PATH=$PATH:$sdir + else + echo You appear to not have SRILM tools installed, either on your path, + echo or installed in $sdir. See tools/install_srilm.sh for installation + echo instructions. + exit 1 + fi +fi + +cat data/train/text | cut -d " " -f 2- > $dir/text.txt +cut -d' ' -f1 $lexicon > $dir/wordlist + +ngram-count -text $dir/text.txt -order $order -limit-vocab -vocab $dir/wordlist \ + -unk -map-unk "" -kndiscount -interpolate -lm $dir/lm.gz + +#ngram -lm $dir/lm.gz -ppl $dir/dev.txt +echo "*** Finished building the LM model!" diff --git a/egs/gale_arabic/s5b/local/score.sh b/egs/gale_arabic/s5b/local/score.sh index 83366f7c7fc..1d84815fc69 100755 --- a/egs/gale_arabic/s5b/local/score.sh +++ b/egs/gale_arabic/s5b/local/score.sh @@ -1,60 +1,6 @@ -#!/bin/bash -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey) -# Apache 2.0 - -[ -f ./path.sh ] && . ./path.sh - -# begin configuration section. -cmd=run.pl -stage=0 -decode_mbr=true -word_ins_penalty=0.0 -min_lmwt=7 -max_lmwt=17 -iter= #some of the scripts from steps/ seem to use it -#end configuration section. - -echo "$0 $#" - -[ -f ./path.sh ] && . ./path.sh -. parse_options.sh || exit 1; - -if [ $# -ne 3 ]; then - echo "Usage: local/score.sh [--cmd (run.pl|queue.pl...)] " - echo " Options:" - echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." - echo " --stage (0|1|2) # start scoring script from part-way through." - echo " --decode_mbr (true/false) # maximum bayes risk decoding (confusion network)." - echo " --min_lmwt # minumum LM-weight for lattice rescoring " - echo " --max_lmwt # maximum LM-weight for lattice rescoring " - exit 1; -fi -data=$1 -lang_or_graph=$2 -dir=$3 - -symtab=$lang_or_graph/words.txt - -for f in $symtab $dir/lat.1.gz $data/text; do - [ ! 
-f $f ] && echo "score.sh: no such file $f" && exit 1; -done - -mkdir -p $dir/scoring/log - -cat $data/text | sed 's:::g' | sed 's:::g' > $dir/scoring/test_filt.txt - -$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/best_path.LMWT.log \ - lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ - lattice-add-penalty --word-ins-penalty=$word_ins_penalty ark:- ark:- \| \ - lattice-best-path --word-symbol-table=$symtab \ - ark:- ark,t:$dir/scoring/LMWT.tra || exit 1; +#!/bin/bash -# Note: the double level of quoting for the sed command -$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.log \ - cat $dir/scoring/LMWT.tra \| \ - utils/int2sym.pl -f 2- $symtab \| sed 's:\::g' \| \ - compute-wer --text --mode=present \ - ark:$dir/scoring/test_filt.txt ark,p:- ">&" $dir/wer_LMWT || exit 1; -exit 0; +steps/scoring/score_kaldi_wer.sh "$@" +steps/scoring/score_kaldi_cer.sh --stage 2 "$@" diff --git a/egs/gale_arabic/s5b/local/wer_output_filter b/egs/gale_arabic/s5b/local/wer_output_filter new file mode 100755 index 00000000000..cf48b434144 --- /dev/null +++ b/egs/gale_arabic/s5b/local/wer_output_filter @@ -0,0 +1,19 @@ +#!/usr/bin/env python3 + +# Copyright 2017 Hossein Hadian + +# Apache 2.0 +# This script converts a BPE-encoded text to normal text. It is used in scoring + +import sys, io +import string + +infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') +output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') + +for line in infile: + words = line.strip().split() + words = [word for word in words if '' not in word] + uttid = words[0] + transcript = ' '.join(words[1:]) + output.write(uttid + ' ' + transcript + '\n') diff --git a/egs/gale_arabic/s5b/run.sh b/egs/gale_arabic/s5b/run.sh index c45f5119949..3f12d22495e 100755 --- a/egs/gale_arabic/s5b/run.sh +++ b/egs/gale_arabic/s5b/run.sh @@ -3,177 +3,121 @@ # Copyright 2014 QCRI (author: Ahmed Ali) # Apache 2.0 -. ./path.sh -. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. - ## This relates to the queue. num_jobs=120 num_decode_jobs=40 +decode_gmm=true +stage=0 +overwrite=false -#NB: You can add whatever number of copora you like. The supported extensions -#NB: (formats) are wav and flac. Flac will be converted using sox and in contrast -#NB: with the old approach, the conversion will be on-the-fly and one-time-only -#NB: during the parametrization. - -#NB: Text corpora scpecification. We support either tgz files, which are unpacked -#NB: or just plain (already unpacked) directories. The list of transcript is then -#NB: obtained using find command - -#Make sure you edit this section to reflect whers you keep the LDC data on your cluster - -#This is CLSP configuration. We add the 2014 GALE data. We got around 2 % -#improvement just by including it. The gain might be large if someone would tweak -# the number of leaves and states and so on. 
- -#audio=( -# /export/corpora/LDC/LDC2013S02/ -# /export/corpora/LDC/LDC2013S07/ -# /export/corpora/LDC/LDC2014S07/ -#) -#text=( -# /export/corpora/LDC/LDC2013T17 -# /export/corpora/LDC/LDC2013T04 -# /export/corpora/LDC/LDC2014T17 -#) - -audio=( - /data/sls/scratch/amali/data/GALE/LDC2013S02 - /data/sls/scratch/amali/data/GALE/LDC2013S07 - /data/sls/scratch/amali/data/GALE/LDC2014S07 -) -text=( - /data/sls/scratch/amali/data/GALE/LDC2013T17.tgz - /data/sls/scratch/amali/data/GALE/LDC2013T04.tgz - /data/sls/scratch/amali/data/GALE/LDC2014T17.tgz -) +dir1=/export/corpora/LDC/LDC2013S02/ +dir2=/export/corpora/LDC/LDC2013S07/ +dir3=/export/corpora/LDC/LDC2014S07/ +text1=/export/corpora/LDC/LDC2013T17/ +text2=/export/corpora/LDC/LDC2013T04/ +text3=/export/corpora/LDC/LDC2014T17/ galeData=GALE -#prepare the data -#split train dev test -#prepare lexicon and LM - -# You can run the script from here automatically, but it is recommended to run the data preparation, -# and features extraction manually and and only once. -# By copying and pasting into your shell. - -#copy the audio files to local folder wav and convet flac files to wav -local/gale_data_prep_audio.sh "${audio[@]}" $galeData || exit 1; +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. +. ./path.sh +. ./utils/parse_options.sh # e.g. this parses the above options + # if supplied. -#get the transcription and remove empty prompts and all noise markers -local/gale_data_prep_txt.sh "${text[@]}" $galeData || exit 1; +if [ $stage -le 0 ]; then -# split the data to reports and conversational and for each class will have rain/dev and test -local/gale_data_prep_split.sh $galeData || exit 1; + if [ -f data/train/text ] && ! $overwrite; then + echo "$0: Not processing, probably script have run from wrong stage" + echo "Exiting with status 1 to avoid data corruption" + exit 1; + fi -# get all Arabic grapheme dictionaries and add silence and UNK -local/gale_prep_grapheme_dict.sh || exit 1; + echo "$0: Preparing data..." + local/prepare_data.sh --dir1 $dir1 --dir2 $dir2 --dir3 $dir3 \ + --text1 $text1 --text2 $text2 --text3 $text3 + echo "$0: Preparing lexicon and LM..." + local/prepare_dict.sh -#prepare the langauge resources -utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang || exit 1; + utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang -# LM training -local/gale_train_lms.sh || exit 1; + local/prepare_lm.sh -local/gale_format_data.sh || exit 1; -# G compilation, check LG composition + utils/format_lm.sh data/lang data/local/lm/lm.gz \ + data/local/dict/lexicon.txt data/lang_test +fi -# Now make MFCC features. -# mfccdir should be some place with a largish disk where you -# want to store MFCC features. mfccdir=mfcc - -for x in train test ; do - steps/make_mfcc.sh --cmd "$train_cmd" --nj $num_jobs \ - data/$x exp/make_mfcc/$x $mfccdir - utils/fix_data_dir.sh data/$x # some files fail to get mfcc for many reasons - steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir -done - - -# Here we start the AM - -# Let's create a subset with 10k segments to make quick flat-start training: -utils/subset_data_dir.sh data/train 10000 data/train.10K || exit 1; - -# Train monophone models on a subset of the data, 10K segment -# Note: the --boost-silence option should probably be omitted by default -steps/train_mono.sh --nj 40 --cmd "$train_cmd" \ - data/train.10K data/lang exp/mono || exit 1; - - -# Get alignments from monophone system. 
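# Usage sketch for the staged gale_arabic s5b run.sh introduced in this diff
# (the LDC paths are placeholders; point them at wherever the GALE corpora
# live on your system):
#   ./run.sh --dir1 /path/to/LDC2013S02 --dir2 /path/to/LDC2013S07 --dir3 /path/to/LDC2014S07 \
#            --text1 /path/to/LDC2013T17 --text2 /path/to/LDC2013T04 --text3 /path/to/LDC2014T17
# Since the script sources utils/parse_options.sh, stages can be resumed and
# the GMM decoding passes skipped from the command line:
#   ./run.sh --stage 1
#   ./run.sh --decode-gmm false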
-steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \ - data/train data/lang exp/mono exp/mono_ali || exit 1; - -# train tri1 [first triphone pass] -steps/train_deltas.sh --cmd "$train_cmd" \ - 2500 30000 data/train data/lang exp/mono_ali exp/tri1 || exit 1; - -# First triphone decoding -utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph -steps/decode.sh --nj $num_decode_jobs --cmd "$decode_cmd" \ - exp/tri1/graph data/test exp/tri1/decode - -steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \ - data/train data/lang exp/tri1 exp/tri1_ali || exit 1; - -# Train tri2a, which is deltas+delta+deltas -steps/train_deltas.sh --cmd "$train_cmd" \ - 3000 40000 data/train data/lang exp/tri1_ali exp/tri2a || exit 1; - -# tri2a decoding -utils/mkgraph.sh data/lang_test exp/tri2a exp/tri2a/graph -steps/decode.sh --nj $num_decode_jobs --cmd "$decode_cmd" \ - exp/tri2a/graph data/test exp/tri2a/decode - -# train and decode tri2b [LDA+MLLT] -steps/train_lda_mllt.sh --cmd "$train_cmd" 4000 50000 \ - data/train data/lang exp/tri1_ali exp/tri2b || exit 1; - -utils/mkgraph.sh data/lang_test exp/tri2b exp/tri2b/graph -steps/decode.sh --nj $num_decode_jobs --cmd "$decode_cmd" \ - exp/tri2b/graph data/test exp/tri2b/decode - -# Align all data with LDA+MLLT system (tri2b) -steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \ - --use-graphs true data/train data/lang exp/tri2b exp/tri2b_ali || exit 1; - - -# From 2b system, train 3b which is LDA + MLLT + SAT. -steps/train_sat.sh --cmd "$train_cmd" \ - 5000 100000 data/train data/lang exp/tri2b_ali exp/tri3b || exit 1; - -utils/mkgraph.sh data/lang_test exp/tri3b exp/tri3b/graph -steps/decode_fmllr.sh --nj $num_decode_jobs --cmd \ - "$decode_cmd" exp/tri3b/graph data/test exp/tri3b/decode - -# From 3b system, align all data. -steps/align_fmllr.sh --nj $num_jobs --cmd "$train_cmd" \ - data/train data/lang exp/tri3b exp/tri3b_ali || exit 1; - - -# nnet3 cross-entropy -local/nnet3/run_tdnn.sh #tdnn recipe: -local/nnet3/run_lstm.sh --stage 12 #lstm recipe (we skip ivector training) - -# chain lattice-free -local/chain/run_tdnn.sh #tdnn recipe: -local/chain/run_tdnn_lstm.sh #tdnn-lstm recipe: - -time=$(date +"%Y-%m-%d-%H-%M-%S") - -#get detailed WER; reports, conversational and combined -local/split_wer.sh $galeData > RESULTS.details.$USER.$time # to make sure you keep the results timed and owned - -echo training succedded +if [ $stage -le 1 ]; then + echo "$0: Preparing the test and train feature files..." 
+ for x in train test ; do + steps/make_mfcc.sh --cmd "$train_cmd" --nj $num_jobs \ + data/$x exp/make_mfcc/$x $mfccdir + utils/fix_data_dir.sh data/$x # some files fail to get mfcc for many reasons + steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir + done +fi + +if [ $stage -le 2 ]; then + echo "$0: creating sub-set and training monophone system" + utils/subset_data_dir.sh data/train 10000 data/train.10K || exit 1; + + steps/train_mono.sh --nj 40 --cmd "$train_cmd" \ + data/train.10K data/lang exp/mono || exit 1; +fi + +if [ $stage -le 3 ]; then + echo "$0: Aligning data using monophone system" + steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \ + data/train data/lang exp/mono exp/mono_ali || exit 1; + + echo "$0: training triphone system with delta features" + steps/train_deltas.sh --cmd "$train_cmd" \ + 2500 30000 data/train data/lang exp/mono_ali exp/tri1 || exit 1; +fi + +if [ $stage -le 4 ] && $decode_gmm; then + utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph + steps/decode.sh --nj $num_decode_jobs --cmd "$decode_cmd" \ + exp/tri1/graph data/test exp/tri1/decode +fi + +if [ $stage -le 5 ]; then + echo "$0: Aligning data and retraining and realigning with lda_mllt" + steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \ + data/train data/lang exp/tri1 exp/tri1_ali || exit 1; + + steps/train_lda_mllt.sh --cmd "$train_cmd" 4000 50000 \ + data/train data/lang exp/tri1_ali exp/tri2b || exit 1; +fi + +if [ $stage -le 6 ] && $decode_gmm; then + utils/mkgraph.sh data/lang_test exp/tri2b exp/tri2b/graph + steps/decode.sh --nj $num_decode_jobs --cmd "$decode_cmd" \ + exp/tri2b/graph data/test exp/tri2b/decode +fi + +if [ $stage -le 7 ]; then + echo "$0: Aligning data and retraining and realigning with sat_basis" + steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \ + data/train data/lang exp/tri2b exp/tri2b_ali || exit 1; + + steps/train_sat_basis.sh --cmd "$train_cmd" \ + 5000 100000 data/train data/lang exp/tri2b_ali exp/tri3b || exit 1; + + steps/align_fmllr.sh --nj $num_jobs --cmd "$train_cmd" \ + data/train data/lang exp/tri3b exp/tri3b_ali || exit 1; +fi + +if [ $stage -le 8 ] && $decode_gmm; then + utils/mkgraph.sh data/lang_test exp/tri3b exp/tri3b/graph + steps/decode_fmllr.sh --nj $num_decode_jobs --cmd \ + "$decode_cmd" exp/tri3b/graph data/test exp/tri3b/decode +fi + +if [ $stage -le 9 ]; then + echo "$0: Training a regular chain model using the e2e alignments..." + local/chain/run_tdnn.sh +fi + +echo "$0: training succedded" exit 0 - -#TODO: -#LM (4-gram and RNN) rescoring -#combine lattices -#dialect detection - - - - - diff --git a/egs/gale_arabic/s5c/RESULT b/egs/gale_arabic/s5c/RESULT new file mode 100644 index 00000000000..d56c9e2dbc6 --- /dev/null +++ b/egs/gale_arabic/s5c/RESULT @@ -0,0 +1,4 @@ +%WER 41.98 [ 29249 / 69668, 2672 ins, 5990 del, 20587 sub ] exp/tri1_subword/decode/wer_15_0.0 +%WER 37.66 [ 26239 / 69668, 2660 ins, 5255 del, 18324 sub ] exp/tri2b_subword/decode/wer_17_0.0 +%WER 35.26 [ 24565 / 69668, 2879 ins, 4892 del, 16794 sub ] exp/tri3b_subword/decode/wer_17_0.5 +%WER 17.29 [ 12049 / 69668, 1244 ins, 2758 del, 8047 sub ] exp/chain/tdnn_1a_sp/decode_test/wer_10_0.5 diff --git a/egs/gale_arabic/s5c/cmd.sh b/egs/gale_arabic/s5c/cmd.sh new file mode 100755 index 00000000000..ea341c98d4a --- /dev/null +++ b/egs/gale_arabic/s5c/cmd.sh @@ -0,0 +1,15 @@ +# you can change cmd.sh depending on what type of queue you are using. 
+# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="retry.pl queue.pl --mem 2G" +export decode_cmd="retry.pl queue.pl --mem 4G" +export mkgraph_cmd="retry.pl queue.pl --mem 8G" diff --git a/egs/gale_arabic/s5c/conf/decode.config b/egs/gale_arabic/s5c/conf/decode.config new file mode 100644 index 00000000000..6f503eab35e --- /dev/null +++ b/egs/gale_arabic/s5c/conf/decode.config @@ -0,0 +1 @@ +link decode_dnn.config \ No newline at end of file diff --git a/egs/gale_arabic/s5c/conf/mfcc.conf b/egs/gale_arabic/s5c/conf/mfcc.conf new file mode 100644 index 00000000000..7361509099f --- /dev/null +++ b/egs/gale_arabic/s5c/conf/mfcc.conf @@ -0,0 +1 @@ +--use-energy=false # only non-default option. diff --git a/egs/gale_arabic/s5c/conf/mfcc_hires.conf b/egs/gale_arabic/s5c/conf/mfcc_hires.conf new file mode 100644 index 00000000000..c45f2b691a9 --- /dev/null +++ b/egs/gale_arabic/s5c/conf/mfcc_hires.conf @@ -0,0 +1,10 @@ +# config for high-resolution MFCC features, intended for neural network training. +# Note: we keep all cepstra, so it has the same info as filterbank features, +# but MFCC is more easily compressible (because less correlated) which is why +# we prefer this method. +--use-energy=false # use average of log energy, not energy. +--sample-frequency=16000 +--num-mel-bins=40 +--num-ceps=40 +--low-freq=40 # low cutoff frequency for mel bins +--high-freq=-400 # high cutoff frequently, relative to Nyquist of 8000 (=7600) diff --git a/egs/gale_arabic/s5c/conf/online_cmvn.conf b/egs/gale_arabic/s5c/conf/online_cmvn.conf new file mode 100644 index 00000000000..cbdaf5f281c --- /dev/null +++ b/egs/gale_arabic/s5c/conf/online_cmvn.conf @@ -0,0 +1 @@ +# configuration file for apply-cmvn-online, used in the script ../local/online/run_online_decoding_nnet2.sh diff --git a/egs/gale_arabic/s5c/local/bad_segments b/egs/gale_arabic/s5c/local/bad_segments new file mode 100644 index 00000000000..c3413f0714c --- /dev/null +++ b/egs/gale_arabic/s5c/local/bad_segments @@ -0,0 +1,10 @@ +ARABIYA_FROMIRAQ_ARB_20070302_175801_2326286_2327450 +ARABIYA_BILARABI_ARB_20061005_201400_221375_223694 +LBC_NAHAR_ARB_20060911_142800_3683267_3685290 +LBC_NAHAR_ARB_20070303_145800_3249800_3251128 +LBC_NAHAR_ARB_20070303_145800_3623646_3624152 +LBC_NAHAR_ARB_20070305_035800_481003_484069 +ALAM_WITHEVENT_ARB_20070227_205800_3141876_3144152 +ALAM_NEWSRPT_ARB_20070130_015801_2875054_2876396 +ALJZ_TODHARV_ARB_20060914_155800_2947717_2949041 +ALJZ_TODHARV_ARB_20070107_145800_2417848_2419238 diff --git a/egs/gale_arabic/s5c/local/chain/compare_wer.sh b/egs/gale_arabic/s5c/local/chain/compare_wer.sh new file mode 100755 index 00000000000..1a40523355a --- /dev/null +++ b/egs/gale_arabic/s5c/local/chain/compare_wer.sh @@ -0,0 +1,72 @@ +#!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. 
local/chain/compare_wer.sh exp/chain/cnn{1a,1b} + +# ./local/chain/compare_wer.sh exp/chain/cnn1a +# System cnn1a +# WER 0.61 +# CER 0.15 +# Final train prob -0.0377 +# Final valid prob -0.0380 +# Final train prob (xent) -0.0830 +# Final valid prob (xent) -0.0838 + +if [ $# == 0 ]; then + echo "Usage: $0: [ ... ]" + echo "e.g.: $0 exp/chain/cnn{1a,1b}" + exit 1 +fi + +echo "# $0 $*" +used_epochs=false + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +echo -n "# WER " +for x in $*; do + wer=$(cat $x/decode_test/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "# CER " +for x in $*; do + cer=$(cat $x/decode_test/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. +fi + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train prob (xent) " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob (xent) " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo diff --git a/egs/gale_arabic/s5c/local/chain/run_chain_common.sh b/egs/gale_arabic/s5c/local/chain/run_chain_common.sh new file mode 100755 index 00000000000..da37e148441 --- /dev/null +++ b/egs/gale_arabic/s5c/local/chain/run_chain_common.sh @@ -0,0 +1,82 @@ +#!/bin/bash + +# this script has common stages shared across librispeech chain recipes. +# It generates a new topology in a new lang directory, gets the alignments as +# lattices, and builds a tree for the new topology +set -e + +stage=11 + +# input directory names. These options are actually compulsory, and they have +# been named for convenience +gmm_dir= +ali_dir= +lores_train_data_dir= + +num_leaves=6000 + +# output directory names. They are also compulsory. +lang= +lat_dir= +tree_dir= +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +[ -z $lang ] && echo "Set --lang, this specifies the new lang directory which will have the new topology" && exit 1; +[ -z $lat_dir ] && echo "Set --lat-dir, this specifies the experiment directory to store lattice" && exit 1; +[ -z $tree_dir ] && echo "Set --tree-dir, this specifies the directory to store new tree " && exit 1; + +for f in $gmm_dir/final.mdl $ali_dir/ali.1.gz $lores_train_data_dir/feats.scp; do + [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1 +done + +if [ $stage -le 11 ]; then + echo "$0: creating lang directory with one state per phone." + # Create a version of the lang/ directory that has one state per phone in the + # topo file. [note, it really has two states.. the first one is only repeated + # once, the second one has zero or more repeats.] 
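+  # The check below reuses an existing $lang directory only when its L.fst is
+  # newer than data/lang/L.fst; an older $lang is treated as stale and the
+  # script exits rather than silently regenerate the topology.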
+ if [ -d $lang ]; then + if [ $lang/L.fst -nt data/lang/L.fst ]; then + echo "$0: $lang already exists, not overwriting it; continuing" + else + echo "$0: $lang already exists and seems to be older than data/lang..." + echo " ... not sure what to do. Exiting." + exit 1; + fi + else + cp -r data/lang $lang + silphonelist=$(cat $lang/phones/silence.csl) || exit 1; + nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1; + # Use our special topology... note that later on may have to tune this + # topology. + steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo + fi +fi + +if [ $stage -le 12 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + nj=$(cat ${ali_dir}/num_jobs) || exit 1; + steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" ${lores_train_data_dir} \ + $lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 13 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" $num_leaves ${lores_train_data_dir} $lang $ali_dir $tree_dir +fi + +exit 0; diff --git a/egs/gale_arabic/s5c/local/chain/run_tdnn.sh b/egs/gale_arabic/s5c/local/chain/run_tdnn.sh new file mode 120000 index 00000000000..34499362831 --- /dev/null +++ b/egs/gale_arabic/s5c/local/chain/run_tdnn.sh @@ -0,0 +1 @@ +tuning/run_tdnn_1a.sh \ No newline at end of file diff --git a/egs/gale_arabic/s5c/local/chain/run_tdnn_lstm.sh b/egs/gale_arabic/s5c/local/chain/run_tdnn_lstm.sh new file mode 120000 index 00000000000..8e647598556 --- /dev/null +++ b/egs/gale_arabic/s5c/local/chain/run_tdnn_lstm.sh @@ -0,0 +1 @@ +tuning/run_tdnn_lstm_1a.sh \ No newline at end of file diff --git a/egs/gale_arabic/s5c/local/chain/tuning/run_tdnn_1a.sh b/egs/gale_arabic/s5c/local/chain/tuning/run_tdnn_1a.sh new file mode 100755 index 00000000000..bf2e45c9914 --- /dev/null +++ b/egs/gale_arabic/s5c/local/chain/tuning/run_tdnn_1a.sh @@ -0,0 +1,220 @@ +#!/bin/bash + +# ./local/chain/compare_wer.sh exp/chain/tdnn_1a_sp +# System tdnn_1a_sp +# WER 16.47 +# CER 6.68 +# Final train prob -0.0652 +# Final valid prob -0.0831 +# Final train prob (xent) -0.8965 +# Final valid prob (xent) -0.9964 + +# steps/info/chain_dir_info.pl exp/chain/tdnn_1a_sp/ +# exp/chain/tdnn_1a_sp/: num-iters=441 nj=3..16 num-params=18.6M dim=40+100->5816 combine=-0.063->-0.062 (over 6) xent:train/valid[293,440,final]=(-1.22,-0.912,-0.896/-1.29,-1.01,-0.996) logprob:train/valid[293,440,final]=(-0.097,-0.066,-0.065/-0.108,-0.084,-0.083) + + +set -e -o pipefail +stage=0 +nj=30 +train_set=train +test_set=test +gmm=tri3b # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +num_threads_ubm=32 +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. + +# Options which are not passed through to run_ivector_common.sh +affix=_1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. 
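+# (With the defaults above, the experiment directory typically ends up as
+# something like exp/chain${nnet3_affix}/tdnn${affix}_sp, i.e. exp/chain/tdnn_1a_sp
+# as quoted in the results at the top of this script; treat the exact pattern as
+# an illustration rather than a guarantee.)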
+common_egs_dir= +reporting_email= + +# LSTM/chain options +train_stage=-10 +xent_regularize=0.1 +dropout_schedule='0,0@0.20,0.5@0.50,0' + +# training chunk-options +chunk_width=150,110,100 +get_egs_stage=-10 + +# training options +srand=0 +remove_egs=true +run_ivector_common=true +run_chain_common=true +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $affine_opts dim=1536 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf14 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf15 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + linear-component name=prefinal-l dim=256 $linear_opts + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/wsj-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs 6 \ + --trainer.frames-per-iter 1500000 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.00025 \ + --trainer.optimization.final-effective-lrate 0.000025 \ + --trainer.num-chunk-per-minibatch=64,32 \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0 --constrained false" \ + --egs.stage $get_egs_stage \ + --reporting.email="$reporting_email" \ + --cleanup.remove-egs=$remove_egs \ + --feat-dir=$train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir=$lat_dir \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 17 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/lang/check_phones_compatible.sh \ + data/lang_test/phones.txt $lang/phones.txt + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test \ + $tree_dir $tree_dir/graph || exit 1; +fi + +if [ $stage -le 18 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + steps/nnet3/decode.sh \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 0 --extra-right-context 0 \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$decode_cmd" --num-threads 4 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${test_set}_hires \ + $tree_dir/graph data/${test_set}_hires ${dir}/decode_${test_set} || exit 1 +fi diff --git a/egs/gale_arabic/s5c/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/gale_arabic/s5c/local/chain/tuning/run_tdnn_lstm_1a.sh new file mode 100755 index 00000000000..deebafc95e4 --- /dev/null +++ b/egs/gale_arabic/s5c/local/chain/tuning/run_tdnn_lstm_1a.sh @@ -0,0 +1,222 @@ +#!/bin/bash + +#started from tedlium recipe with few edits + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=17 +nj=30 +decode_nj=30 +min_seg_len=1.55 +chunk_left_context=40 +chunk_right_context=0 +label_delay=5 +xent_regularize=0.1 +train_set=train +gmm=tri2b # the gmm for the target data gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +# decode options +extra_left_context=50 +extra_right_context=0 +frames_per_chunk=150 + +# The rest are configs specific to this script. 
Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_lstm_affix=1a #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + + +if [ $stage -le 17 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 + relu-renorm-layer name=tdnn3 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 + relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width "$frames_per_chunk" \ + --egs.chunk-left-context "$chunk_left_context" \ + --egs.chunk-right-context "$chunk_right_context" \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.deriv-truncate-margin 10 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 3 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_test $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_test_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/test_hires $dir/decode || exit 1; +fi +exit 0 diff --git a/egs/gale_arabic/s5c/local/nnet3/run_ivector_common.sh b/egs/gale_arabic/s5c/local/nnet3/run_ivector_common.sh new file mode 100755 index 00000000000..a03cc5b2fa3 --- /dev/null +++ b/egs/gale_arabic/s5c/local/nnet3/run_ivector_common.sh @@ -0,0 +1,182 @@ +#!/bin/bash + +set -e -o pipefail + +# This script is called from scripts like local/nnet3/run_tdnn.sh and +# local/chain/run_tdnn.sh (and may eventually be called by more scripts). It +# contains the common feature preparation and iVector-related parts of the +# script. See those scripts for examples of usage. + + +stage=0 +nj=100 +train_set=train # you might set this to e.g. train. 
+test_sets="test" +gmm=tri3b # This specifies a GMM-dir from the features of the type you're training the system on; + # it should contain alignments for 'train_set'. + +num_threads_ubm=32 +nnet3_affix= # affix for exp/nnet3 directory to put iVector stuff + +. ./cmd.sh +. ./path.sh +. utils/parse_options.sh + + +gmm_dir=exp/${gmm} +ali_dir=exp/${gmm}_ali_${train_set}_sp + +for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do + if [ ! -f $f ]; then + echo "$0: expected file $f to exist" + exit 1 + fi +done + + + +if [ $stage -le 2 ] && [ -f data/${train_set}_sp_hires/feats.scp ]; then + echo "$0: data/${train_set}_sp_hires/feats.scp already exists." + echo " ... Please either remove it, or rerun this script with stage > 2." + exit 1 +fi + + +if [ $stage -le 1 ]; then + echo "$0: preparing directory for speed-perturbed data" + utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp +fi + +if [ $stage -le 2 ]; then + echo "$0: creating high-resolution MFCC features" + + # this shows how you can split across multiple file-systems. we'll split the + # MFCC dir across multiple locations. You might want to be careful here, if you + # have multiple copies of Kaldi checked out and run the same recipe, not to let + # them overwrite each other. + mfccdir=data/${train_set}_sp_hires/data + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + utils/create_split_dir.pl /export/b0{5,6,7,8}/$USER/kaldi-data/mfcc/gale_arabic-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage + fi + + for datadir in ${train_set}_sp ${test_sets}; do + utils/copy_data_dir.sh data/$datadir data/${datadir}_hires + done + + # do volume-perturbation on the training data prior to extracting hires + # features; this helps make trained nnets more invariant to test data volume. + utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires + + for datadir in ${train_set}_sp ${test_sets}; do + steps/make_mfcc.sh --nj $nj --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${datadir}_hires + steps/compute_cmvn_stats.sh data/${datadir}_hires + utils/fix_data_dir.sh data/${datadir}_hires + done +fi + +if [ $stage -le 3 ]; then + echo "$0: computing a subset of data to train the diagonal UBM." + mkdir -p exp/nnet3${nnet3_affix}/diag_ubm + temp_data_root=exp/nnet3${nnet3_affix}/diag_ubm + + # train a diagonal UBM using a subset of about a quarter of the data + num_utts_total=$(wc -l /dev/null || true + steps/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_test_hires \ + ${graph_dir} data/test_hires ${dir}/decode || exit 1 + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/test_hires ${dir}/decode_test ${dir}/decode_test_rescore || exit 1 +fi + +exit 0; diff --git a/egs/gale_arabic/s5c/local/nnet3/tuning/run_tdnn_1a.sh b/egs/gale_arabic/s5c/local/nnet3/tuning/run_tdnn_1a.sh new file mode 100755 index 00000000000..6619df668ef --- /dev/null +++ b/egs/gale_arabic/s5c/local/nnet3/tuning/run_tdnn_1a.sh @@ -0,0 +1,88 @@ +#!/bin/bash + +# started from tedlium recipe with few edits + +set -e -o pipefail -u + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). 
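+# (Roughly speaking they are forwarded as in the sketch below; this is only an
+# illustration of the pattern, not the literal call made by this script:
+#   local/nnet3/run_ivector_common.sh --stage $stage --nj $nj \
+#     --train-set $train_set --gmm $gmm \
+#     --num-threads-ubm $num_threads_ubm --nnet3-affix "$nnet3_affix"
+# )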
+stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +train_set=train +gmm=tri2b # this is the source gmm-dir for the data-type of interest; it + # should have alignments for the specified training data. +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for exp dirs, e.g. _cleaned +tdnn_affix= #affix for TDNN directory e.g. "a" or "b", in case we change the configuration. + +# Options which are not passed through to run_ivector_common.sh +train_stage=-10 +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -7,2 -3,3 0 0" +remove_egs=true +relu_dim=850 +num_epochs=3 + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat < \n"; + exit (1); + } + +# <\check usage> +my $inFile = shift (@ARGV); +my $ouFile = shift(@ARGV); + + +open INFILE, "<$inFile" || die "unable to open the input file $inFile\n"; +binmode INFILE, ":encoding(utf8)"; + + +open OUTPUTFILE, ">$ouFile" or die "unable to open the output mlf file $ouFile\n"; +binmode OUTPUTFILE, ":encoding(utf8)"; + + +while () { + s/[^اأإآبتثجحخدذرزسشصضطظعغفقكلمنهويىئءؤة0-9]+/ /g; ## Removes non Arabic or numbers + my $BW = convertUTF8ToBuckwalter ($_); + print OUTPUTFILE "$BW"."\n"; +} +close INFILE; +close OUTPUTFILE; + + + +# this function is copied from MADATools.pm: MADA Tools + sub convertUTF8ToBuckwalter { + + my ($line)= (@_); + #$line = $UTF8_ENCODING_OBJ->decode($line); ## Same as Encode::decode("utf8",$line), but faster since object already created + $line =~ s/\x{0621}/\'/g; ## HAMZA + $line =~ s/\x{0622}/\|/g; ## ALEF WITH MADDA ABOVE + $line =~ s/\x{0623}/\>/g; ## ALEF WITH HAMZA ABOVE + $line =~ s/\x{0624}/\&/g; ## WAW WITH HAMZA ABOVE + $line =~ s/\x{0625}/\/dev/null +if [[ $? != 0 ]]; then + echo "$0: sox is not installed"; exit 1 +fi + +for dvd in $dir1 $dir2 $dir3; do + dvd_full_path=$(utils/make_absolute.sh $dvd) + if [[ ! -e $dvd_full_path ]]; then + echo "$0: missing $dvd_full_path"; exit 1; + fi + find $dvd_full_path \( -name "*.wav" -o -name "*.flac" \) | while read file; do + id=$(basename $file | awk '{gsub(".wav","");gsub(".flac","");print}') + echo "$id sox $file -r 16000 -t wav - |" + done +done | sort -u > $gale_data/wav.scp +echo "$0:data prep audio succeded" + +gale_data=$(utils/make_absolute.sh "GALE" ); +top_pwd=`pwd` +txtdir=$gale_data/txt +mkdir -p $txtdir; cd $txtdir + +for cdx in $text1 $text2 $text3; do + echo "$0:Preparing $cdx" + if [[ $cdx == *.tgz ]] ; then + tar -xvf $cdx + elif [ -d "$cdx" ]; then + ln -s $cdx `basename $cdx` + else + echo "$0:I don't really know what I shall do with $cdx " >&2 + fi +done + +find -L . 
-type f -name "*.tdf" | while read file; do +sed '1,3d' $file # delete the first 3 lines +done > all.tmp$$ + +perl -e ' + ($inFile,$idFile,$txtFile)= split /\s+/, $ARGV[0]; + open(IN, "$inFile"); + open(ID, ">$idFile"); + open(TXT, ">$txtFile"); + while () { + @arr= split /\t/,$_; + $start=sprintf ("%0.3f",$arr[2]);$rStart=$start;$start=~s/\.//; $start=~s/^0+$/0/; $start=~s/^0+([^0])/$1/; # remove zeros at the beginning + $end=sprintf ("%0.3f",$arr[3]);$rEnd=$end;$end=~s/^0+([^0])/$1/;$end=~s/\.//; + if ( ($arr[11] !~ m/report/) && ($arr[11] !~ m/conversational/) ){$arr[11]="UNK";} + $id="$arr[11] $arr[0] $arr[0]_${start}_${end} $rStart $rEnd\n"; + next if ($rStart == $rEnd); + $id =~ s/.sph//g; + print ID $id; + print TXT "$arr[7]\n"; + }' "all.tmp$$ allid.tmp$$ contentall.tmp$$" + +perl ${top_pwd}/local/normalize_transcript_BW.pl contentall.tmp$$ contentall.buck.tmp$$ +paste allid.tmp$$ contentall.buck.tmp$$ | sed 's: $::' | awk '{if (NF>5) {print $0}}' > all_1.tmp$$ + + +awk '{$1="";print $0}' all_1.tmp$$ | sed 's:^ ::' > $gale_data/all +awk '{if ($1 == "report") {$1="";print $0}}' all_1.tmp$$ | sed 's:^ ::' > $gale_data/report +awk '{if ($1 == "conversational") {$1="";print $0}}' all_1.tmp$$ | sed 's:^ ::' > $gale_data/conversational + +cd ..; +rm -fr $txtdir +cd $top_pwd +echo "$0:dat a prep text succeeded" + +mkdir -p data +dir=$(utils/make_absolute.sh data/) +grep -f local/test_list $gale_data/all | grep -v -f local/bad_segments > $gale_data/all.test +grep -v -f local/test_list $gale_data/all | grep -v -f local/bad_segments > $gale_data/all.train + +for x in test train; do + outdir=data/$x + file=$gale_data/all.$x + mkdir -p $outdir + awk '{print $2 " " $2}' $file | sort -u > $outdir/utt2spk + cp -pr $outdir/utt2spk $outdir/spk2utt + awk '{print $2 " " $1 " " $3 " " $4}' $file | sort -u > $outdir/segments + awk '{printf $2 " "; for (i=5; i<=NF; i++) {printf $i " "} printf "\n"}' $file | sort -u > $outdir/text +done + +grep -f local/test_list $gale_data/wav.scp > $dir/test/wav.scp + +cat $gale_data/wav.scp | awk -v seg=$dir/train/segments 'BEGIN{while((getline0) {seen[$2]=1;}} + {if (seen[$1]) { print $0}}' > $dir/train/wav.scp + +echo "$0:data prep split succeeded" +exit 0 diff --git a/egs/gale_arabic/s5c/local/prepare_dict.sh b/egs/gale_arabic/s5c/local/prepare_dict.sh new file mode 100755 index 00000000000..47b5869fdf1 --- /dev/null +++ b/egs/gale_arabic/s5c/local/prepare_dict.sh @@ -0,0 +1,48 @@ +#!/usr/bin/env bash + +# Copyright 2017 QCRI (author: Ahmed Ali) +# Apache 2.0 +# This script prepares the dictionary. + +set -e +dir=data/local/dict +lexicon_url1="http://alt.qcri.org//resources/speech/dictionary/ar-ar_grapheme_lexicon_2016-02-09.bz2"; +lexicon_url2="http://alt.qcri.org//resources/speech/dictionary/ar-ar_lexicon_2014-03-17.txt.bz2"; +stage=0 +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh || exit 1; +mkdir -p $dir data/local/lexicon_data + +if [ $stage -le 0 ]; then + echo "$0: Downloading text for lexicon... $(date)." 
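+  # (For orientation: the downloads appear to be Buckwalter-encoded word lists;
+  # only the first column is kept, and together with the unique words of the
+  # training transcripts appended below they form
+  # data/local/lexicon_data/grapheme_lexicon, one word per line, from which
+  # grapheme pronunciations are generated by local/prepare_lexicon.py in the
+  # following block.)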
+ wget -P data/local/lexicon_data $lexicon_url1 + wget -P data/local/lexicon_data $lexicon_url2 + bzcat data/local/lexicon_data/ar-ar_grapheme_lexicon_2016-02-09.bz2 | sed '1,3d' | awk '{print $1}' > data/local/lexicon_data/grapheme_lexicon + bzcat data/local/lexicon_data/ar-ar_lexicon_2014-03-17.txt.bz2 | sed '1,3d' | awk '{print $1}' >> data/local/lexicon_data/grapheme_lexicon + cat data/train/text | cut -d ' ' -f 2- | tr -s " " "\n" | sort -u >> data/local/lexicon_data/grapheme_lexicon +fi + + +if [ $stage -le 0 ]; then + echo "$0: processing lexicon text and creating lexicon... $(date)." + # remove vowels and rare alef wasla + grep -v [0-9] data/local/lexicon_data/grapheme_lexicon | sed -e 's:[FNKaui\~o\`]::g' -e 's:{:}:g' | sort -u > data/local/lexicon_data/processed_lexicon + local/prepare_lexicon.py +fi + +cut -d' ' -f2- $dir/lexicon.txt | sed 's/SIL//g' | tr ' ' '\n' | sort -u | sed '/^$/d' >$dir/nonsilence_phones.txt || exit 1; + +sed -i '1i UNK' $dir/lexicon.txt + +echo UNK >> $dir/nonsilence_phones.txt + +echo ' SIL' >> $dir/lexicon.txt + +echo SIL > $dir/silence_phones.txt + +echo SIL >$dir/optional_silence.txt + +echo -n "" >$dir/extra_questions.txt + +echo "$0: Dictionary preparation succeeded" diff --git a/egs/gale_arabic/s5c/local/prepare_dict_subword.sh b/egs/gale_arabic/s5c/local/prepare_dict_subword.sh new file mode 100755 index 00000000000..14416e8587e --- /dev/null +++ b/egs/gale_arabic/s5c/local/prepare_dict_subword.sh @@ -0,0 +1,64 @@ +#!/usr/bin/env bash + +# Copyright 2017 QCRI (author: Ahmed Ali) +# 2019 Dongji Gao +# Apache 2.0 +# This script prepares the subword dictionary. + +set -e +dir=data/local/dict +lexicon_url1="http://alt.qcri.org//resources/speech/dictionary/ar-ar_grapheme_lexicon_2016-02-09.bz2"; +lexicon_url2="http://alt.qcri.org//resources/speech/dictionary/ar-ar_lexicon_2014-03-17.txt.bz2"; +num_merges=1000 +stage=0 +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh || exit 1; +mkdir -p $dir data/local/lexicon_data + +if [ $stage -le 0 ]; then + echo "$0: Downloading text for lexicon... $(date)." + wget -P data/local/lexicon_data $lexicon_url1 + wget -P data/local/lexicon_data $lexicon_url2 + bzcat data/local/lexicon_data/ar-ar_grapheme_lexicon_2016-02-09.bz2 | sed '1,3d' | awk '{print $1}' > data/local/lexicon_data/grapheme_lexicon + bzcat data/local/lexicon_data/ar-ar_lexicon_2014-03-17.txt.bz2 | sed '1,3d' | awk '{print $1}' >> data/local/lexicon_data/grapheme_lexicon + cat data/train/text | cut -d ' ' -f 2- | tr -s " " "\n" | sort -u >> data/local/lexicon_data/grapheme_lexicon +fi + + +if [ $stage -le 0 ]; then + echo "$0: processing lexicon text and creating lexicon... $(date)." + # remove vowels and rare alef wasla + grep -v "[0-9]" data/local/lexicon_data/grapheme_lexicon | sed -e 's:[FNKaui\~o\`]::g' -e 's:{:}:g' | sort -u > data/local/lexicon_data/processed_lexicon + local/prepare_lexicon.py +fi + +cut -d' ' -f2- $dir/lexicon.txt | sed 's/SIL//g' | tr ' ' '\n' | sort -u | sed '/^$/d' >$dir/nonsilence_phones.txt || exit 1; + +echo UNK >> $dir/nonsilence_phones.txt + +echo SIL > $dir/silence_phones.txt + +echo SIL >$dir/optional_silence.txt + +echo -n "" >$dir/extra_questions.txt + +# Make a subword lexicon based on current word lexicon +glossaries=" " +if [ $stage -le 0 ]; then + echo "$0: making subword lexicon... $(date)." 
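+  # (Sketch of what the commands below produce; the word and its segmentation
+  # are made-up examples: apply_bpe.py might split a word "ktAb" into the
+  # subwords "kt@@ Ab", and the per-character spelling step then yields
+  # lexicon entries of the form "kt@@ k t" and "Ab A b".)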
+ # get pair_code file + cut -d ' ' -f2- data/train/text | sed 's///g;s///g' | utils/lang/bpe/learn_bpe.py -s $num_merges > data/local/pair_code.txt + mv $dir/lexicon.txt $dir/lexicon_word.txt + # get words + cut -d ' ' -f1 $dir/lexicon_word.txt > $dir/words.txt + utils/lang/bpe/apply_bpe.py -c data/local/pair_code.txt --glossaries $glossaries < $dir/words.txt | \ + sed 's/ /\n/g' | sort -u > $dir/subwords.txt + sed 's/./& /g' $dir/subwords.txt | sed 's/@ @ //g' | sed 's/*/V/g' | paste -d ' ' $dir/subwords.txt - > $dir/lexicon.txt +fi + +sed -i '1i UNK' $dir/lexicon.txt + +echo ' SIL' >> $dir/lexicon.txt + +echo "$0: Dictionary preparation succeeded" diff --git a/egs/gale_arabic/s5c/local/prepare_lexicon.py b/egs/gale_arabic/s5c/local/prepare_lexicon.py new file mode 100755 index 00000000000..215541585eb --- /dev/null +++ b/egs/gale_arabic/s5c/local/prepare_lexicon.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python3 + +# Copyright 2018 Ashish Arora +# Apache 2.0 + +# This script prepares lexicon. + +import argparse +import os + +parser = argparse.ArgumentParser(description="""Creates the list of characters and words in lexicon""") +args = parser.parse_args() + +### main ### +lex = {} +text_path = os.path.join('data','local', 'lexicon_data', 'processed_lexicon') +with open(text_path, 'r', encoding='utf-8') as f: + for line in f: + line = line.strip() + characters = list(line) + characters = " ".join(['V' if char == '*' else char for char in characters]) + lex[line] = characters + +with open(os.path.join('data','local','dict', 'lexicon.txt'), 'w', encoding='utf-8') as fp: + for key in sorted(lex): + fp.write(key + " " + lex[key] + "\n") diff --git a/egs/gale_arabic/s5c/local/prepare_lm.sh b/egs/gale_arabic/s5c/local/prepare_lm.sh new file mode 100755 index 00000000000..6fdf35f471a --- /dev/null +++ b/egs/gale_arabic/s5c/local/prepare_lm.sh @@ -0,0 +1,51 @@ +#!/bin/bash + +# Copyright 2012 Vassil Panayotov +# 2017 Ewald Enzinger +# Apache 2.0 + +. ./path.sh || exit 1 + +echo "=== Building a language model ..." + +dir=data/local/lm/ +text=data/train/text +lexicon=data/local/dict/lexicon.txt +# Language model order +order=3 + +. utils/parse_options.sh + +# Prepare a LM training corpus from the transcripts +mkdir -p $dir + +for f in "$text" "$lexicon"; do + [ ! -f $f ] && echo "$0: No such file $f" && exit 1; +done + +loc=`which ngram-count`; +if [ -z $loc ]; then + if uname -a | grep 64 >/dev/null; then # some kind of 64 bit... + sdir=$KALDI_ROOT/tools/srilm/bin/i686-m64 + else + sdir=$KALDI_ROOT/tools/srilm/bin/i686 + fi + if [ -f $sdir/ngram-count ]; then + echo Using SRILM tools from $sdir + export PATH=$PATH:$sdir + else + echo You appear to not have SRILM tools installed, either on your path, + echo or installed in $sdir. See tools/install_srilm.sh for installation + echo instructions. + exit 1 + fi +fi + +cat data/train/text | cut -d " " -f 2- > $dir/text.txt +cut -d' ' -f1 $lexicon > $dir/wordlist + +ngram-count -text $dir/text.txt -order $order -limit-vocab -vocab $dir/wordlist \ + -unk -map-unk "" -kndiscount -interpolate -lm $dir/lm.gz + +#ngram -lm $dir/lm.gz -ppl $dir/dev.txt +echo "*** Finished building the LM model!" diff --git a/egs/gale_arabic/s5c/local/prepare_lm_subword.sh b/egs/gale_arabic/s5c/local/prepare_lm_subword.sh new file mode 100755 index 00000000000..a5d5c1d1c94 --- /dev/null +++ b/egs/gale_arabic/s5c/local/prepare_lm_subword.sh @@ -0,0 +1,53 @@ +#!/bin/bash + +# Copyright 2012 Vassil Panayotov +# 2017 Ewald Enzinger +# 2019 Dongji Gao +# Apache 2.0 + +. 
./path.sh || exit 1 + +echo "=== Building a language model ..." + +dir=data/local/lm/ +text=data/train/text +lexicon=data/local/dict/lexicon.txt +# Language model order +order=6 + +. utils/parse_options.sh + +# Prepare a LM training corpus from the transcripts +mkdir -p $dir + +for f in "$text" "$lexicon"; do + [ ! -f $f ] && echo "$0: No such file $f" && exit 1; +done + +loc=`which ngram-count`; +if [ -z $loc ]; then + if uname -a | grep 64 >/dev/null; then # some kind of 64 bit... + sdir=$KALDI_ROOT/tools/srilm/bin/i686-m64 + else + sdir=$KALDI_ROOT/tools/srilm/bin/i686 + fi + if [ -f $sdir/ngram-count ]; then + echo Using SRILM tools from $sdir + export PATH=$PATH:$sdir + else + echo You appear to not have SRILM tools installed, either on your path, + echo or installed in $sdir. See tools/install_srilm.sh for installation + echo instructions. + exit 1 + fi +fi + +cat data/train/text | cut -d " " -f 2- > $dir/text.txt +cat data/test/text | cut -d ' ' -f2- > $dir/dev.txt +cut -d' ' -f1 $lexicon > $dir/wordlist + +ngram-count -text $dir/text.txt -order $order -vocab $dir/wordlist \ + -unk -map-unk "" -wbdiscount1 -kndiscount2 -kndiscount3 -kndiscount4 -kndiscount5 -kndiscount6 -interpolate -lm $dir/lm.gz + +ngram -order $order -lm $dir/lm.gz -ppl $dir/dev.txt +echo "*** Finished building the LM model!" diff --git a/egs/gale_arabic/s5c/local/score.sh b/egs/gale_arabic/s5c/local/score.sh new file mode 100755 index 00000000000..1d84815fc69 --- /dev/null +++ b/egs/gale_arabic/s5c/local/score.sh @@ -0,0 +1,6 @@ + +#!/bin/bash + + +steps/scoring/score_kaldi_wer.sh "$@" +steps/scoring/score_kaldi_cer.sh --stage 2 "$@" diff --git a/egs/gale_arabic/s5c/local/split_wer.sh b/egs/gale_arabic/s5c/local/split_wer.sh new file mode 100755 index 00000000000..d83a0f79e8c --- /dev/null +++ b/egs/gale_arabic/s5c/local/split_wer.sh @@ -0,0 +1,72 @@ +#!/bin/bash + +# Report WER for reports and conversational +# Copyright 2014 QCRI (author: Ahmed Ali) +# Apache 2.0 + +if [ $# -ne 1 ]; then + echo "Arguments should be the gale folder, see ../run.sh for example." + exit 1; +fi + +[ -f ./path.sh ] && . 
./path.sh + + +galeFolder=$(utils/make_absolute.sh $1) +symtab=./data/lang/words.txt +find exp/ -maxdepth 3 -type d -name decode\* > list_decode$$ + +#split the test set per type: +awk '{print $2}' $galeFolder/all.test | sort -u > $galeFolder/test_id$$ + +# generate the report test set +awk '{print $2}' $galeFolder/report | sort -u > $galeFolder/report_id$$ +comm -1 -2 $galeFolder/test_id$$ $galeFolder/report_id$$ > $galeFolder/report.test + +# generate the conversational test set +awk '{print $2}' $galeFolder/conversational | sort -u > $galeFolder/conversational_id$$ + +comm -1 -2 $galeFolder/test_id$$ $galeFolder/conversational_id$$ > $galeFolder/conversational.test + +rm -fr $galeFolder/test_id$$ $galeFolder/report_id$$ $galeFolder/conversational_id$$ + +min_lmwt=7 +max_lmwt=20 +cat list_decode$$ | while read dir; do + for type in report conversational; do + #echo "Processing: $dir $type" + rm -fr $dir/scoring_$type + cp -pr $dir/scoring $dir/scoring_$type + ( cd $dir/scoring_$type; + for x in *.tra test_filt.txt; do + sort -u $x > tmp$$ + join tmp$$ $galeFolder/${type}.test > $x + rm -fr tmp$$ + done + ) + +utils/run.pl LMWT=$min_lmwt:$max_lmwt $dir/scoring_$type/log/score.LMWT.log \ + cat $dir/scoring_${type}/LMWT.tra \| \ + utils/int2sym.pl -f 2- $symtab \| sed 's:\::g' \| \ + compute-wer --text --mode=present \ + ark:$dir/scoring_${type}/test_filt.txt ark,p:- ">&" $dir/wer_${type}_LMWT +done +done + + +time=$(date +"%Y-%m-%d-%H-%M-%S") +echo "RESULTS generated by $USER at $time" + +echo "Report Results WER:" +cat list_decode$$ | while read x; do [ -d $x ] && grep WER $x/wer_report_* | utils/best_wer.sh; done | sort -n -k2 + +echo "Conversational Results WER:" +cat list_decode$$ | while read x; do [ -d $x ] && grep WER $x/wer_conversational_* | utils/best_wer.sh; done | sort -n -k2 + +echo "Combined Results for Reports and Conversational WER:" +cat list_decode$$ | while read x; do [ -d $x ] && grep WER $x/wer_?? $x/wer_?| utils/best_wer.sh; done | sort -n -k2 + +rm list_decode$$ + + + diff --git a/egs/gale_arabic/s5c/local/test_list b/egs/gale_arabic/s5c/local/test_list new file mode 100644 index 00000000000..d82cf498804 --- /dev/null +++ b/egs/gale_arabic/s5c/local/test_list @@ -0,0 +1,11 @@ +ALAM_WITHEVENT_ARB_20070116_205800 +ALAM_WITHEVENT_ARB_20070130_205800 +ALAM_WITHEVENT_ARB_20070206_205801 +ALAM_WITHEVENT_ARB_20070213_205800 +ALAM_WITHEVENT_ARB_20070227_205800 +ALAM_WITHEVENT_ARB_20070306_205800 +ALAM_WITHEVENT_ARB_20070313_205800 +ARABIYA_FROMIRAQ_ARB_20070216_175800 +ARABIYA_FROMIRAQ_ARB_20070223_175801 +ARABIYA_FROMIRAQ_ARB_20070302_175801 +ARABIYA_FROMIRAQ_ARB_20070309_175800 diff --git a/egs/gale_arabic/s5c/local/wer_output_filter b/egs/gale_arabic/s5c/local/wer_output_filter new file mode 100755 index 00000000000..fcd40539e7f --- /dev/null +++ b/egs/gale_arabic/s5c/local/wer_output_filter @@ -0,0 +1,4 @@ +#!/bin/sed -f +s/@@ //g +s///g +s///g diff --git a/egs/gale_arabic/s5c/path.sh b/egs/gale_arabic/s5c/path.sh new file mode 100755 index 00000000000..be11b34cbc6 --- /dev/null +++ b/egs/gale_arabic/s5c/path.sh @@ -0,0 +1,5 @@ +export KALDI_ROOT=$(pwd)/../../.. +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. 
$KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C diff --git a/egs/gale_arabic/s5c/run.sh b/egs/gale_arabic/s5c/run.sh new file mode 100755 index 00000000000..3e363816812 --- /dev/null +++ b/egs/gale_arabic/s5c/run.sh @@ -0,0 +1,131 @@ +#!/bin/bash -e + +# Copyright 2014 QCRI (author: Ahmed Ali) +# 2019 Dongji Gao +# Apache 2.0 + +# This is an example script for subword implementation + +num_jobs=120 +num_decode_jobs=40 +decode_gmm=true +stage=0 +overwrite=false +num_merges=1000 + +dir1=/export/corpora/LDC/LDC2013S02/ +dir2=/export/corpora/LDC/LDC2013S07/ +dir3=/export/corpora/LDC/LDC2014S07/ +text1=/export/corpora/LDC/LDC2013T17/ +text2=/export/corpora/LDC/LDC2013T04/ +text3=/export/corpora/LDC/LDC2014T17/ + +galeData=GALE +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. +. ./path.sh +. ./utils/parse_options.sh # e.g. this parses the above options + # if supplied. + +if [ $stage -le 0 ]; then + + if [ -f data/train/text ] && ! $overwrite; then + echo "$0: Not processing, probably script have run from wrong stage" + echo "Exiting with status 1 to avoid data corruption" + exit 1; + fi + + echo "$0: preparing data..." + local/prepare_data.sh --dir1 $dir1 --dir2 $dir2 --dir3 $dir3 \ + --text1 $text1 --text2 $text2 --text3 $text3 + + echo "$0: Preparing lexicon and LM..." + local/prepare_dict_subword.sh --num_merges $num_merges + + utils/subword/prepare_lang_subword.sh data/local/dict "" data/local/lang data/lang + + for set in train test; do + utils/subword/prepare_subword_text.sh data/$set/text data/local/pair_code.txt data/$set/text + done + + local/prepare_lm_subword.sh + + utils/format_lm.sh data/lang data/local/lm/lm.gz \ + data/local/dict/lexicon.txt data/lang_test +fi + +mfccdir=mfcc +if [ $stage -le 1 ]; then + echo "$0: Preparing the test and train feature files..." 
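+  # (Optional sanity checks you can run by hand after this stage; they only
+  # inspect the generated features and data directories:
+  #   feat-to-dim scp:data/train/feats.scp -
+  #   utils/validate_data_dir.sh data/train
+  # )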
+ for x in train test ; do + steps/make_mfcc.sh --cmd "$train_cmd" --nj $num_jobs \ + data/$x exp/make_mfcc/$x $mfccdir + utils/fix_data_dir.sh data/$x # some files fail to get mfcc for many reasons + steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir + done +fi + +if [ $stage -le 2 ]; then + echo "$0: creating sub-set and training monophone system" + utils/subset_data_dir.sh data/train 10000 data/train.10K || exit 1; + + steps/train_mono.sh --nj 40 --cmd "$train_cmd" \ + data/train.10K data/lang exp/mono_subword || exit 1; +fi + +if [ $stage -le 3 ]; then + echo "$0: Aligning data using monophone system" + steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \ + data/train data/lang exp/mono_subword exp/mono_ali_subword || exit 1; + + echo "$0: training triphone system with delta features" + steps/train_deltas.sh --cmd "$train_cmd" \ + 2500 30000 data/train data/lang exp/mono_ali_subword exp/tri1_subword || exit 1; +fi + +if [ $stage -le 4 ] && $decode_gmm; then + utils/mkgraph.sh data/lang_test exp/tri1_subword exp/tri1_subword/graph + steps/decode.sh --nj $num_decode_jobs --cmd "$decode_cmd" \ + exp/tri1_subword/graph data/test exp/tri1_subword/decode +fi + +if [ $stage -le 5 ]; then + echo "$0: Aligning data and retraining and realigning with lda_mllt" + steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \ + data/train data/lang exp/tri1_subword exp/tri1_ali_subword || exit 1; + + steps/train_lda_mllt.sh --cmd "$train_cmd" 4000 50000 \ + data/train data/lang exp/tri1_ali_subword exp/tri2b_subword || exit 1; +fi + +if [ $stage -le 6 ] && $decode_gmm; then + utils/mkgraph.sh data/lang_test exp/tri2b_subword exp/tri2b_subword/graph + steps/decode.sh --nj $num_decode_jobs --cmd "$decode_cmd" \ + exp/tri2b_subword/graph data/test exp/tri2b_subword/decode +fi + +if [ $stage -le 7 ]; then + echo "$0: Aligning data and retraining and realigning with sat_basis" + steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \ + data/train data/lang exp/tri2b_subword exp/tri2b_ali_subword || exit 1; + + steps/train_sat_basis.sh --cmd "$train_cmd" \ + 5000 100000 data/train data/lang exp/tri2b_ali_subword exp/tri3b_subword || exit 1; + + steps/align_fmllr.sh --nj $num_jobs --cmd "$train_cmd" \ + data/train data/lang exp/tri3b_subword exp/tri3b_ali_subword || exit 1; +fi + +if [ $stage -le 8 ] && $decode_gmm; then + utils/mkgraph.sh data/lang_test exp/tri3b_subword exp/tri3b_subword/graph + steps/decode_fmllr.sh --nj $num_decode_jobs --cmd \ + "$decode_cmd" exp/tri3b_subword/graph data/test exp/tri3b_subword/decode +fi + +if [ $stage -le 9 ]; then + echo "$0: Training a regular chain model using the e2e alignments..." 
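+  # local/chain/run_tdnn.sh is a symlink to tuning/run_tdnn_1a.sh; it runs the
+  # speed-perturbation and i-vector preparation (local/nnet3/run_ivector_common.sh)
+  # and the chain-specific stages (local/chain/run_chain_common.sh) before
+  # training and decoding the TDNN-F model.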
+ local/chain/run_tdnn.sh --gmm tri3b_subword +fi + +echo "$0: training succeed" +exit 0 diff --git a/egs/gale_arabic/s5c/steps b/egs/gale_arabic/s5c/steps new file mode 120000 index 00000000000..1b186770dd1 --- /dev/null +++ b/egs/gale_arabic/s5c/steps @@ -0,0 +1 @@ +../../wsj/s5/steps/ \ No newline at end of file diff --git a/egs/gale_arabic/s5c/utils b/egs/gale_arabic/s5c/utils new file mode 120000 index 00000000000..a3279dc8679 --- /dev/null +++ b/egs/gale_arabic/s5c/utils @@ -0,0 +1 @@ +../../wsj/s5/utils/ \ No newline at end of file diff --git a/egs/gale_mandarin/s5/local/gale_prep_dict.sh b/egs/gale_mandarin/s5/local/gale_prep_dict.sh index 2e2810bb713..c6a80240754 100755 --- a/egs/gale_mandarin/s5/local/gale_prep_dict.sh +++ b/egs/gale_mandarin/s5/local/gale_prep_dict.sh @@ -130,7 +130,9 @@ unset LC_ALL # are equal cat $dict_dir/ch-dict.txt |\ perl -e ' - use encoding utf8; + use utf8; + binmode(STDIN,":encoding(utf8)"); + binmode(STDOUT,":encoding(utf8)"); while () { @A = split(" ", $_); $word_len = length($A[0]); @@ -299,4 +301,3 @@ cat $dict_dir/nonsilence_phones.txt | perl -e 'while(<>){ foreach $p (split(" ", export LC_ALL=C echo "$0: Done" - diff --git a/egs/gale_mandarin/s5/local/gale_segment.py b/egs/gale_mandarin/s5/local/gale_segment.py index 975ddb9c143..d652eb837f3 100755 --- a/egs/gale_mandarin/s5/local/gale_segment.py +++ b/egs/gale_mandarin/s5/local/gale_segment.py @@ -1,6 +1,7 @@ #!/usr/bin/env python #coding:utf-8 #!/usr/bin/env python +from __future__ import print_function import sys from mmseg import seg_txt for line in sys.stdin: @@ -12,4 +13,4 @@ continue for j in seg_txt(blks[i]): out_line += " " + j - print out_line + print(out_line) diff --git a/egs/gop/README.md b/egs/gop/README.md new file mode 100644 index 00000000000..d95f4e966fd --- /dev/null +++ b/egs/gop/README.md @@ -0,0 +1,98 @@ +There is a copy of this document on Google Docs, which renders the equations better: +[link](https://docs.google.com/document/d/1pie-PU6u2NZZC_FzocBGGm6mpfBJMiCft9UoG0uA1kA/edit?usp=sharing) + +* * * + +# GOP on Kaldi + +The Goodness of Pronunciation (GOP) is a variation of the posterior probability, for phone level pronunciation scoring. +GOP is widely used in pronunciation evaluation and mispronunciation detection tasks. + +This implementation is mainly based on the following paper: + +Hu, W., Qian, Y., Soong, F. K., & Wang, Y. (2015). Improved mispronunciation detection with deep neural network trained acoustic models and transfer learning based logistic regression classifiers. Speech Communication, 67(January), 154-166. + +## GOP-GMM + +In the conventional GMM-HMM based system, GOP was first proposed in (Witt et al., 2000). It was defined as the duration normalised log of the posterior: + +$$ +GOP(p)=\frac{1}{t_e-t_s+1} \log p(p|\mathbf o) +$$ + +where $\mathbf o$ is the input observations, $p$ is the canonical phone, $t_s, t_e$ are the start and end frame indexes. + +Assuming $p(q_i)\approx p(q_j)$ for any $q_i, q_j$, we have: + +$$ +\log p(p|\mathbf o)=\frac{p(\mathbf o|p)p(p)}{\sum_{q\in Q} p(\mathbf o|q)p(q)} + \approx\frac{p(\mathbf o|p)}{\sum_{q\in Q} p(\mathbf o|q)} +$$ + +where $Q$ is the whole phone set. + +The numerator of the equation is calculated from forced alignment result and the denominator is calculated from an Viterbi decoding with a unconstrained phone loop. + +We do not implement GOP-GMM for Kaldi, as GOP-NN performs much better than GOP-GMM. + +## GOP-NN + +The definition of GOP-NN is a bit different from the GOP-GMM. 
GOP-NN was defined as the log phone posterior ratio between the canonical phone and the one with the highest score (Hu et al., 2015). + +Firstly we define Log Phone Posterior (LPP): + +$$ +LPP(p)=\log p(p|\mathbf o; t_s,t_e) +$$ + +Then we define the GOP-NN using LPP: + +$$ +GOP(p)=\log \frac{LPP(p)}{\max_{q\in Q} LPP(q)} +$$ + +LPP could be calculated as: + +$$ +LPP(p) \approx \frac{1}{t_e-t_s+1} \sum_{t=t_s}^{t_e}\log p(p|o_t) +$$ + +$$ +p(p|o_t) = \sum_{s \in p} p(s|o_t) +$$ + +where $s$ is the senone label, $\{s|s \in p\}$ is the states belonging to those triphones whose current phone is $p$. + +## Phone-level Feature + +Normally the classifier-based approach archives better performance than GOP-based approach. + +Different from GOP based method, an extra supervised training process is needed. The input features for supervised training are phone-level, segmental features. The phone-level feature is defined as: + +$$ +{[LPP(p_1),\cdots,LPP(p_M), LPR(p_1|p_i), \cdots, LPR(p_j|p_i),\cdots]}^T +$$ + +where the Log Posterior Ratio (LPR) between phone $p_j$ and $p_i$ is defined as: + +$$ +LPR(p_j|p_i) = \log p(p_j|\mathbf o; t_s, t_e) - \log p(p_i|\mathbf o; t_s, t_e) +$$ + +## Implementation + +This implementation consists of a executable binary `bin/compute-gop` and some scripts. + +`compute-gop` computes GOP and extracts phone-level features using nnet output probabilities. +The output probabilities are assumed to be from a log-softmax layer. + +The script `run.sh` shows a typical pipeline based on librispeech's model and data. + +In Hu's paper, GOP was computed using a feed-forward DNN. +We have tried to use the output-xent of a chain model to compute GOP, but the result was not good. +We guess the HMM topo of chain model may not fit for GOP. + +The nnet3's TDNN (no chain) model performs well in GOP computing, so this recipe uses it. + +## Acknowledgement +The author of this recipe would like to thank Xingyu Na for his works of model tuning and his helpful suggestions. diff --git a/egs/gop/s5/cmd.sh b/egs/gop/s5/cmd.sh new file mode 100644 index 00000000000..9139633e57a --- /dev/null +++ b/egs/gop/s5/cmd.sh @@ -0,0 +1,13 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
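+# On a grid you would typically use something like
+#   export cmd="queue.pl --mem 4G"
+# instead of run.pl below (the memory value here is only an example).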
+ +export cmd="run.pl" diff --git a/egs/gop/s5/local/make_testcase.sh b/egs/gop/s5/local/make_testcase.sh new file mode 100755 index 00000000000..884563066b1 --- /dev/null +++ b/egs/gop/s5/local/make_testcase.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +src=$1 +dst=$2 + +# Select a very small set for testing +utils/subset_data_dir.sh --shortest $src 10 $dst + +# make fake transcripts as negative examples +cp $dst/text $dst/text.ori +sed -i "s/ THERE / THOSE /" $dst/text +sed -i "s/ IN / ON /" $dst/text diff --git a/egs/gop/s5/local/remove_phone_markers.pl b/egs/gop/s5/local/remove_phone_markers.pl new file mode 100755 index 00000000000..16236a749cf --- /dev/null +++ b/egs/gop/s5/local/remove_phone_markers.pl @@ -0,0 +1,72 @@ +#!/usr/bin/env perl +# Copyright 2019 Junbo Zhang + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + +use strict; +use warnings; + +my $Usage = <new phone mapping file, in which each line is: "old-integer-id new-integer-id. + +Usage: utils/remove_phone_markers.pl + e.g.: utils/remove_phone_markers.pl phones.txt phones-pure.txt phone-to-pure-phone.int +EOU + +if (@ARGV < 3) { + die $Usage; +} + +my $old_phone_symbols_filename = shift @ARGV; +my $new_phone_symbols_filename = shift @ARGV; +my $mapping_filename = shift @ARGV; + +my %id_of_old_phone; +open(IN, $old_phone_symbols_filename) or die "Can't open $old_phone_symbols_filename"; +while () { + chomp; + my ($phone, $id) = split; + next if $phone =~ /\#/; + $id_of_old_phone{$phone} = $id; +} +close IN; + +my $new_id = 0; +my %id_of_new_phone; +my %id_old_to_new; +foreach (sort { $id_of_old_phone{$a} <=> $id_of_old_phone{$b} } keys %id_of_old_phone) { + my $old_phone = $_; + s/_[BIES]//; + s/\d//; + my $new_phone = $_; + $id_of_new_phone{$new_phone} = $new_id++ if not exists $id_of_new_phone{$new_phone}; + $id_old_to_new{$id_of_old_phone{$old_phone}} = $id_of_new_phone{$new_phone}; +} + +# Write to file +open(OUT, ">$new_phone_symbols_filename") or die "Can\'t write to $new_phone_symbols_filename"; +foreach (sort { $id_of_new_phone{$a} <=> $id_of_new_phone{$b} } keys %id_of_new_phone) { + print OUT "$_\t$id_of_new_phone{$_}\n"; +} +close OUT; + +open(OUT, ">$mapping_filename") or die "Can\'t write to $mapping_filename"; +foreach (sort { $a <=> $b } keys %id_old_to_new) { + next if $_ == 0; + print OUT "$_ $id_old_to_new{$_}\n"; +} +close OUT; diff --git a/egs/gop/s5/path.sh b/egs/gop/s5/path.sh new file mode 100755 index 00000000000..03df6dd9f2b --- /dev/null +++ b/egs/gop/s5/path.sh @@ -0,0 +1,27 @@ +export KALDI_ROOT=`pwd`/../../.. +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. 
$KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C + +# we use this both in the (optional) LM training and the G2P-related scripts +PYTHON='python2.7' + +### Below are the paths used by the optional parts of the recipe + +# We only need the Festival stuff below for the optional text normalization(for LM-training) step +FEST_ROOT=tools/festival +NSW_PATH=${FEST_ROOT}/festival/bin:${FEST_ROOT}/nsw/bin +export PATH=$PATH:$NSW_PATH + +# SRILM is needed for LM model building +SRILM_ROOT=$KALDI_ROOT/tools/srilm +SRILM_PATH=$SRILM_ROOT/bin:$SRILM_ROOT/bin/i686-m64 +export PATH=$PATH:$SRILM_PATH + +# Sequitur G2P executable +sequitur=$KALDI_ROOT/tools/sequitur/g2p.py +sequitur_path="$(dirname $sequitur)/lib/$PYTHON/site-packages" + +# Directory under which the LM training corpus should be extracted +LM_CORPUS_ROOT=./lm-corpus diff --git a/egs/gop/s5/run.sh b/egs/gop/s5/run.sh new file mode 100755 index 00000000000..a731b913552 --- /dev/null +++ b/egs/gop/s5/run.sh @@ -0,0 +1,102 @@ +#!/bin/bash + +# Copyright 2019 Junbo Zhang +# Apache 2.0 + +# This script shows how to calculate Goodness of Pronunciation (GOP) and +# extract phone-level pronunciation feature for mispronunciations detection +# tasks. Read ../README.md or the following paper for details: +# +# "Hu et al., Improved mispronunciation detection with deep neural network +# trained acoustic models and transfer learning based logistic regression +# classifiers, 2015." + +# You might not want to do this for interactive shells. +set -e + +# Before running this recipe, you have to run the librispeech recipe firstly. +# This script assumes the following paths exist. +librispeech_eg=../../librispeech/s5 +model=$librispeech_eg/exp/nnet3_cleaned/tdnn_sp +ivector=$librispeech_eg/exp/nnet3_cleaned/ivectors_test_clean_hires +lang=$librispeech_eg/data/lang +test_data=$librispeech_eg/data/test_clean_hires + +for d in $model $ivector $lang $test_data; do + [ ! -d $d ] && echo "$0: no such path $d" && exit 1; +done + +# Global configurations +stage=0 +nj=4 + +data=test_10short +dir=exp/gop_$data + +. ./cmd.sh +. ./path.sh +. parse_options.sh + +if [ $stage -le 0 ]; then + # Prepare test data + [ -d data ] || mkdir -p data/$data + local/make_testcase.sh $test_data data/$data +fi + +if [ $stage -le 1 ]; then + # Compute Log-likelihoods + steps/nnet3/compute_output.sh --cmd "$cmd" --nj $nj \ + --online-ivector-dir $ivector data/$data $model exp/probs_$data +fi + +if [ $stage -le 2 ]; then + steps/nnet3/align.sh --cmd "$cmd" --nj $nj --use_gpu false \ + --online_ivector_dir $ivector data/$data $lang $model $dir +fi + +if [ $stage -le 3 ]; then + # make a map which converts phones to "pure-phones" + # "pure-phone" means the phone whose stress and pos-in-word markers are ignored + # eg. AE1_B --> AE, EH2_S --> EH, SIL --> SIL + local/remove_phone_markers.pl $lang/phones.txt $dir/phones-pure.txt \ + $dir/phone-to-pure-phone.int + + # Convert transition-id to pure-phone id + $cmd JOB=1:$nj $dir/log/ali_to_phones.JOB.log \ + ali-to-phones --per-frame=true $model/final.mdl "ark,t:gunzip -c $dir/ali.JOB.gz|" \ + "ark,t:-" \| utils/apply_map.pl -f 2- $dir/phone-to-pure-phone.int \| \ + gzip -c \>$dir/ali-pure-phone.JOB.gz || exit 1; +fi + +if [ $stage -le 4 ]; then + # The outputs of the binary compute-gop are the GOPs and the phone-level features. 
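+  # Both archives are written as text ("ark,t:..."), so once this stage has run
+  # they can be inspected directly, e.g.:
+  #   head -n 1 $dir/gop.1.txt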
+ # + # An example of the GOP result (extracted from "ark,t:$dir/gop.3.txt"): + # 4446-2273-0031 [ 1 0 ] [ 12 0 ] [ 27 -5.382001 ] [ 40 -13.91807 ] [ 1 -0.2555897 ] \ + # [ 21 -0.2897284 ] [ 5 0 ] [ 31 0 ] [ 33 0 ] [ 3 -11.43557 ] [ 25 0 ] \ + # [ 16 0 ] [ 30 -0.03224623 ] [ 5 0 ] [ 25 0 ] [ 33 0 ] [ 1 0 ] + # It is in the posterior format, where each pair stands for [pure-phone-index gop-value]. + # For example, [ 27 -5.382001 ] means the GOP of the pure-phone 27 (it corresponds to the + # phone "OW", according to "$dir/phones-pure.txt") is -5.382001, indicating the audio + # segment of this phone should be a mispronunciation. + # + # The phone-level features are in matrix format: + # 4446-2273-0031 [ -0.2462088 -10.20292 -11.35369 ... + # -8.584108 -7.629755 -13.04877 ... + # ... + # ... ] + # The row number is the phone number of the utterance. In this case, it is 17. + # The column number is 2 * (pure-phone set size), as the feature is consist of LLR + LPR. + # The phone-level features can be used to train a classifier with human labels. See Hu's + # paper for detail. + $cmd JOB=1:$nj $dir/log/compute_gop.JOB.log \ + compute-gop --phone-map=$dir/phone-to-pure-phone.int $model/final.mdl \ + "ark,t:gunzip -c $dir/ali-pure-phone.JOB.gz|" \ + "ark:exp/probs_$data/output.JOB.ark" \ + "ark,t:$dir/gop.JOB.txt" "ark,t:$dir/phonefeat.JOB.txt" || exit 1; + echo "Done compute-gop, the results: \"$dir/gop..txt\" in posterior format." + + # We set -5 as a universal empirical threshold here. You can also determine multiple phone + # dependent thresholds based on the human-labeled mispronunciation data. + echo "The phones whose gop values less than -5 could be treated as mispronunciations." +fi diff --git a/egs/gop/s5/steps b/egs/gop/s5/steps new file mode 120000 index 00000000000..6e99bf5b5ad --- /dev/null +++ b/egs/gop/s5/steps @@ -0,0 +1 @@ +../../wsj/s5/steps \ No newline at end of file diff --git a/egs/gop/s5/utils b/egs/gop/s5/utils new file mode 120000 index 00000000000..b240885218f --- /dev/null +++ b/egs/gop/s5/utils @@ -0,0 +1 @@ +../../wsj/s5/utils \ No newline at end of file diff --git a/egs/gp/s1/local/gp_convert_audio.sh b/egs/gp/s1/local/gp_convert_audio.sh index a7c2d7285c4..b3db909c9b6 100755 --- a/egs/gp/s1/local/gp_convert_audio.sh +++ b/egs/gp/s1/local/gp_convert_audio.sh @@ -108,4 +108,4 @@ done < "$INLIST" echo "sox: error converting following $nsoxerr file(s):" >&2 [ -f "$soxerr" ] && cat "$soxerr" >&2 -exit 0; \ No newline at end of file +exit 0; diff --git a/egs/gp/s1/utils/mkgraph.sh b/egs/gp/s1/utils/mkgraph.sh index 2e45296593b..3aba742832d 100755 --- a/egs/gp/s1/utils/mkgraph.sh +++ b/egs/gp/s1/utils/mkgraph.sh @@ -131,4 +131,4 @@ cp $lang/silphones.csl $dir/ # to make const fst: # fstconvert --fst_type=const $dir/HCLG.fst $dir/HCLG_c.fst -echo "Finished making decoding graphs in $dir" \ No newline at end of file +echo "Finished making decoding graphs in $dir" diff --git a/egs/heroico/s5/RESULTS b/egs/heroico/s5/RESULTS index 9717e95e6e2..7942c03b1d9 100644 --- a/egs/heroico/s5/RESULTS +++ b/egs/heroico/s5/RESULTS @@ -1,22 +1,48 @@ # for dir in $(echo exp/tri*/decode* | grep -v 'si/'); do grep WER $dir/wer* | utils/best_wer.sh; done -%WER 67.01 [ 5126 / 7650, 837 ins, 575 del, 3714 sub ] exp/tri1/decode_devtest/wer_14_1.0 -%WER 62.39 [ 4678 / 7498, 768 ins, 397 del, 3513 sub ] exp/tri1/decode_native/wer_13_1.0 -%WER 67.05 [ 6179 / 9215, 895 ins, 606 del, 4678 sub ] exp/tri1/decode_nonnative/wer_13_1.0 -%WER 64.97 [ 10859 / 16713, 1678 ins, 999 del, 8182 sub ] 
exp/tri1/decode_test/wer_13_1.0 -%WER 65.90 [ 5041 / 7650, 1016 ins, 416 del, 3609 sub ] exp/tri2b/decode_devtest/wer_12_1.0 -%WER 61.26 [ 4593 / 7498, 908 ins, 300 del, 3385 sub ] exp/tri2b/decode_native/wer_14_1.0 -%WER 67.51 [ 6221 / 9215, 1085 ins, 524 del, 4612 sub ] exp/tri2b/decode_nonnative/wer_14_1.0 -%WER 64.87 [ 10842 / 16713, 2004 ins, 838 del, 8000 sub ] exp/tri2b/decode_test/wer_14_1.0 -%WER 66.09 [ 5056 / 7650, 1078 ins, 402 del, 3576 sub ] exp/tri3b/decode_devtest/wer_16_1.0 -%WER 74.88 [ 5728 / 7650, 1210 ins, 426 del, 4092 sub ] exp/tri3b/decode_devtest.si/wer_15_1.0 -%WER 61.19 [ 4588 / 7498, 1038 ins, 255 del, 3295 sub ] exp/tri3b/decode_native/wer_14_1.0 -%WER 70.99 [ 5323 / 7498, 1185 ins, 301 del, 3837 sub ] exp/tri3b/decode_native.si/wer_16_1.0 -%WER 66.35 [ 6114 / 9215, 1186 ins, 421 del, 4507 sub ] exp/tri3b/decode_nonnative/wer_17_1.0 -%WER 76.36 [ 7037 / 9215, 1420 ins, 467 del, 5150 sub ] exp/tri3b/decode_nonnative.si/wer_16_1.0 -%WER 64.06 [ 10706 / 16713, 2245 ins, 657 del, 7804 sub ] exp/tri3b/decode_test/wer_15_1.0 -%WER 73.97 [ 12362 / 16713, 2608 ins, 766 del, 8988 sub ] exp/tri3b/decode_test.si/wer_16_1.0 -%WER 53.07 [ 4060 / 7650, 744 ins, 376 del, 2940 sub ] exp/chain/tdnn1e_sp/decode_devtest/wer_7_1.0 -%WER 54.47 [ 4084 / 7498, 536 ins, 475 del, 3073 sub ] exp/chain/tdnn1e_sp/decode_native/wer_7_1.0 -%WER 63.01 [ 5806 / 9215, 685 ins, 784 del, 4337 sub ] exp/chain/tdnn1e_sp/decode_nonnative/wer_7_1.0 -%WER 59.25 [ 9903 / 16713, 1226 ins, 1259 del, 7418 sub ] exp/chain/tdnn1e_sp/decode_test/wer_7_1.0 +# old results before adding Movie subtitles text corpus in LM training: +# %WER 67.01 [ 5126 / 7650, 837 ins, 575 del, 3714 sub ] exp/tri1/decode_devtest/wer_14_1.0 +# %WER 62.39 [ 4678 / 7498, 768 ins, 397 del, 3513 sub ] exp/tri1/decode_native/wer_13_1.0 +# %WER 67.05 [ 6179 / 9215, 895 ins, 606 del, 4678 sub ] exp/tri1/decode_nonnative/wer_13_1.0 +# %WER 64.97 [ 10859 / 16713, 1678 ins, 999 del, 8182 sub ] exp/tri1/decode_test/wer_13_1.0 +# %WER 65.90 [ 5041 / 7650, 1016 ins, 416 del, 3609 sub ] exp/tri2b/decode_devtest/wer_12_1.0 +# %WER 61.26 [ 4593 / 7498, 908 ins, 300 del, 3385 sub ] exp/tri2b/decode_native/wer_14_1.0 +# %WER 67.51 [ 6221 / 9215, 1085 ins, 524 del, 4612 sub ] exp/tri2b/decode_nonnative/wer_14_1.0 +# %WER 64.87 [ 10842 / 16713, 2004 ins, 838 del, 8000 sub ] exp/tri2b/decode_test/wer_14_1.0 +# %WER 66.09 [ 5056 / 7650, 1078 ins, 402 del, 3576 sub ] exp/tri3b/decode_devtest/wer_16_1.0 +# %WER 74.88 [ 5728 / 7650, 1210 ins, 426 del, 4092 sub ] exp/tri3b/decode_devtest.si/wer_15_1.0 +# %WER 61.19 [ 4588 / 7498, 1038 ins, 255 del, 3295 sub ] exp/tri3b/decode_native/wer_14_1.0 +# %WER 70.99 [ 5323 / 7498, 1185 ins, 301 del, 3837 sub ] exp/tri3b/decode_native.si/wer_16_1.0 +# %WER 66.35 [ 6114 / 9215, 1186 ins, 421 del, 4507 sub ] exp/tri3b/decode_nonnative/wer_17_1.0 +# %WER 76.36 [ 7037 / 9215, 1420 ins, 467 del, 5150 sub ] exp/tri3b/decode_nonnative.si/wer_16_1.0 +# %WER 64.06 [ 10706 / 16713, 2245 ins, 657 del, 7804 sub ] exp/tri3b/decode_test/wer_15_1.0 +# %WER 73.97 [ 12362 / 16713, 2608 ins, 766 del, 8988 sub ] exp/tri3b/decode_test.si/wer_16_1.0 +# %WER 53.07 [ 4060 / 7650, 744 ins, 376 del, 2940 sub ] exp/chain/tdnn1e_sp/decode_devtest/wer_7_1.0 +# %WER 54.47 [ 4084 / 7498, 536 ins, 475 del, 3073 sub ] exp/chain/tdnn1e_sp/decode_native/wer_7_1.0 +# %WER 63.01 [ 5806 / 9215, 685 ins, 784 del, 4337 sub ] exp/chain/tdnn1e_sp/decode_nonnative/wer_7_1.0 +# %WER 59.25 [ 9903 / 16713, 1226 ins, 1259 del, 7418 sub ] 
exp/chain/tdnn1e_sp/decode_test/wer_7_1.0 + +# new results: +%WER 18.27 [ 1398 / 7650, 213 ins, 253 del, 932 sub ] exp/tri1/decode_devtest/wer_15_0.5 +%WER 9.95 [ 746 / 7498, 74 ins, 108 del, 564 sub ] exp/tri1/decode_native/wer_13_0.5 +%WER 16.63 [ 1532 / 9215, 197 ins, 183 del, 1152 sub ] exp/tri1/decode_nonnative/wer_17_0.0 +%WER 13.68 [ 2287 / 16713, 207 ins, 360 del, 1720 sub ] exp/tri1/decode_test/wer_17_0.5 +%WER 17.19 [ 1315 / 7650, 227 ins, 231 del, 857 sub ] exp/tri2b/decode_devtest/wer_17_0.5 +%WER 9.23 [ 692 / 7498, 60 ins, 103 del, 529 sub ] exp/tri2b/decode_native/wer_16_0.5 +%WER 17.16 [ 1581 / 9215, 184 ins, 216 del, 1181 sub ] exp/tri2b/decode_nonnative/wer_17_0.5 +%WER 13.64 [ 2279 / 16713, 241 ins, 326 del, 1712 sub ] exp/tri2b/decode_test/wer_17_0.5 +%WER 15.36 [ 1175 / 7650, 212 ins, 210 del, 753 sub ] exp/tri3b/decode_devtest/wer_17_0.5 +%WER 20.27 [ 1551 / 7650, 269 ins, 257 del, 1025 sub ] exp/tri3b/decode_devtest.si/wer_14_1.0 +%WER 6.40 [ 480 / 7498, 50 ins, 58 del, 372 sub ] exp/tri3b/decode_native/wer_16_0.0 +%WER 10.91 [ 818 / 7498, 100 ins, 112 del, 606 sub ] exp/tri3b/decode_native.si/wer_16_1.0 +%WER 14.30 [ 1318 / 9215, 206 ins, 134 del, 978 sub ] exp/tri3b/decode_nonnative/wer_17_0.0 +%WER 21.62 [ 1992 / 9215, 286 ins, 224 del, 1482 sub ] exp/tri3b/decode_nonnative.si/wer_16_1.0 +%WER 10.78 [ 1802 / 16713, 247 ins, 195 del, 1360 sub ] exp/tri3b/decode_test/wer_17_0.0 +%WER 16.81 [ 2809 / 16713, 374 ins, 338 del, 2097 sub ] exp/tri3b/decode_test.si/wer_16_1.0 + +# chain model results: +# for dir in $(echo exp/chain/tdnn1b_sp/decode* | grep -v 'si/'); do grep WER $dir/wer* | utils/best_wer.sh; done +%WER 12.99 [ 994 / 7650, 192 ins, 163 del, 639 sub ] exp/chain/tdnn1b_sp/decode_devtest/wer_10_1.0 +%WER 12.47 [ 1149 / 9215, 119 ins, 174 del, 856 sub ] exp/chain/tdnn1b_sp/decode_nonnative/wer_12_0.0 +%WER 9.64 [ 1611 / 16713, 169 ins, 240 del, 1202 sub ] exp/chain/tdnn1b_sp/decode_test/wer_12_0.0 +%WER 6.13 [ 460 / 7498, 52 ins, 55 del, 353 sub ] exp/chain/tdnn1b_sp/decode_native/wer_10_0.0 diff --git a/egs/heroico/s5/cmd.sh b/egs/heroico/s5/cmd.sh index a427f3c16a5..533aad25db1 100755 --- a/egs/heroico/s5/cmd.sh +++ b/egs/heroico/s5/cmd.sh @@ -10,6 +10,7 @@ # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. +export cmd="retry.pl queue.pl" export train_cmd="retry.pl queue.pl" export decode_cmd="retry.pl queue.pl --mem 2G" diff --git a/egs/heroico/s5/local/chain/tuning/run_cnn_tdnn_1a.sh b/egs/heroico/s5/local/chain/tuning/run_cnn_tdnn_1a.sh new file mode 100755 index 00000000000..361879b4142 --- /dev/null +++ b/egs/heroico/s5/local/chain/tuning/run_cnn_tdnn_1a.sh @@ -0,0 +1,318 @@ +#!/bin/bash + +# run_cnn_tdnn_1a.sh is modified from run_tdnn_1b.sh but taking +# the xconfig from mini-librispeech's run_cnn_tdnn_1a54.sh; only +# reducing the bottleneck-dim from 96 to 64, which is the value +# the run_tdnn1b.sh script here has. Results are better. 
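+# (In the comparison below, the %WER rows are picked by utils/best_wer.sh over
+# each decode directory, the "Final train/valid prob" rows are the chain
+# objective values from compute_prob_{train,valid}.final.log, the "(xent)" rows
+# are the cross-entropy branch, and "Num-params" comes from progress.1.log;
+# this is what the compare_wer.sh scripts in these recipes grep for.)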
+# local/chain/compare_wer.sh exp/chain/tdnn1a_sp exp/chain/tdnn1b_sp exp/chain/cnn_tdnn1a_sp +# System tdnn1a_sp tdnn1b_sp cnn_tdnn1a_sp +# %WER devtest 53.07 52.54 51.10 +# %WER test 59.25 53.70 52.07 +# %WER native 54.47 48.76 47.88 +# %WER nonnative 63.01 57.66 55.51 +# Final train prob -0.0253 -0.0547 -0.0502 +# Final valid prob -0.0687 -0.0694 -0.0661 +# Final train prob (xent) -0.7715 -0.9502 -0.8513 +# Final valid prob (xent) -1.0719 -1.0849 -0.9915 +# Num-params 6567648 3321312 3345088 + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +decode_nj=10 +train_set=train +test_sets="native nonnative devtest test" +gmm=tri3b +nnet3_affix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=1a # affix for the TDNN directory name +tree_affix= +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +num_leaves=3500 + +# training options +# training chunk-options +chunk_width=140,100,160 +# we don't need extra left/right context for TDNN systems. +dropout_schedule='0,0@0.20,0.3@0.50,0' +common_egs_dir= +xent_regularize=0.1 + +# training options +srand=0 +remove_egs=true +reporting_email= + +#decode options +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --cmd "$train_cmd" \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + $num_leaves \ + ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 13 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + + cnn_opts="l2-regularize=0.03" + ivector_layer_opts="l2-regularize=0.03" + ivector_affine_opts="l2-regularize=0.03" + tdnn_opts="l2-regularize=0.03 dropout-proportion=0.0 dropout-per-dim-continuous=true" + tdnnf_first_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.0" + tdnnf_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.03 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.03" + output_opts="l2-regularize=0.015" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # this takes the MFCCs and generates filterbank coefficients. 
The MFCCs + # are more compressible so we prefer to dump the MFCCs to disk rather + # than filterbanks. + idct-layer name=idct input=input dim=40 cepstral-lifter=22 affine-transform-file=$dir/configs/idct.mat + + linear-component name=ivector-linear $ivector_affine_opts dim=200 input=ReplaceIndex(ivector, t, 0) + batchnorm-component name=ivector-batchnorm target-rms=0.025 + + batchnorm-component name=idct-batchnorm input=idct + combine-feature-maps-layer name=combine_inputs input=Append(idct-batchnorm, ivector-batchnorm) num-filters1=1 num-filters2=5 height=40 + + conv-relu-batchnorm-layer name=cnn1 $cnn_opts height-in=40 height-out=40 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=48 learning-rate-factor=0.333 max-change=0.25 + conv-relu-batchnorm-layer name=cnn2 $cnn_opts height-in=40 height-out=40 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=48 + conv-relu-batchnorm-layer name=cnn3 $cnn_opts height-in=40 height-out=20 height-subsample-out=2 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=64 + conv-relu-batchnorm-layer name=cnn4 $cnn_opts height-in=20 height-out=20 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=64 + conv-relu-batchnorm-layer name=cnn5 $cnn_opts height-in=20 height-out=10 height-subsample-out=2 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=64 + conv-relu-batchnorm-layer name=cnn6 $cnn_opts height-in=10 height-out=5 height-subsample-out=2 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=128 + + # the first TDNN-F layer has no bypass (since dims don't match), and a larger bottleneck so the + # information bottleneck doesn't become a problem. (we use time-stride=0 so no splicing, to + # limit the num-parameters). + tdnnf-layer name=tdnnf7 $tdnnf_first_opts dim=768 bottleneck-dim=192 time-stride=0 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=768 bottleneck-dim=64 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=768 bottleneck-dim=64 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=768 bottleneck-dim=64 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=768 bottleneck-dim=64 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=768 bottleneck-dim=64 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=768 bottleneck-dim=64 time-stride=3 + tdnnf-layer name=tdnnf14 $tdnnf_opts dim=768 bottleneck-dim=64 time-stride=3 + tdnnf-layer name=tdnnf15 $tdnnf_opts dim=768 bottleneck-dim=64 time-stride=3 + linear-component name=prefinal-l dim=192 $linear_opts + + ## adding the layers for chain branch + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + # adding the layers for xent branch + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 14 ]; then + steps/nnet3/chain/train.py \ + --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + 
--trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=8 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 \ + data/lang_test \ + $tree_dir \ + $tree_dir/graph || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l 1392 combine=-0.040->-0.033 (over 7) xent:train/valid[69,104,final]=(-1.12,-0.880,-0.771/-1.33,-1.21,-1.07) logprob:train/valid[69,104,final]=(-0.050,-0.031,-0.025/-0.079,-0.080,-0.069) +# exp/chain/tdnn1a_sp: num-iters=105 nj=1..1 num-params=6.6M dim=40+100->1384 combine=-0.032->-0.026 (over 7) xent:train/valid[69,104,final]=(-1.14,-0.892,-0.811/-1.19,-1.07,-0.990) logprob:train/valid[69,104,final]=(-0.045,-0.029,-0.023/-0.083,-0.080,-0.072) # Set -e here so that we catch if any executable fails immediately set -euo pipefail @@ -149,7 +150,7 @@ if [ $stage -le 13 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) opts="l2-regularize=0.01" output_opts="l2-regularize=0.0025" diff --git a/egs/heroico/s5/local/chain/tuning/run_tdnn_1b.sh b/egs/heroico/s5/local/chain/tuning/run_tdnn_1b.sh index 33ce1556d29..cfb4dc1f697 100755 --- a/egs/heroico/s5/local/chain/tuning/run_tdnn_1b.sh +++ b/egs/heroico/s5/local/chain/tuning/run_tdnn_1b.sh @@ -3,21 +3,20 @@ # 1b is as 1a but a re-tuned model with quite a few changes, including moving to # a resnet-style factored TDNN-F model. 
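+# (For orientation: a "factored TDNN-F" layer factors its weight matrix through
+# a small bottleneck and adds a scaled skip connection from its input; in
+# xconfig terms a layer looks roughly like
+#   tdnnf-layer name=tdnnfN $tdnnf_opts dim=<hidden-dim> bottleneck-dim=64 time-stride=3
+# where "tdnnfN" and <hidden-dim> are placeholders, and the bypass-scale=0.66
+# inside $tdnnf_opts supplies the resnet-style bypass.  The actual layer
+# definitions are generated in the configs stage further down.)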
# -# local/chain/compare_wer.sh exp/chain/tdnn1a_sp exp/chain/tdnn1b_sp +# ./local/chain/compare_wer.sh exp/chain/tdnn1a_sp exp/chain/tdnn1b_sp # System tdnn1a_sp tdnn1b_sp -# %WER devtest 53.07 52.54 -# %WER test 59.25 53.70 -# %WER native 54.47 48.76 -# %WER nonnative 63.01 57.66 -# Final train prob -0.0253 -0.0547 -# Final valid prob -0.0687 -0.0694 -# Final train prob (xent) -0.7715 -0.9502 -# Final valid prob (xent) -1.0719 -1.0849 -# Num-params 6567648 3321312 - +# %WER devtest 13.10 12.99 +# %WER test 15.53 9.64 +# %WER native 10.14 6.13 +# %WER nonnative 19.78 12.47 +# Final train prob -0.0233 -0.0442 +# Final valid prob -0.0720 -0.0726 +# Final train prob (xent) -0.8107 -0.9759 +# Final valid prob (xent) -0.9898 -0.9964 +# Num-params 6559440 3318224 # steps/info/chain_dir_info.pl exp/chain/tdnn1b_sp -# exp/chain/tdnn1b_sp: num-iters=34 nj=2..5 num-params=3.3M dim=40+100->1392 combine=-0.059->-0.059 (over 1) xent:train/valid[21,33,final]=(-1.28,-0.986,-0.950/-1.38,-1.10,-1.08) logprob:train/valid[21,33,final]=(-0.085,-0.063,-0.055/-0.090,-0.074,-0.069) +# exp/chain/tdnn1b_sp: num-iters=34 nj=2..5 num-params=3.3M dim=40+100->1384 combine=-0.044->-0.044 (over 1) xent:train/valid[21,33,final]=(-1.30,-0.993,-0.976/-1.28,-1.01,-0.996) logprob:train/valid[21,33,final]=(-0.071,-0.050,-0.044/-0.093,-0.076,-0.073) # Set -e here so that we catch if any executable fails immediately set -euo pipefail @@ -152,7 +151,7 @@ if [ $stage -le 13 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) affine_opts="l2-regularize=0.03 dropout-proportion=0.0 dropout-per-dim-continuous=true" tdnnf_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66" linear_opts="l2-regularize=0.03 orthonormal-constraint=-1.0" diff --git a/egs/heroico/s5/local/heroico_answers_make_lists.pl b/egs/heroico/s5/local/heroico_answers_make_lists.pl index fb3c0ecb8d1..c1a3735b4f1 100755 --- a/egs/heroico/s5/local/heroico_answers_make_lists.pl +++ b/egs/heroico/s5/local/heroico_answers_make_lists.pl @@ -30,7 +30,7 @@ my $t = "$tmpdir/answers/text"; # initialize hash for prompts -my %p = (); +my %prompts = (); # store prompts in hash LINEA: while ( my $line = <> ) { @@ -40,9 +40,27 @@ my @dirs = split /\//, $directories; # get the speaker number my $s = $dirs[-1]; + # pad the speaker number with zeroes + my $spk = ""; + if ( $s < 10 ) { + $spk = '000' . $s; + } elsif ( $s < 100 ) { + $spk = '00' . $s; + } elsif ( $s < 1000 ) { + $spk = '0' . $s; + } + # pad the filename with zeroes + my $fn = ""; + if ( $file < 10 ) { + $fn = '000' . $file; + } elsif ( $file < 100 ) { + $fn = '00' . $file; + } elsif ( $file < 1000 ) { + $fn = '0' . $file; + } # the utterance name - my $i = $s . '_' . 'a' . '_' . $file; - $p{$i} = $sent; + my $utt = $spk . '_' . $fn; + $prompts{$utt} = $sent; } open my $W, '<', $w or croak "problem with $w $!"; @@ -58,18 +76,36 @@ my @dirs = split /\//, $directories; my $r = basename $line, ".wav"; my $s = $dirs[-1]; - my $rid = $s . '_' . 'a' . '_' . $r; - if ( exists $p{$rid} ) { - print $T "$rid $p{$rid}\n"; - } elsif ( defined $rid ) { - warn "warning: problem\t$rid"; + my $spk = ""; + # pad with zeroes + if ( $s < 10 ) { + $spk = '000' . $s; + } elsif ( $s < 100 ) { + $spk = '00' . $s; + } elsif ( $s < 1000 ) { + $spk = '0' . 
$s; + } + # pad the file name with zeroes + my $rec = ""; + if ( $r < 10 ) { + $rec = '000' . $r; + } elsif ( $r < 100 ) { + $rec = '00' . $r; + } elsif ( $r < 1000 ) { + $rec = '0' . $r; + } + my $rec_id = $spk . '_' . $rec; + if ( exists $prompts{$rec_id} ) { + print $T "$rec_id $prompts{$rec_id}\n"; + } elsif ( defined $rec_id ) { + warn "warning: problem\t$rec_id"; next LINE; } else { croak "$line"; } - print $O "$rid sox -r 22050 -e signed -b 16 $line -r 16000 -t wav - |\n"; - print $U "$rid ${s}_a\n"; + print $O "$rec_id sox -r 22050 -e signed -b 16 $line -r 16000 -t wav - |\n"; + print $U "$rec_id $spk\n"; } close $T; close $O; diff --git a/egs/heroico/s5/local/heroico_recordings_make_lists.pl b/egs/heroico/s5/local/heroico_recordings_make_lists.pl index 1d157665799..b9a3ab5a565 100755 --- a/egs/heroico/s5/local/heroico_recordings_make_lists.pl +++ b/egs/heroico/s5/local/heroico_recordings_make_lists.pl @@ -19,75 +19,102 @@ system "mkdir -p $tmpdir/recordings/devtest"; # input wav file list -my $w = "$tmpdir/wav_list.txt"; +my $input_wav_list = "$tmpdir/wav_list.txt"; # output temporary wav.scp files -my $o_train = "$tmpdir/recordings/train/wav.scp"; -my $o_test = "$tmpdir/recordings/devtest/wav.scp"; +my $train_wav_scp = "$tmpdir/recordings/train/wav.scp"; +my $test_wav_scp = "$tmpdir/recordings/devtest/wav.scp"; # output temporary utt2spk files -my $u_train = "$tmpdir/recordings/train/utt2spk"; -my $u_test = "$tmpdir/recordings/devtest/utt2spk"; +my $train_uttspk = "$tmpdir/recordings/train/utt2spk"; +my $test_uttspk = "$tmpdir/recordings/devtest/utt2spk"; # output temporary text files -my $t_train = "$tmpdir/recordings/train/text"; -my $t_test = "$tmpdir/recordings/devtest/text"; +my $train_text = "$tmpdir/recordings/train/text"; +my $test_text = "$tmpdir/recordings/devtest/text"; # initialize hash for prompts -my %p = (); +my %prompts = (); # store prompts in hash LINEA: while ( my $line = <> ) { chomp $line; - my ($s,$sent) = split /\t/, $line, 2; - $p{$s} = $sent; + my ($prompt_id,$prompt) = split /\t/, $line, 2; + # pad the prompt id with zeroes + my $pid = ""; + if ( $prompt_id < 10 ) { + $pid = '0000' . $prompt_id; + } elsif ( $prompt_id < 100 ) { + $pid = '000' . $prompt_id; + } elsif ( $prompt_id < 1000 ) { + $pid = '00' . 
$prompt_id; + } + $prompts{$pid} = $prompt; } -open my $W, '<', $w or croak "problem with $w $!"; -open my $OT, '+>', $o_train or croak "problem with $o_train $!"; -open my $OE, '+>', $o_test or croak "problem with $o_test $!"; -open my $UT, '+>', $u_train or croak "problem with $u_train $!"; -open my $UE, '+>', $u_test or croak "problem with $u_test $!"; -open my $TT, '+>', $t_train or croak "problem with $t_train $!"; -open my $TE, '+>', $t_test or croak "problem with $t_test $!"; +open my $WVL, '<', $input_wav_list or croak "problem with $input_wav_list $!"; +open my $TRNWSCP, '+>', $train_wav_scp or croak "problem with $train_wav_scp $!"; +open my $TSTWSCP, '+>', $test_wav_scp or croak "problem with $test_wav_scp $!"; +open my $TRNUTTSPK, '+>', $train_uttspk or croak "problem with $train_uttspk $!"; +open my $TSTUTTSPK, '+>', $test_uttspk or croak "problem with $test_uttspk $!"; +open my $TRNTXT, '+>', $train_text or croak "problem with $train_text $!"; +open my $TSTTXT, '+>', $test_text or croak "problem with $test_text $!"; - LINE: while ( my $line = <$W> ) { + LINE: while ( my $line = <$WVL> ) { chomp $line; next LINE if ($line =~ /Answers/ ); next LINE unless ( $line =~ /Recordings/ ); my ($volume,$directories,$file) = File::Spec->splitpath( $line ); my @dirs = split /\//, $directories; - my $r = basename $line, ".wav"; - my $s = $dirs[-1]; - my $rid = $s . '_r' . '_' . $r; - if ( ( $r >= 355 ) and ( $r < 561 ) ) { - if ( exists $p{$r} ) { - print $TE "$rid $p{$r}\n"; - } elsif ( defined $rid ) { - warn "problem\t$rid"; + my $utt_id = basename $line, ".wav"; + # pad the utterance id with zeroes + my $utt = ""; + if ( $utt_id < 10 ) { + $utt = '0000' . $utt_id; +} elsif ( $utt_id < 100 ) { + $utt = '000' . $utt_id; +} elsif ( $utt_id < 1000 ) { + $utt = '00' . $utt_id; +} + my $spk_id = $dirs[-1]; + # pad the speaker id with zeroes + my $spk = ""; + if ( $spk_id < 10 ) { + $spk = '000' . $spk_id; + } elsif ( $spk_id < 100 ) { + $spk = '00' . $spk_id; + } elsif ( $spk_id < 1000 ) { + $spk = '0' . $spk_id; + } + my $spk_utt_id = $spk . '_' . 
$utt; + if ( ( $utt_id >= 355 ) and ( $utt_id < 561 ) ) { +if ( exists $prompts{$utt} ) { + print $TSTTXT "$spk_utt_id $prompts{$utt}\n"; + } elsif ( defined $spk_utt_id ) { + warn "problem\t$spk_utt_id"; next LINE; } else { croak "$line"; } - print $OE "$rid sox -r 22050 -e signed -b 16 $line -r 16000 -t wav - |\n"; - print $UE "$rid ${s}_r\n"; - } elsif ( ( $r < 355 ) or ( $r > 560 ) ) { - if ( exists $p{$r} ) { - print $TT "$rid $p{$r}\n"; - } elsif ( defined $rid ) { - warn "problem\t$rid"; + print $TSTWSCP "$spk_utt_id sox -r 22050 -e signed -b 16 $line -r 16000 -t wav - |\n"; + print $TSTUTTSPK "$spk_utt_id $spk\n"; + } elsif ( ( $utt_id < 355 ) or ( $utt_id > 560 ) ) { + if ( exists $prompts{$utt} ) { + print $TRNTXT "$spk_utt_id $prompts{$utt}\n"; + } elsif ( defined $spk_utt_id ) { + warn "problem\t$spk_utt_id"; next LINE; } else { croak "$line"; } - print $OT "$rid sox -r 22050 -e signed -b 16 $line -r 16000 -t wav - |\n"; - print $UT "$rid ${s}_r\n"; - } + print $TRNWSCP "$spk_utt_id sox -r 22050 -e signed -b 16 $line -r 16000 -t wav - |\n"; + print $TRNUTTSPK "$spk_utt_id $spk\n"; + } } -close $TT; -close $OT; -close $UT; -close $TE; -close $OE; -close $UE; -close $W; +close $TRNTXT; +close $TRNWSCP; +close $TRNUTTSPK; +close $TSTTXT; +close $TSTWSCP; +close $TSTUTTSPK; +close $WVL; diff --git a/egs/heroico/s5/local/nnet3/run_ivector_common.sh b/egs/heroico/s5/local/nnet3/run_ivector_common.sh index 153f0073667..e882ce0c918 100755 --- a/egs/heroico/s5/local/nnet3/run_ivector_common.sh +++ b/egs/heroico/s5/local/nnet3/run_ivector_common.sh @@ -9,6 +9,9 @@ set -euo pipefail # of usage. stage=0 +nj=56 +num_threads_ubm=2 + train_set=train test_sets="native nonnative devtest test" gmm=tri3b @@ -37,25 +40,17 @@ if [ $stage -le 1 ]; then utils/data/perturb_data_dir_speed_3way.sh \ data/${train_set} \ data/${train_set}_sp - echo "$0: making MFCC features for low-resolution speed-perturbed data" - steps/make_mfcc.sh \ - --cmd "$train_cmd" \ - --nj 10 \ - data/${train_set}_sp || exit 1; - steps/compute_cmvn_stats.sh \ - data/${train_set}_sp || exit 1; - utils/fix_data_dir.sh \ - data/${train_set}_sp + + echo "$0: making mfcc features for low-resolution speed-perturbed data" + steps/make_mfcc.sh --cmd "$train_cmd" --nj 10 data/${train_set}_sp || exit 1; + steps/compute_cmvn_stats.sh data/${train_set}_sp || exit 1; + utils/fix_data_dir.sh data/${train_set}_sp fi if [ $stage -le 2 ]; then echo "$0: aligning with the perturbed low-resolution data" steps/align_fmllr.sh \ - --nj 20 \ - --cmd "$train_cmd" \ - data/${train_set}_sp \ - data/lang \ - $gmm_dir \ + --nj 20 --cmd "$train_cmd" data/${train_set}_sp data/lang $gmm_dir \ $ali_dir || exit 1 fi diff --git a/egs/heroico/s5/local/prepare_data.sh b/egs/heroico/s5/local/prepare_data.sh index db2b990c07b..b78d9f1d1cb 100755 --- a/egs/heroico/s5/local/prepare_data.sh +++ b/egs/heroico/s5/local/prepare_data.sh @@ -4,17 +4,17 @@ # Apache 2.0. . ./cmd.sh - . ./path.sh stage=0 +datadir=$1 . 
./utils/parse_options.sh set -e set -o pipefail -# the location of the LDC corpus -datadir=$1 +tmpdir=data/local/tmp + # acoustic models are trained on the heroico corpus # testing is done on the usma corpus # heroico consists of 2 parts: answers and recordings (recited) @@ -25,8 +25,6 @@ recordings_transcripts=$datadir/data/transcripts/heroico-recordings.txt # usma is all recited usma_transcripts=$datadir/data/transcripts/usma-prompts.txt -tmpdir=data/local/tmp - # make acoustic model training lists if [ $stage -le 0 ]; then mkdir -p $tmpdir/heroico $tmpdir/usma @@ -37,12 +35,12 @@ if [ $stage -le 0 ]; then # the transcripts are converted to UTF8 export LC_ALL=en_US.UTF-8 cat $answers_transcripts | iconv -f ISO-8859-1 -t UTF-8 | \ - sed -e 's/\r//' | local/heroico_answers_make_lists.pl + tr -d '\r' | local/heroico_answers_make_lists.pl utils/fix_data_dir.sh $tmpdir/heroico/answers cat $recordings_transcripts | iconv -f ISO-8859-1 -t UTF-8 | \ - sed -e 's/\r//' | local/heroico_recordings_make_lists.pl + tr -d '\r' | local/heroico_recordings_make_lists.pl utils/fix_data_dir.sh $tmpdir/heroico/recordings/train utils/fix_data_dir.sh $tmpdir/heroico/recordings/devtest @@ -52,11 +50,11 @@ if [ $stage -le 0 ]; then for x in wav.scp utt2spk text; do cat $tmpdir/heroico/answers/$x $tmpdir/heroico/recordings/train/$x | \ - sed -e 's/\r//' | sort -k1,1 -u >$tmpdir/heroico/lists/train/$x + tr -d '\r' | sort -k1,1 -u >$tmpdir/heroico/lists/train/$x done for x in wav.scp utt2spk text; do - cat $tmpdir/heroico/recordings/devtest/$x | sed -e 's/\r//' | \ + cat $tmpdir/heroico/recordings/devtest/$x | tr -d '\r' | \ sort -k1,1 -u >$tmpdir/heroico/lists/devtest/$x done @@ -67,10 +65,10 @@ fi if [ $stage -le 1 ]; then # make separate lists for usma (US military academy) native and nonnative cat $usma_transcripts | iconv -f ISO-8859-1 -t UTF-8 | \ - sed -e 's/\r//' | local/usma_native_make_lists.pl + tr -d '\r' | dos2unix | local/usma_native_make_lists.pl cat $usma_transcripts | iconv -f ISO-8859-1 -t UTF-8 | \ - sed -e 's/\r//' | local/usma_nonnative_make_lists.pl + tr -d '\r' | local/usma_nonnative_make_lists.pl for n in native nonnative; do mkdir -p $tmpdir/usma/$n/lists @@ -86,14 +84,14 @@ if [ $stage -le 1 ]; then # get training lists for x in wav.scp utt2spk text; do cat $tmpdir/heroico/answers/${x} $tmpdir/heroico/recordings/train/${x} | \ - sed -e 's/\r//' >$tmpdir/lists/train/$x + tr -d '\r' >$tmpdir/lists/train/$x sort $tmpdir/lists/train/$x >data/train/$x done # get devtest lists for x in wav.scp utt2spk text; do cat $tmpdir/heroico/lists/devtest/$x | \ - sed -e 's/\r//' >$tmpdir/lists/devtest/$x + tr -d '\r' >$tmpdir/lists/devtest/$x sort $tmpdir/lists/devtest/$x >data/devtest/$x done diff --git a/egs/heroico/s5/local/prepare_dict.sh b/egs/heroico/s5/local/prepare_dict.sh index a6d182a6852..9f498bc963a 100755 --- a/egs/heroico/s5/local/prepare_dict.sh +++ b/egs/heroico/s5/local/prepare_dict.sh @@ -13,12 +13,12 @@ fi export LC_ALL=C -cut -f2- data/local/tmp/dict/santiago.txt | \ +cut -f2- ./santiago.txt | \ tr -s '[:space:]' '[\n*]' | \ grep -v SPN | sort -u >data/local/dict/nonsilence_phones.txt # sed "1d" deletes the last line. 
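+# (To be precise, sed "1d" drops the *first* line of the sorted output; a quick
+# check: printf 'a 1\nb 2\n' | sed "1d" prints only "b 2".)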
-expand -t 1 data/local/tmp/dict/santiago.txt | sort -u | +expand -t 1 ./santiago.txt | sort -u | sed "1d" >data/local/dict/lexicon.txt echo " SPN" >> data/local/dict/lexicon.txt diff --git a/egs/heroico/s5/local/subs_download.sh b/egs/heroico/s5/local/subs_download.sh new file mode 100755 index 00000000000..98dcb42d4e0 --- /dev/null +++ b/egs/heroico/s5/local/subs_download.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +# Copyright 2017 John Morgan +# Apache 2.0. + +tmpdir=data/local/tmp +download_dir=$(pwd) +mkdir -p $download_dir +subs_src=$1 + +# download the subs corpus +if [ ! -f $download_dir/subs.zip ]; then + wget -O $download_dir/subs.zip $subs_src + ( + cd $download_dir + unzip subs.zip + ) + else + echo "$0: subs file already downloaded." +fi diff --git a/egs/heroico/s5/local/subs_prepare_data.pl b/egs/heroico/s5/local/subs_prepare_data.pl index 3cd906d4699..e39db79f610 100755 --- a/egs/heroico/s5/local/subs_prepare_data.pl +++ b/egs/heroico/s5/local/subs_prepare_data.pl @@ -1,4 +1,4 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl # Copyright 2017 John Morgan # Apache 2.0. @@ -12,69 +12,64 @@ use Encode; # set lower and upper bounds -my $lb = 8; -# only segments with at least $lb words will be written -my $ub = 16; -# only segments with fewer than $ub words will be written +my $low_bound = 8; +# only segments with at least $low_bound words will be written +my $up_bound = 16; +# only segments with fewer than $up_bound words will be written # input and output files -my $c = "data/local/tmp/subs/OpenSubtitles2016.en-es.es"; -my $symtab = "data/lang/words.txt"; -my $rl = "data/local/tmp/subs/lm/es.txt"; -my $oo = "data/local/tmp/subs/lm/oovs.txt"; + +my $corpus = "OpenSubtitles.en-es.es"; +my $symbol_table = "data/lang/words.txt"; +my $filtered = "data/local/tmp/subs/lm/es.txt"; +my $oovs = "data/local/tmp/subs/lm/oovs.txt"; my $iv = "data/local/tmp/subs/lm/in_vocabulary.txt"; -open my $C, '<', $c or croak "problems with $c $!"; +open my $C, '<', $corpus or croak "problems with $corpus $!"; system "mkdir -p data/local/tmp/subs/lm"; -open my $RL, '+>:utf8', $rl or croak "problems with $rl $!"; - -LINE: while ( my $line = <$C> ) { - $line = decode_utf8 $line; - chomp $line; - - my @tokens = split /\s+/, $line; - - next LINE if ( ($#tokens < $lb) or ($#tokens > $ub )); - - #remove control characters - #$line =~ s/(\p{Other})/ /g; - #$line =~ s/(\p{Control})/ /g; - #$line =~ s/(\p{Format})/ /g; - #$line =~ s/(\p{Private_Use})/ /g; - #$line =~ s/(\p{Surrogate})/ /g; - - # punctuation - $line =~ s/(\p{Punctuation}+|\p{Dash_Punctuation}+|\p{Close_Punctuation}+|\p{Open_Punctuation}+|\p{Initial_Punctuation}+|\p{Final_Punctuation}+|\p{Connector_Punctuation}+|\p{Other_Punctuation}+|[ ]+)/ /msxg; -#convert tabs to white space - $line =~ s/\t/ /g; - #hard to soft space - $line =~ s/ / /g; -#squeeze white space - $line =~ s/\s+/ /g; -#initial and final white space - $line =~ s/^\p{Separator}+//; - $line =~ s/\p{Separator}+$//; -#down case - $line = lc $line; - - - print $RL "$line\n"; - +if ( -e $filtered ) { + warn "$filtered already exists."; +} else { + open my $FLT, '+>:utf8', $filtered or croak "problems with $filtered $!"; + LINE: while ( my $line = <$C> ) { + $line = decode_utf8 $line; + chomp $line; + + my @tokens = split /\s+/, $line; + + next LINE if ( ($#tokens < $low_bound) or ($#tokens > $up_bound )); + + # remove punctuation + $line =~ 
s/(\p{Punctuation}+|\p{Dash_Punctuation}+|\p{Close_Punctuation}+|\p{Open_Punctuation}+|\p{Initial_Punctuation}+|\p{Final_Punctuation}+|\p{Connector_Punctuation}+|\p{Other_Punctuation}+|[ ]+)/ /msxg; + #convert tabs to white space + $line =~ s/\t/ /g; + #hard to soft space + $line =~ s/ / /g; + #squeeze white space + $line =~ s/\s+/ /g; + #initial and final white space + $line =~ s/^\p{Separator}+//; + $line =~ s/\p{Separator}+$//; + #down case + $line = lc $line; + + print $FLT "$line\n"; + } + close $FLT; } - close $C; -close $RL; + # find out of vocabulary words -# $symtab points to a file containing a map of symbols to integers +# $symbol_table points to a file containing a map of symbols to integers # hash for word to integer map my %sym2int = (); -open my $F, '<', $symtab or croak "problem with $symtab $!"; +open my $F, '<', $symbol_table or croak "problem with $symbol_table $!"; # store words to int map in hash while( my $line = <$F>) { @@ -84,33 +79,33 @@ } close $F; -open my $I, '<', $rl or croak "problem with $rl $!"; -open my $OO, '+>', $oo or croak "problems with $oo $!"; +open my $I, '<', $filtered or croak "problem with $filtered $!"; +open my $OOVS, '+>', $oovs or croak "problems with $oovs $!"; while ( my $line = <$I>) { chomp $line; my @A = split /\s/, $line; foreach my $a (@A) { if (!defined ($sym2int{$a})) { - print $OO "$a\n"; + print $OOVS "$a\n"; } } } -close $OO; +close $OOVS; close $I; # remove segments with OOVs # store OOVS in hash my %oov = (); -open my $V, '<', $oo or croak "problems with $oo $!"; +open my $V, '<', $oovs or croak "problems with $oovs $!"; while ( my $line = <$V> ) { chomp $line; $oov{$line} = 1; } close $V; -open my $L, '<', $rl or croak "problems with $rl $!"; +open my $L, '<', $filtered or croak "problems with $filtered $!"; open my $IV, '+>', $iv or croak "problems with $iv $!"; SEGMENT: while ( my $segment = <$L> ) { diff --git a/egs/heroico/s5/run.sh b/egs/heroico/s5/run.sh index 711bece3c66..4cc5617e985 100755 --- a/egs/heroico/s5/run.sh +++ b/egs/heroico/s5/run.sh @@ -1,83 +1,80 @@ #!/bin/bash . ./cmd.sh - . ./path.sh + stage=0 +# the location of the LDC corpus; this location works for the CLSP grid. +datadir=/export/corpora5/LDC/LDC2006S37 + +# The corpus and lexicon are on openslr.org +#speech_url="http://www.openslr.org/resources/39/LDC2006S37.tar.gz" +lexicon_url="http://www.openslr.org/resources/34/santiago.tar.gz" + +# Location of the Movie subtitles text corpus +subtitles_url="http://opus.lingfil.uu.se/download.php?f=OpenSubtitles2018/en-es.txt.zip" + . utils/parse_options.sh set -e set -o pipefail set -u -# the location of the LDC corpus; this location works for the CLSP grid. -datadir=/export/corpora5/LDC/LDC2006S37 -#datadir=/mnt/corpora/LDC2006S37 - -# location of subtitles text data -# note: this is not used so I'm commenting it out; dan. -#subsdata="http://opus.lingfil.uu.se/download.php?f=OpenSubtitles2016/en-es.txt.zip" -lexicon="http://www.openslr.org/resources/34/santiago.tar.gz" # don't change tmpdir, the location is used explicitly in scripts in local/. tmpdir=data/local/tmp if [ $stage -le 0 ]; then - # prepare the lists for acoustic model training and testing - mkdir -p $tmpdir/heroico - mkdir -p $tmpdir/usma - - [ ! -d "$datadir" ] && \ - echo "$0 Data directory (LDC corpus release) does not exist" && \ + if [ ! -d $datadir ]; then + echo "$0: please download and un-tar http://www.openslr.org/resources/39/LDC2006S37.tar.gz" + echo " and set $datadir to the directory where it is located." 
exit 1 - local/prepare_data.sh $datadir + fi + if [ ! -s santiago.txt ]; then + echo "$0: downloading the lexicon" + wget -c http://www.openslr.org/resources/34/santiago.tar.gz + tar -xvzf santiago.tar.gz + fi + # Get data for lm training + local/subs_download.sh $subtitles_url fi if [ $stage -le 1 ]; then - # prepare a dictionary - mkdir -p data/local/dict - mkdir -p data/local/tmp/dict - - # download the dictionary from openslr - if [ ! -f data/local/tmp/dict/santiago.tar.gz ]; then - wget -O data/local/tmp/dict/santiago.tar.gz $lexicon - fi - - ( - cd $tmpdir/dict - tar -xzf santiago.tar.gz - ) + echo "Making lists for building models." + local/prepare_data.sh $datadir +fi +if [ $stage -le 2 ]; then + mkdir -p data/local/dict $tmpdir/dict local/prepare_dict.sh +fi - # prepare the lang directory +if [ $stage -le 3 ]; then utils/prepare_lang.sh \ data/local/dict "" \ data/local/lang data/lang fi -if [ $stage -le 2 ]; then - # use am training text to train lm - mkdir -p $tmpdir/heroico/lm +if [ $stage -le 4 ]; then + mkdir -p $tmpdir/subs/lm + local/subs_prepare_data.pl +fi + +if [ $stage -le 5 ]; then echo "point 1" - # get the text from data/train/text - cut -d " " -f 2- data/train/text > $tmpdir/heroico/lm/train.txt - echo "point 2" - # build lm - local/prepare_lm.sh $tmpdir/heroico/lm/train.txt + local/prepare_lm.sh $tmpdir/subs/lm/in_vocabulary.txt +fi - echo "point 3" +if [ $stage -le 6 ]; then + echo "point 2" utils/format_lm.sh \ data/lang data/local/lm/trigram.arpa.gz data/local/dict/lexicon.txt \ data/lang_test - - # delete temporary work - rm -rf data/local/tmp fi -if [ $stage -le 3 ]; then - # extract acoustic features +if [ $stage -le 7 ]; then + echo "$0: extracting acoustic features." mkdir -p exp for fld in native nonnative test devtest train; do @@ -92,7 +89,7 @@ if [ $stage -le 3 ]; then done fi -if [ $stage -le 4 ]; then +if [ $stage -le 8 ]; then echo "$0 monophone training" steps/train_mono.sh --nj 8 --cmd "$train_cmd" data/train data/lang exp/mono || exit 1; @@ -108,8 +105,7 @@ if [ $stage -le 4 ]; then ) & fi -if [ $stage -le 5 ]; then - +if [ $stage -le 9 ]; then # align with monophones steps/align_si.sh --nj 8 --cmd "$train_cmd" \ data/train data/lang exp/mono exp/mono_ali @@ -131,10 +127,8 @@ if [ $stage -le 5 ]; then fi -if [ $stage -le 6 ]; then +if [ $stage -le 10 ]; then echo "$0: Starting delta system alignment" - - # align with triphones steps/align_si.sh \ --nj 8 --cmd "$train_cmd" data/train data/lang exp/tri1 exp/tri1_ali @@ -156,10 +150,9 @@ if [ $stage -le 6 ]; then ) & fi -if [ $stage -le 7 ]; then +if [ $stage -le 11 ]; then echo "$0: Starting LDA+MLLT system alignment" - # align with lda and mllt adapted triphones steps/align_si.sh \ --use-graphs true --nj 8 --cmd "$train_cmd" \ data/train data/lang exp/tri2b exp/tri2b_ali @@ -169,7 +162,6 @@ if [ $stage -le 7 ]; then --cmd "$train_cmd" \ 3100 50000 data/train data/lang exp/tri2b_ali exp/tri3b - # align with tri3b models echo "$0 Starting exp/tri3b_ali" steps/align_fmllr.sh \ --nj 8 --cmd "$train_cmd" \ @@ -182,16 +174,16 @@ if [ $stage -le 7 ]; then utils/mkgraph.sh \ data/lang_test exp/tri3b exp/tri3b/graph || exit 1; - # decode test sets with tri3b models for x in native nonnative devtest test; do + echo "$0: decoding $x with tri3b models." steps/decode_fmllr.sh \ --nj 8 --cmd "$decode_cmd" exp/tri3b/graph data/$x exp/tri3b/decode_${x} done ) & fi -if [ $stage -le 9 ]; then - # train and test chain models +if [ $stage -le 12 ]; then + echo "$0: train and test chain models." 
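+  # local/chain/run_tdnn.sh is expected to be a symlink into local/chain/tuning/
+  # (e.g. run_tdnn_1b.sh); those tuning scripts assume the tri3b system and
+  # alignments produced by the stages above.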
local/chain/run_tdnn.sh fi diff --git a/egs/hkust/s5/RESULTS b/egs/hkust/s5/RESULTS index c419c9f6ddd..aac01fcb5af 100644 --- a/egs/hkust/s5/RESULTS +++ b/egs/hkust/s5/RESULTS @@ -1,3 +1,5 @@ +## Caution: these WERs are actually CERs. + # for x in exp/*/decode; do [ -d $x ] && grep WER $x/cer_* | utils/best_wer.sh; done %WER 80.67 [ 45198 / 56027, 1607 ins, 10733 del, 32858 sub ] exp/mono0a/decode/cer_9_0.0 %WER 58.79 [ 32939 / 56027, 2662 ins, 6124 del, 24153 sub ] exp/tri1/decode/cer_13_0.0 @@ -41,3 +43,6 @@ exp/nnet2_convnet/decode/cer_10:%WER 41.19 [ 23129 / 56154, 2599 ins, 3782 del, # nnet3 mfcc results (using speed perturbed data) exp/nnet3/tdnn_sp/decode_dev/cer_10:%WER 33.79 [ 18977 / 56154, 2027 ins, 3485 del, 13465 sub ] exp/nnet3/lstm_sp_ld5/decode_dev/cer_9:%WER 33.51 [ 18815 / 56154, 1813 ins, 3249 del, 13753 sub ] + + +# For nnet3+chain results, which are significantly better, see scripts in local/chain/tuning/. diff --git a/egs/hkust/s5/local/chain/compare_wer.sh b/egs/hkust/s5/local/chain/compare_wer.sh index b3376871a69..27a6b783433 100755 --- a/egs/hkust/s5/local/chain/compare_wer.sh +++ b/egs/hkust/s5/local/chain/compare_wer.sh @@ -39,25 +39,25 @@ for x in $*; do done echo -# print decode WER results -echo -n "# WER(%) " +# print decode CER results +echo -n "# CER(%) " for x in $*; do set_names $x - wer=$([ -d $x ] && grep WER $x/decode/cer_* | utils/best_wer.sh | awk '{print $2}') + wer=$([ -d $x ] && grep CER $x/decode/cer_* | utils/best_wer.sh | awk '{print $2}') printf "% 10s" $wer done echo -# so how about online WER? +# so how about online CER? if $include_online; then - echo -n "# WER(%)[online] " + echo -n "# CER(%)[online] " for x in $*; do set_names $x wer=$(cat ${x}_online/decode/cer_* | utils/best_wer.sh | awk '{print $2}') printf "% 10s" $wer done echo - echo -n "# WER(%)[per-utt] " + echo -n "# CER(%)[per-utt] " for x in $*; do set_names $x wer_per_utt=$(cat ${x}_online/decode_per_utt/cer_* | utils/best_wer.sh | awk '{print $2}') diff --git a/egs/hkust/s5/local/chain/tuning/run_tdnn_2a.sh b/egs/hkust/s5/local/chain/tuning/run_tdnn_2a.sh old mode 100644 new mode 100755 index 0fc0de36a45..c62b776de2b --- a/egs/hkust/s5/local/chain/tuning/run_tdnn_2a.sh +++ b/egs/hkust/s5/local/chain/tuning/run_tdnn_2a.sh @@ -5,9 +5,9 @@ # Results # local/chain/compare_wer.sh --online exp/chain/tdnn_7h_chain_2b_sp # Model tdnn_7h_chain_2b_sp -# WER(%) 23.67 -# WER(%)[online] 23.69 -# WER(%)[per-utt] 24.67 +# CER(%) 23.67 +# CER(%)[online] 23.69 +# CER(%)[per-utt] 24.67 # Final train prob -0.0895 # Final valid prob -0.1251 # Final train prob (xent) -1.3628 @@ -109,7 +109,7 @@ if [ $stage -le 12 ]; then ivector_dim=$(feat-to-dim scp:exp/nnet3/ivectors_${train_set}/ivector_online.scp -) feat_dim=$(feat-to-dim scp:data/${train_set}_hires/feats.scp -) num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) opts="l2-regularize=0.004 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true" linear_opts="orthonormal-constraint=-1.0 l2-regularize=0.004" output_opts="l2-regularize=0.002" diff --git a/egs/hkust/s5/local/create_oov_char_lexicon.pl b/egs/hkust/s5/local/create_oov_char_lexicon.pl index 0c146c9a123..33e2e8061c3 100755 --- a/egs/hkust/s5/local/create_oov_char_lexicon.pl +++ b/egs/hkust/s5/local/create_oov_char_lexicon.pl @@ -25,15 +25,17 @@ exit; } -use encoding utf8; +use utf8; my %prons; open(DICT, 
$ARGV[0]) || die("Can't open dict ".$ARGV[0]."\n"); +binmode(DICT,":encoding(utf8)"); foreach () { chomp; @A = split(" ", $_); $prons{$A[0]} = $A[1]; } close DICT; open(WORDS, $ARGV[1]) || die("Can't open oov word list ".$ARGV[1]."\n"); +binmode(WORDS,":encoding(utf8)"); while () { chomp; print $_; diff --git a/egs/hkust/s5/local/hkust_data_prep.sh b/egs/hkust/s5/local/hkust_data_prep.sh index 207f03af36b..6342ccfe861 100755 --- a/egs/hkust/s5/local/hkust_data_prep.sh +++ b/egs/hkust/s5/local/hkust_data_prep.sh @@ -1,5 +1,5 @@ #!/bin/bash - + . ./path.sh || exit 1; if [ $# != 2 ]; then @@ -14,6 +14,11 @@ hkust_text_dir=$2 train_dir=data/local/train dev_dir=data/local/dev +# transcripts normalization and segmentation +# needs external tools +python2 -c "import mmseg" 2>/dev/null || { + echo "Python module mmseg is not found. To install it, run tools/extra/install_mmseg.sh"; exit 1; } + mkdir -p $train_dir mkdir -p $dev_dir @@ -35,7 +40,7 @@ n=`cat $train_dir/sph.flist $dev_dir/sph.flist | wc -l` #collect all trans, convert encodings to utf-8, find $hkust_text_dir -iname "*.txt" | grep -i "trans/train" | xargs cat |\ - iconv -f GBK -t utf-8 - | perl -e ' + iconv -f GBK -t UTF-8 | perl -e ' while () { @A = split(" ", $_); if (@A <= 1) { next; } @@ -50,7 +55,7 @@ find $hkust_text_dir -iname "*.txt" | grep -i "trans/train" | xargs cat |\ ' | sort -k1 > $train_dir/transcripts.txt || exit 1; find $hkust_text_dir -iname "*.txt" | grep -i "trans/dev" | xargs cat |\ - iconv -f GBK -t utf-8 - | perl -e ' + iconv -f GBK -t UTF-8 | perl -e ' while () { @A = split(" ", $_); if (@A <= 1) { next; } @@ -65,17 +70,13 @@ find $hkust_text_dir -iname "*.txt" | grep -i "trans/dev" | xargs cat |\ ' | sort -k1 > $dev_dir/transcripts.txt || exit 1; #transcripts normalization and segmentation -#(this needs external tools), -python -c "import mmseg" 2>/dev/null || \ - (echo "mmseg is not found. Checkout tools/extra/install_mmseg.sh" && exit 1;) - cat $train_dir/transcripts.txt |\ sed -e 's// /g' |\ sed -e 's/<\/foreign>/ /g' |\ sed -e 's/\(.\+\)<\/noise>/\1/g' |\ sed -e 's/((\([^)]\{0,\}\)))/\1/g' |\ local/hkust_normalize.pl |\ - python local/hkust_segment.py |\ + local/hkust_segment.py |\ awk '{if (NF > 1) print $0;}' > $train_dir/text || exit 1; cat $dev_dir/transcripts.txt |\ @@ -84,7 +85,7 @@ cat $dev_dir/transcripts.txt |\ sed -e 's/\(.\+\)<\/noise>/\1/g' |\ sed -e 's/((\([^)]\{0,\}\)))/\1/g' |\ local/hkust_normalize.pl |\ - python local/hkust_segment.py |\ + local/hkust_segment.py |\ awk '{if (NF > 1) print $0;}' > $dev_dir/text || exit 1; # some data is corrupted. 
Delete them diff --git a/egs/hkust/s5/local/hkust_prepare_dict.sh b/egs/hkust/s5/local/hkust_prepare_dict.sh index 27d1060e945..49f27f2f868 100755 --- a/egs/hkust/s5/local/hkust_prepare_dict.sh +++ b/egs/hkust/s5/local/hkust_prepare_dict.sh @@ -176,7 +176,9 @@ wc -l $dict_dir/lexicon-ch/lexicon-ch-iv.txt # dictionary in order to get OOV pronunciations cat $dict_dir/cedict/ch-dict.txt |\ perl -e ' - use encoding utf8; + use utf8; + binmode(STDIN,":encoding(utf8)"); + binmode(STDOUT,":encoding(utf8)"); while () { @A = split(" ", $_); $word_len = length($A[0]); @@ -188,7 +190,9 @@ cat $dict_dir/cedict/ch-dict.txt |\ # extract chars cat $dict_dir/cedict/ch-dict-1.txt | awk '{print $1}' |\ perl -e ' - use encoding utf8; + use utf8; + binmode(STDIN,":encoding(utf8)"); + binmode(STDOUT,":encoding(utf8)"); while () { @A = split(" ", $_); @chars = split("", $A[0]); diff --git a/egs/hkust/s5/local/hkust_segment.py b/egs/hkust/s5/local/hkust_segment.py index 92d3add0e3e..d4c2b35a668 100755 --- a/egs/hkust/s5/local/hkust_segment.py +++ b/egs/hkust/s5/local/hkust_segment.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python2.7 #coding:utf-8 from __future__ import print_function diff --git a/egs/hub4_english/s5/local/data_prep/process_1995_bn_annotation.py b/egs/hub4_english/s5/local/data_prep/process_1995_bn_annotation.py index be0c7ad8e0d..5675dc3fbd9 100755 --- a/egs/hub4_english/s5/local/data_prep/process_1995_bn_annotation.py +++ b/egs/hub4_english/s5/local/data_prep/process_1995_bn_annotation.py @@ -31,9 +31,9 @@ def get_args(): parser = argparse.ArgumentParser("Process 1995 CSR-IV HUB4 transcripts") - parser.add_argument("--noise-word", type=str, default="", + parser.add_argument("--noise-word", default="", help="Word to add in-place of noise words") - parser.add_argument("--spoken-noise-word", type=str, + parser.add_argument("--spoken-noise-word", default="", help="Word to add in-place of speaker noise words") parser.add_argument("in_file", type=argparse.FileType('r'), @@ -230,7 +230,7 @@ def run(args): start_time = story_end_time segments = process_story_content( args, reco_id, - ' '.join([unicode(x) for x in s.children]), + ' '.join([str(x) for x in s.children]), start_time=story_begin_time, end_time=story_end_time) write_segments(segments, args) elif (s.name is not None and s.name != "language" @@ -240,9 +240,9 @@ def run(args): "or or ; got {0}".format(s)) elif s.name == "language" or s.name == "sung": non_story_contents.append( - ' '.join([unicode(x) for x in s.children])) + ' '.join([str(x) for x in s.children])) else: - non_story_contents.append(unicode(s)) + non_story_contents.append(str(s)) except RuntimeError: raise except Exception: diff --git a/egs/hub4_english/s5/local/data_prep/process_1996_csr_hub4_lm_filelist.py b/egs/hub4_english/s5/local/data_prep/process_1996_csr_hub4_lm_filelist.py index 95aa7ddb831..fb5ba7a64ee 100755 --- a/egs/hub4_english/s5/local/data_prep/process_1996_csr_hub4_lm_filelist.py +++ b/egs/hub4_english/s5/local/data_prep/process_1996_csr_hub4_lm_filelist.py @@ -36,9 +36,9 @@ def get_args(): corpus (LDC98T31).""") parser.add_argument("--verbose", choices=[0,1,2,3], type=int, default=0, help="Set higher for more verbose logging.") - parser.add_argument("file_list", type=str, + parser.add_argument("file_list", help="""List of compressed source files""") - parser.add_argument("dir", type=str, + parser.add_argument("dir", help="Output directory to dump processed files to") args = parser.parse_args() @@ -83,7 +83,7 @@ def process_file_lines(lines, 
out_file_handle): for x in para.contents: try: if x.name is None: - normalized_text = normalize_text(unicode(x)) + normalized_text = normalize_text(str(x)) if len(normalized_text) == 0: continue out_file_handle.write("{0}\n".format( diff --git a/egs/hub4_english/s5/local/data_prep/process_na_news_text.py b/egs/hub4_english/s5/local/data_prep/process_na_news_text.py index 94b02a766a9..08203f7ada1 100755 --- a/egs/hub4_english/s5/local/data_prep/process_na_news_text.py +++ b/egs/hub4_english/s5/local/data_prep/process_na_news_text.py @@ -38,10 +38,10 @@ def get_args(): parser = argparse.ArgumentParser("Prepare NA News Text corpus (LDC95T21).") parser.add_argument("--verbose", type=int, choices=[0, 1, 2, 3], default=0, help="Use larger verbosity for more verbose logging.") - parser.add_argument("file_list", type=str, + parser.add_argument("file_list", help="List of compressed source files for NA News Text. " "e.g: /export/corpora/LDC/LDC95T21/na_news_1/latwp/1994") - parser.add_argument("out_file", type=str, + parser.add_argument("out_file", help="Output file to write to.") args = parser.parse_args() @@ -85,7 +85,7 @@ def process_file_lines(lines, out_file_handle): continue for para in art.find_all('p'): assert para.name == 'p' - text = ' '.join([unicode(x).strip() for x in para.contents]) + text = ' '.join([str(x).strip() for x in para.contents]) normalized_text = normalize_text(text) out_file_handle.write("{0}\n".format( normalized_text.encode('ascii'))) diff --git a/egs/hub4_english/s5/local/lm/merge_word_counts.py b/egs/hub4_english/s5/local/lm/merge_word_counts.py index 6338cbbf875..85e15d8dc07 100755 --- a/egs/hub4_english/s5/local/lm/merge_word_counts.py +++ b/egs/hub4_english/s5/local/lm/merge_word_counts.py @@ -7,6 +7,7 @@ A min-count argument is required to only write counts that are above the specified minimum count. """ +from __future__ import print_function import sys @@ -21,7 +22,7 @@ def main(): parts = line.strip().split() words[parts[1]] = words.get(parts[1], 0) + int(parts[0]) - for word, count in words.iteritems(): + for word, count in words.items(): if count >= int(sys.argv[1]): print ("{0} {1}".format(count, word)) diff --git a/egs/hub4_spanish/s5/local/chain/compare_wer.sh b/egs/hub4_spanish/s5/local/chain/compare_wer.sh new file mode 100755 index 00000000000..0194b86ac69 --- /dev/null +++ b/egs/hub4_spanish/s5/local/chain/compare_wer.sh @@ -0,0 +1,135 @@ +#!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. local/chain/compare_wer.sh exp/chain/tdnn_{c,d}_sp +# For use with discriminatively trained systems you specify the epochs after a colon: +# for instance, +# local/chain/compare_wer.sh exp/chain/tdnn_c_sp exp/chain/tdnn_c_sp_smbr:{1,2,3} + + +if [ $# == 0 ]; then + echo "Usage: $0: [--looped] [--online] [ ... ]" + echo "e.g.: $0 exp/chain/tdnn_{b,c}_sp" + echo "or (with epoch numbers for discriminative training):" + echo "$0 exp/chain/tdnn_b_sp_disc:{1,2,3}" + exit 1 +fi + +echo "# $0 $*" + +include_looped=false +if [ "$1" == "--looped" ]; then + include_looped=true + shift +fi +include_online=false +if [ "$1" == "--online" ]; then + include_online=true + shift +fi + + +used_epochs=false + +# this function set_names is used to separate the epoch-related parts of the name +# [for discriminative training] and the regular parts of the name. 
+# If called with a colon-free directory name, like: +# set_names exp/chain/tdnn_lstm1e_sp_bi_smbr +# it will set dir=exp/chain/tdnn_lstm1e_sp_bi_smbr and epoch_infix="" +# If called with something like: +# set_names exp/chain/tdnn_d_sp_smbr:3 +# it will set dir=exp/chain/tdnn_d_sp_smbr and epoch_infix="_epoch3" + + +set_names() { + if [ $# != 1 ]; then + echo "compare_wer_general.sh: internal error" + exit 1 # exit the program + fi + dirname=$(echo $1 | cut -d: -f1) + epoch=$(echo $1 | cut -s -d: -f2) + if [ -z $epoch ]; then + epoch_infix="" + else + used_epochs=true + epoch_infix=_epoch${epoch} + fi +} + + + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +strings=("#WER test ") + +for n in 0; do + echo -n "${strings[$n]}" + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + decode_names=(test) + + wer=$(cat $dirname/decode_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + if $include_looped; then + echo -n "# [looped:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + wer=$(cat $dirname/decode_looped_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi + if $include_online; then + echo -n "# [online:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + wer=$(cat ${dirname}_online/decode_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi +done + + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. +fi + + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train prob (xent)" +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob (xent)" +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Num-params " +for x in $*; do + printf "% 10s" $(grep num-parameters $x/log/progress.1.log | awk '{print $2}') +done +echo diff --git a/egs/hub4_spanish/s5/local/chain/run_cnn_tdnn.sh b/egs/hub4_spanish/s5/local/chain/run_cnn_tdnn.sh new file mode 120000 index 00000000000..ab83f3c43e8 --- /dev/null +++ b/egs/hub4_spanish/s5/local/chain/run_cnn_tdnn.sh @@ -0,0 +1 @@ +tuning/run_cnn_tdnn_1a.sh \ No newline at end of file diff --git a/egs/hub4_spanish/s5/local/chain/run_tdnn.sh b/egs/hub4_spanish/s5/local/chain/run_tdnn.sh index 211957092f9..61f8f499182 120000 --- a/egs/hub4_spanish/s5/local/chain/run_tdnn.sh +++ b/egs/hub4_spanish/s5/local/chain/run_tdnn.sh @@ -1 +1 @@ -./tuning/run_tdnn_1a.sh \ No newline at end of file +tuning/run_tdnn_1b.sh \ No newline at end of file diff --git a/egs/hub4_spanish/s5/local/chain/tuning/run_cnn_tdnn_1a.sh b/egs/hub4_spanish/s5/local/chain/tuning/run_cnn_tdnn_1a.sh new file mode 100755 index 00000000000..d1b657a2d74 --- /dev/null +++ b/egs/hub4_spanish/s5/local/chain/tuning/run_cnn_tdnn_1a.sh @@ -0,0 +1,287 @@ +#!/bin/bash + +## This is taken from mini_librispeech. 
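For reference, a usage sketch for the comparison script added above (the first command matches the results header just below; the sMBR directory in the second command is hypothetical — any chain experiment directories containing decode_* subdirectories will do):

# compare two chain systems, also printing the online-decoding WERs
local/chain/compare_wer.sh --online exp/chain/tdnn1a_sp exp/chain/cnn_tdnn1a_sp
# for a discriminatively trained system, pick the epoch after a colon
local/chain/compare_wer.sh exp/chain/tdnn1a_sp exp/chain/tdnn1a_sp_smbr:{1,2,3}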
+ +# local/chain/compare_wer.sh --online exp/chain/tdnn1a_sp exp/chain/cnn_tdnn1a_sp +# System tdnn1a_sp cnn_tdnn1a_sp +#WER test 14.19 13.47 +# [online:] 14.26 13.57 +# Final train prob -0.0707 -0.0911 +# Final valid prob -0.1225 -0.1145 +# Final train prob (xent) -1.1117 -1.3038 +# Final valid prob (xent) -1.3199 -1.3374 +# Num-params 6945216 4471200 + +# steps/info/chain_dir_info.pl exp/chain/cnn_tdnn1a_sp +# exp/chain/cnn_tdnn1a_sp: num-iters=102 nj=2..5 num-params=4.5M dim=40+100->2272 combine=-0.101->-0.097 (over 5) xent:train/valid[67,101,final]=(-1.46,-1.31,-1.30/-1.47,-1.34,-1.34) logprob:train/valid[67,101,final]=(-0.112,-0.097,-0.091/-0.129,-0.121,-0.114) + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +decode_nj=10 +train_set=train +test_sets=eval +gmm=tri5 +nnet3_affix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=1a # affix for the TDNN directory name +tree_affix= +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +# training options +# training chunk-options +chunk_width=140,100,160 +dropout_schedule='0,0@0.20,0.3@0.50,0' +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +common_egs_dir= +xent_regularize=0.1 + +# training options +srand=0 +remove_egs=true +reporting_email= + +#decode options +test_online_decoding=true # if true, it will run the last decoding stage. + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 13 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + + cnn_opts="l2-regularize=0.03" + ivector_affine_opts="l2-regularize=0.03" + tdnn_opts="l2-regularize=0.03 dropout-proportion=0.0 dropout-per-dim-continuous=true" + tdnnf_first_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.0" + tdnnf_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.03 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.03" + output_opts="l2-regularize=0.015" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # this takes the MFCCs and generates filterbank coefficients. The MFCCs + # are more compressible so we prefer to dump the MFCCs to disk rather + # than filterbanks. + idct-layer name=idct input=input dim=40 cepstral-lifter=22 affine-transform-file=$dir/configs/idct.mat + + linear-component name=ivector-linear $ivector_affine_opts dim=200 input=ReplaceIndex(ivector, t, 0) + batchnorm-component name=ivector-batchnorm target-rms=0.025 + + batchnorm-component name=idct-batchnorm input=idct + combine-feature-maps-layer name=combine_inputs input=Append(idct-batchnorm, ivector-batchnorm) num-filters1=1 num-filters2=5 height=40 + + conv-relu-batchnorm-layer name=cnn1 $cnn_opts height-in=40 height-out=40 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=48 learning-rate-factor=0.333 max-change=0.25 + conv-relu-batchnorm-layer name=cnn2 $cnn_opts height-in=40 height-out=40 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=48 + conv-relu-batchnorm-layer name=cnn3 $cnn_opts height-in=40 height-out=20 height-subsample-out=2 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=64 + conv-relu-batchnorm-layer name=cnn4 $cnn_opts height-in=20 height-out=20 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=64 + conv-relu-batchnorm-layer name=cnn5 $cnn_opts height-in=20 height-out=10 height-subsample-out=2 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=64 + conv-relu-batchnorm-layer name=cnn6 $cnn_opts height-in=10 height-out=5 height-subsample-out=2 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=128 + + # the first TDNN-F layer has no bypass (since dims don't match), and a larger bottleneck so the + # information bottleneck doesn't become a problem. (we use time-stride=0 so no splicing, to + # limit the num-parameters). 
+ tdnnf-layer name=tdnnf7 $tdnnf_first_opts dim=768 bottleneck-dim=192 time-stride=0 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf14 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf15 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + linear-component name=prefinal-l dim=192 $linear_opts + + ## adding the layers for chain branch + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + # adding the layers for xent branch + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/hub4_spanish-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=10 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.num-chunk-per-minibatch=256,128,64 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/langp_test \ + $tree_dir $dir/graph || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + nspk=$(wc -l $dir/configs/network.xconfig @@ -179,7 +179,7 @@ fi if [ $stage -le 14 ]; then if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then utils/create_split_dir.pl \ - /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/hub4_spanish-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage fi steps/nnet3/chain/train.py --stage=$train_stage \ @@ -227,6 +227,16 @@ if [ $stage -le 15 ]; then $tree_dir $dir/graph || exit 1; fi +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + nspk=$(wc -l 2272 combine=-0.105->-0.100 (over 6) xent:train/valid[67,101,final]=(-1.54,-1.34,-1.35/-1.56,-1.39,-1.39) logprob:train/valid[67,101,final]=(-0.116,-0.099,-0.094/-0.135,-0.123,-0.116) + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +decode_nj=10 +train_set=train +test_sets=eval +gmm=tri5 +nnet3_affix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=1b # affix for the TDNN directory name +tree_affix= +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +# training options +# training chunk-options +chunk_width=140,100,160 +dropout_schedule='0,0@0.20,0.3@0.50,0' +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +common_egs_dir= +xent_regularize=0.1 + +# training options +srand=0 +remove_egs=true +reporting_email= + +#decode options +test_online_decoding=true # if true, it will run the last decoding stage. + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 13 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + + tdnn_opts="l2-regularize=0.03 dropout-proportion=0.0 dropout-per-dim-continuous=true" + tdnnf_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.03 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.03" + output_opts="l2-regularize=0.015" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $tdnn_opts dim=768 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + linear-component name=prefinal-l dim=192 $linear_opts + + ## adding the layers for chain branch + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + # adding the layers for xent branch + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/hub4_spanish-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=10 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.num-chunk-per-minibatch=256,128,64 \ + --egs.cmd="run.pl --max-jobs-run 12" \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/langp_test \ + $tree_dir $dir/graph || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + nspk=$(wc -l is written as a word if(w[0].lower() == ""): - f.write("%s\t\n" % (unicode(w[0]))) + f.write("%s\t\n" % (str(w[0]))) else: - f.write("%s\t%s\n" % (unicode(w[0]), + f.write("%s\t%s\n" % (str(w[0]), encoded_transcription[idx])) if __name__ == "__main__": diff --git a/egs/hub4_spanish/s5/local/prepare_unicode_dict.py b/egs/hub4_spanish/s5/local/prepare_unicode_dict.py index 86fa4d60ba1..3b9dc1abd86 100755 --- a/egs/hub4_spanish/s5/local/prepare_unicode_dict.py +++ b/egs/hub4_spanish/s5/local/prepare_unicode_dict.py @@ -89,7 +89,7 @@ def extract_phonemes(lexicon): # Read all baseform units into dictionary with {a: [a, a_1, a_2], # b: [b_1, b_3], ...} phonemes_dict = {} - for word, pron in lexicon.iteritems(): + for word, pron in lexicon.items(): for p in pron.split(): try: base = p.split("_",1)[0] @@ -98,11 +98,11 @@ def extract_phonemes(lexicon): phonemes_dict[base] = [p] # Makes sure there are no repeats in the list - phonemes_dict = {k: set(v) for k, v in phonemes_dict.iteritems()} + phonemes_dict = {k: set(v) for k, v in phonemes_dict.items()} # Get all unique phonemes phonemes = [] - for v in phonemes_dict.itervalues(): + for v in phonemes_dict.values(): for p in v: phonemes.append(p) @@ -137,11 +137,11 @@ def write_extra_questions(nonsil_phonemes, nonsil_phonemes_dict, # Write all possible phone_tag combinations that occur in the lexicon for tag in tags: - for p in nonsil_phonemes_dict.iterkeys(): + for p in nonsil_phonemes_dict.keys(): tagged_phoneme = "_".join([p, tag]) if(tagged_phoneme in nonsil_phonemes_dict[p]): fp.write("%s " % tagged_phoneme) - 
for p in sil_phonemes_dict.iterkeys(): + for p in sil_phonemes_dict.keys(): tagged_phoneme = "_".join([p, tag]) if(tagged_phoneme in sil_phonemes_dict[p]): fp.write("%s " % tagged_phoneme) diff --git a/egs/iam/v1/RESULTS b/egs/iam/v1/RESULTS new file mode 100644 index 00000000000..b25cb3cd772 --- /dev/null +++ b/egs/iam/v1/RESULTS @@ -0,0 +1,42 @@ +Run_end2end.sh (WER using lang_test, lang_unk) +flat_start: + • %WER 14.41 [ 2671 / 18542, 262 ins, 561 del, 1848 sub ] exp/chain/e2e_cnn_1a/decode_test/wer_11_1.0 + • %WER 15.21 [ 2821 / 18542, 375 ins, 500 del, 1946 sub ] exp/chain/e2e_cnn_1a/decode_test/wer_11_1.0 + +cnn_e2eali_1a: + • %WER 11.94 [ 2214 / 18542, 267 ins, 380 del, 1567 sub ] exp/chain/cnn_e2eali_1a/decode_test/wer_9_1.0 + • %WER 13.30 [ 2467 / 18542, 441 ins, 330 del, 1696 sub ] exp/chain/cnn_e2eali_1a/decode_test/wer_9_0.5 + +cnn_e2eali_1b: + • %WER 11.20 [ 2076 / 18542, 260 ins, 335 del, 1481 sub ] exp/chain/cnn_e2eali_1b/decode_test/wer_9_1.0 + • %WER 12.46 [ 2311 / 18542, 371 ins, 326 del, 1614 sub ] exp/chain/cnn_e2eali_1b/decode_test/wer_9_1.0 + +cnn_e2eali_1c: + • %WER 9.90 [ 1836 / 18542, 257 ins, 227 del, 1352 sub ] exp/chain/cnn_e2eali_1c/decode_test/wer_10_1.0 + • %WER 12.10 [ 2243 / 18542, 411 ins, 269 del, 1563 sub ] exp/chain/cnn_e2eali_1c/decode_test/wer_12_0.5 + + +Run.sh (WER using lang_test, lang_unk) +cnn_1a: + • %WER 15.18 [ 2815 / 18542, 285 ins, 509 del, 2021 sub ] exp/chain/cnn_1a/decode_test/wer_11_0.0 + • %WER 16.88 [ 3130 / 18542, 444 ins, 611 del, 2075 sub ] exp/chain/cnn_1a/decode_test/wer_11_0.0 + +cnn_chainali_1a: + • %WER 14.09 [ 2612 / 18542, 245 ins, 505 del, 1862 sub ] exp/chain/cnn_chainali_1a/decode_test/wer_13_0.0 + • %WER 15.93 [ 2954 / 18542, 454 ins, 470 del, 2030 sub ] exp/chain/cnn_chainali_1a/decode_test/wer_10_0.0 + +cnn_chainali_1b: + • %WER 13.29 [ 2465 / 18542, 221 ins, 499 del, 1745 sub ] exp/chain/cnn_chainali_1b/decode_test/wer_12_0.5 + • %WER 15.09 [ 2798 / 18542, 418 ins, 468 del, 1912 sub ] exp/chain/cnn_chainali_1b/decode_test/wer_10_0.5 + +cnn_chainali_1c: + • %WER 11.59 [ 2149 / 18542, 276 ins, 362 del, 1511 sub ] exp/chain/cnn_chainali_1c/decode_test/wer_9_0.0 + • %WER 13.75 [ 2550 / 18542, 465 ins, 368 del, 1717 sub ] exp/chain/cnn_chainali_1c/decode_test/wer_8_0.0 + +cnn_chainali_1d: + • %WER 11.07 [ 2053 / 18542, 261 ins, 311 del, 1481 sub ] exp/chain/cnn_chainali_1c/decode_test/wer_9_0.0 + • %WER 12.95 [ 2402 / 18542, 436 ins, 313 del, 1653 sub ] exp/chain/cnn_chainali_1c/decode_test/wer_8_0.0 + +cnn_chainali_1e: + • %WER 10.03 [ 1859 / 18542, 226 ins, 291 del, 1342 sub ] exp/chain/cnn_chainali_1e/decode_test/wer_11_0.5 + %WER 12.15 [ 2253 / 18542, 406 ins, 282 del, 1565 sub ] exp/chain/cnn_chainali_1e/decode_test/wer_10_0.5 diff --git a/egs/iam/v1/local/augment_data.sh b/egs/iam/v1/local/augment_data.sh new file mode 100755 index 00000000000..31e4a8217ca --- /dev/null +++ b/egs/iam/v1/local/augment_data.sh @@ -0,0 +1,34 @@ +#!/bin/bash +# Copyright 2018 Hossein Hadian +# 2018 Ashish Arora + +# Apache 2.0 +# This script performs data augmentation. + +nj=4 +cmd=run.pl +feat_dim=40 +echo "$0 $@" + +. ./cmd.sh +. ./path.sh +. 
./utils/parse_options.sh || exit 1; + +srcdir=$1 +outdir=$2 +datadir=$3 +aug_set=aug1 +mkdir -p $datadir/augmentations +echo "copying $srcdir to $datadir/augmentations/$aug_set, allowed length, creating feats.scp" + +for set in $aug_set; do + image/copy_data_dir.sh --spk-prefix $set- --utt-prefix $set- \ + $srcdir $datadir/augmentations/$set + cat $srcdir/allowed_lengths.txt > $datadir/augmentations/$set/allowed_lengths.txt + local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim $feat_dim \ + --fliplr false --augment true $datadir/augmentations/$set +done + +echo " combine original data and data from different augmentations" +utils/combine_data.sh --extra-files images.scp $outdir $srcdir $datadir/augmentations/$aug_set +cat $srcdir/allowed_lengths.txt > $outdir/allowed_lengths.txt diff --git a/egs/iam/v1/local/chain/compare_wer.sh b/egs/iam/v1/local/chain/compare_wer.sh index ad90710b13f..4a2cc29481c 100755 --- a/egs/iam/v1/local/chain/compare_wer.sh +++ b/egs/iam/v1/local/chain/compare_wer.sh @@ -34,6 +34,20 @@ for x in $*; do done echo +echo -n "# WER val " +for x in $*; do + wer=$(cat $x/decode_val/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "# CER val " +for x in $*; do + cer=$(cat $x/decode_val/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + if $used_epochs; then exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. fi diff --git a/egs/iam/v1/local/chain/run_cnn.sh b/egs/iam/v1/local/chain/run_cnn.sh new file mode 120000 index 00000000000..df6f0a468c1 --- /dev/null +++ b/egs/iam/v1/local/chain/run_cnn.sh @@ -0,0 +1 @@ +tuning/run_cnn_1a.sh \ No newline at end of file diff --git a/egs/iam/v1/local/chain/run_cnn_chainali.sh b/egs/iam/v1/local/chain/run_cnn_chainali.sh new file mode 120000 index 00000000000..41b712609c2 --- /dev/null +++ b/egs/iam/v1/local/chain/run_cnn_chainali.sh @@ -0,0 +1 @@ +tuning/run_cnn_chainali_1d.sh \ No newline at end of file diff --git a/egs/iam/v1/local/chain/run_cnn_e2eali.sh b/egs/iam/v1/local/chain/run_cnn_e2eali.sh new file mode 120000 index 00000000000..ad51803ab0e --- /dev/null +++ b/egs/iam/v1/local/chain/run_cnn_e2eali.sh @@ -0,0 +1 @@ +tuning/run_cnn_e2eali_1c.sh \ No newline at end of file diff --git a/egs/iam/v1/local/chain/run_e2e_cnn.sh b/egs/iam/v1/local/chain/run_e2e_cnn.sh new file mode 120000 index 00000000000..d26ba0182ce --- /dev/null +++ b/egs/iam/v1/local/chain/run_e2e_cnn.sh @@ -0,0 +1 @@ +tuning/run_e2e_cnn_1a.sh \ No newline at end of file diff --git a/egs/iam/v1/local/chain/run_cnn_1a.sh b/egs/iam/v1/local/chain/tuning/run_cnn_1a.sh similarity index 80% rename from egs/iam/v1/local/chain/run_cnn_1a.sh rename to egs/iam/v1/local/chain/tuning/run_cnn_1a.sh index 41a76920e37..ef1273f3961 100755 --- a/egs/iam/v1/local/chain/run_cnn_1a.sh +++ b/egs/iam/v1/local/chain/tuning/run_cnn_1a.sh @@ -4,23 +4,23 @@ # 2017 Chun Chieh Chang # 2017 Ashish Arora -# steps/info/chain_dir_info.pl exp/chain/cnn_1a/ -# exp/chain/cnn_1a/: num-iters=21 nj=2..4 num-params=4.4M dim=40->364 combine=-0.021->-0.015 xent:train/valid[13,20,final]=(-1.05,-0.701,-0.591/-1.30,-1.08,-1.00) logprob:train/valid[13,20,final]=(-0.061,-0.034,-0.030/-0.107,-0.101,-0.098) - # local/chain/compare_wer.sh exp/chain/cnn_1a/ -# System cnn_1a -# WER 18.52 -# CER 10.07 -# Final train prob -0.0077 -# Final valid prob -0.0970 -# Final train prob (xent) -0.5484 -# Final valid prob (xent) -0.9643 -# Parameters 4.36M +# System cnn_1a(dict_50k) cnn_1a(dict_50k + unk 
model) +# WER 16.88 15.18 +# CER 8.52 7.58 +# WER val 16.17 13.53 +# CER val 7.15 5.89 +# Final train prob -0.0299 +# Final valid prob -0.0574 +# Final train prob (xent) -0.3912 +# Final valid prob (xent) -0.6439 +# Parameters 4.36M -set -e -o pipefail +# steps/info/chain_dir_info.pl exp/chain/cnn_1a/ +# exp/chain/cnn_1a/: num-iters=42 nj=2..4 num-params=4.4M dim=40->368 combine=-0.029->-0.029 (over 2) xent:train/valid[27,41,final]=(-0.522,-0.394,-0.391/-0.695,-0.644,-0.644) logprob:train/valid[27,41,final]=(-0.035,-0.030,-0.030/-0.056,-0.057,-0.057) +set -e -o pipefail stage=0 - nj=30 train_set=train gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it @@ -34,28 +34,21 @@ reporting_email= # chain options train_stage=-10 xent_regularize=0.1 -frame_subsampling_factor=4 -alignment_subsampling_factor=1 # training chunk-options chunk_width=340,300,200,100 num_leaves=500 -# we don't need extra left/right context for TDNN systems. -chunk_left_context=0 -chunk_right_context=0 tdnn_dim=450 # training options -srand=0 -remove_egs=false -lang_test=lang_unk +lang_decode=lang_unk +decode_val=true +if $decode_val; then maybe_val=val; else maybe_val= ; fi # End configuration section. echo "$0 $@" # Print the command line for logging - . ./cmd.sh . ./path.sh . ./utils/parse_options.sh - if ! cuda-compiled; then cat <368 combine=-0.020->-0.020 (over 2) xent:train/valid[27,41,final]=(-0.534,-0.425,-0.424/-0.659,-0.612,-0.612) logprob:train/valid[27,41,final]=(-0.026,-0.022,-0.022/-0.017,-0.016,-0.016) +set -e -o pipefail + +stage=0 +nj=30 +train_set=train +decode_val=true +gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +ali=tri3_ali +chain_model_dir=exp/chain${nnet3_affix}/cnn_1a +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +chunk_width=340,300,200,100 +num_leaves=500 +tdnn_dim=450 +lang_decode=lang_unk +if $decode_val; then maybe_val=val; else maybe_val= ; fi +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --acoustic-scale 1.0 \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + $train_data_dir data/lang $chain_model_dir $lat_dir + cp $gmm_lat_dir/splice_opts $lat_dir/splice_opts +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 4 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves $train_data_dir \ + $lang $ali_dir $tree_dir +fi + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + common1="height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn4 input=Append(-4,0,4) dim=$tdnn_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn4 dim=$tdnn_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=500" \ + --chain.frame-subsampling-factor=4 \ + --chain.alignment-subsampling-factor=1 \ + --trainer.srand=0 \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=4 \ + --trainer.frames-per-iter=1000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=4 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=64,32 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=false \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/$lang_decode \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + for decode_set in test $maybe_val; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/$decode_set $dir/decode_$decode_set || exit 1; + done +fi + +echo "$0 Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/iam/v1/local/chain/run_cnn_chainali_1b.sh b/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1b.sh similarity index 79% rename from egs/iam/v1/local/chain/run_cnn_chainali_1b.sh rename to egs/iam/v1/local/chain/tuning/run_cnn_chainali_1b.sh index c6876fbafcb..401ffa14e19 100755 --- a/egs/iam/v1/local/chain/run_cnn_chainali_1b.sh +++ b/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1b.sh @@ -1,27 +1,26 @@ #!/bin/bash # chainali_1b is as chainali_1a except it has 3 more cnn layers and 1 less tdnn layer. 
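The two columns in the refreshed result headers below reflect the two decoding setups used across these IAM recipes: a graph built from the plain 50k-word dictionary (presumably data/lang_test) versus one that also carries the unk model (data/lang_unk, the script's default lang_decode). Since the decode lang is now an ordinary variable parsed by utils/parse_options.sh, either column should be reproducible without editing the script; a hedged sketch, assuming training has already finished so only the graph and decode stages are rerun:

# decode with the 50k dictionary graph only
local/chain/tuning/run_cnn_chainali_1b.sh --stage 6 --lang-decode lang_test
# default: 50k dictionary plus the unk model
local/chain/tuning/run_cnn_chainali_1b.sh --stage 6 --lang-decode lang_unk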
- -# local/chain/compare_wer.sh exp/chain/cnn_1a/ exp/chain/cnn_chainali_1b/ -# System cnn_1a cnn_chainali_1b -# WER 18.52 14.38 -# CER 10.07 7.14 -# Final train prob -0.0077 -0.0113 -# Final valid prob -0.0970 -0.0400 -# Final train prob (xent) -0.5484 -0.6043 -# Final valid prob (xent) -0.9643 -0.9030 -# Parameters 4.36M 3.96M +# local/chain/compare_wer.sh exp/chain/cnn_chainali_1b +# System cnn_chainali_1b(dict_50k) cnn_chainali_1b(dict_50k + unk_model) +# WER 15.09 13.29 +# CER 7.13 6.08 +# WER val 14.80 11.98 +# CER val 6.16 4.87 +# Final train prob -0.0225 +# Final valid prob -0.0132 +# Final train prob (xent) -0.4466 +# Final valid prob (xent) -0.6048 +# Parameters 3.96M # steps/info/chain_dir_info.pl exp/chain/chainali_cnn_1b/ -# exp/chain/chainali_cnn_1b/: num-iters=21 nj=2..4 num-params=4.0M dim=40->364 combine=-0.009->-0.005 xent:train/valid[13,20,final]=(-1.47,-0.728,-0.623/-1.69,-1.02,-0.940) logprob:train/valid[13,20,final]=(-0.068,-0.030,-0.011/-0.086,-0.056,-0.038) - +# exp/chain/cnn_chainali_1b: num-iters=42 nj=2..4 num-params=4.0M dim=40->368 combine=-0.019->-0.019 (over 2) xent:train/valid[27,41,final]=(-0.545,-0.448,-0.447/-0.645,-0.605,-0.605) logprob:train/valid[27,41,final]=(-0.026,-0.023,-0.023/-0.014,-0.013,-0.013) set -e -o pipefail - stage=0 - nj=30 train_set=train +decode_val=true gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it # should have alignments for the specified training data. nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. @@ -31,31 +30,20 @@ chain_model_dir=exp/chain${nnet3_affix}/cnn_1a common_egs_dir= reporting_email= -# chain options train_stage=-10 xent_regularize=0.1 -frame_subsampling_factor=4 -alignment_subsampling_factor=1 -# training chunk-options chunk_width=340,300,200,100 num_leaves=500 -# we don't need extra left/right context for TDNN systems. -chunk_left_context=0 -chunk_right_context=0 tdnn_dim=450 -# training options -srand=0 -remove_egs=false -lang_test=lang_unk +lang_decode=lang_unk +if $decode_val; then maybe_val=val; else maybe_val= ; fi # End configuration section. echo "$0 $@" # Print the command line for logging - . ./cmd.sh . ./path.sh . ./utils/parse_options.sh - if ! cuda-compiled; then cat < $dir/configs/network.xconfig input dim=40 name=input - conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 @@ -160,7 +145,6 @@ if [ $stage -le 4 ]; then ## adding the layers for chain branch relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 - # adding the layers for xent branch # This block prints the configs for a separate output that will be # trained with a cross-entropy objective in the 'chain' mod?els... 
this @@ -191,9 +175,9 @@ if [ $stage -le 5 ]; then --chain.l2-regularize=0.00005 \ --chain.apply-deriv-weights=false \ --chain.lm-opts="--num-extra-lm-states=500" \ - --chain.frame-subsampling-factor=$frame_subsampling_factor \ - --chain.alignment-subsampling-factor=$alignment_subsampling_factor \ - --trainer.srand=$srand \ + --chain.frame-subsampling-factor=4 \ + --chain.alignment-subsampling-factor=1 \ + --trainer.srand=0 \ --trainer.max-param-change=2.0 \ --trainer.num-epochs=4 \ --trainer.frames-per-iter=1000000 \ @@ -203,15 +187,10 @@ if [ $stage -le 5 ]; then --trainer.optimization.final-effective-lrate=0.0001 \ --trainer.optimization.shrink-value=1.0 \ --trainer.num-chunk-per-minibatch=64,32 \ - --trainer.optimization.momentum=0.0 \ --egs.chunk-width=$chunk_width \ - --egs.chunk-left-context=$chunk_left_context \ - --egs.chunk-right-context=$chunk_right_context \ - --egs.chunk-left-context-initial=0 \ - --egs.chunk-right-context-final=0 \ --egs.dir="$common_egs_dir" \ --egs.opts="--frames-overlap-per-eg 0" \ - --cleanup.remove-egs=$remove_egs \ + --cleanup.remove-egs=false \ --use-gpu=true \ --reporting.email="$reporting_email" \ --feat-dir=$train_data_dir \ @@ -227,20 +206,20 @@ if [ $stage -le 6 ]; then # topology file from the model). So you could give it a different # lang directory, one that contained a wordlist and LM of your choice, # as long as phones.txt was compatible. - utils/mkgraph.sh \ - --self-loop-scale 1.0 data/$lang_test \ + --self-loop-scale 1.0 data/$lang_decode \ $dir $dir/graph || exit 1; fi if [ $stage -le 7 ]; then frames_per_chunk=$(echo $chunk_width | cut -d, -f1) - steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context $chunk_left_context \ - --extra-right-context $chunk_right_context \ - --extra-left-context-initial 0 \ - --extra-right-context-final 0 \ - --frames-per-chunk $frames_per_chunk \ - --nj $nj --cmd "$cmd" \ - $dir/graph data/test $dir/decode_test || exit 1; + for decode_set in test $maybe_val; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/$decode_set $dir/decode_$decode_set || exit 1; + done fi + +echo "$0 Done. Date: $(date). 
Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/iam/v1/local/chain/run_cnn_chainali_1c.sh b/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1c.sh similarity index 80% rename from egs/iam/v1/local/chain/run_cnn_chainali_1c.sh rename to egs/iam/v1/local/chain/tuning/run_cnn_chainali_1c.sh index 54c52d913de..17209b9204f 100755 --- a/egs/iam/v1/local/chain/run_cnn_chainali_1c.sh +++ b/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1c.sh @@ -1,25 +1,25 @@ #!/bin/bash # chainali_1c is as chainali_1b except it uses l2-regularize -# local/chain/compare_wer.sh exp/chain/cnn_chainali_1b exp/chain/cnn_chainali_1c -# System cnn_chainali_1b cnn_chainali_1c -# WER 14.38 12.72 -# CER 7.14 5.99 -# Final train prob -0.0113 -0.0291 -# Final valid prob -0.0400 -0.0359 -# Final train prob (xent) -0.6043 -0.9781 -# Final valid prob (xent) -0.9030 -1.1544 -# Parameters 3.96M 3.96M +# local/chain/compare_wer.sh exp/chain/cnn_chainali_1c +# System cnn_chainali_1c (dict_50k) cnn_chainali_1c(dict_50k + unk_model) +# WER 12.95 11.07 +# CER 6.04 4.91 +# WER val 12.75 9.78 +# CER val 5.15 3.74 +# Final train prob -0.0217 +# Final valid prob -0.0060 +# Final train prob (xent) -0.8303 +# Final valid prob (xent) -0.8665 +# Parameters 3.96M # steps/info/chain_dir_info.pl exp/chain/cnn_chainali_1c -# exp/chain/cnn_chainali_1c: num-iters=21 nj=2..4 num-params=4.0M dim=40->369 combine=-0.007->-0.007 (over 1) xent:train/valid[13,20,final]=(-1.44,-1.05,-0.997/-1.53,-1.19,-1.15) logprob:train/valid[13,20,final]=(-0.056,-0.020,-0.012/-0.056,-0.025,-0.020) - +# exp/chain/cnn_chainali_1c/: num-iters=42 nj=2..4 num-params=4.0M dim=40->368 combine=-0.018->-0.018 (over 1) xent:train/valid[27,41,final]=(-1.22,-0.847,-0.830/-1.19,-0.880,-0.867) logprob:train/valid[27,41,final]=(-0.045,-0.025,-0.022/-0.026,-0.010,-0.006) set -e -o pipefail - stage=0 - nj=30 train_set=train +decode_val=true gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it # should have alignments for the specified training data. nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. @@ -29,30 +29,20 @@ chain_model_dir=exp/chain${nnet3_affix}/cnn_1a common_egs_dir= reporting_email= -# chain options train_stage=-10 xent_regularize=0.1 -frame_subsampling_factor=4 -# training chunk-options chunk_width=340,300,200,100 num_leaves=500 -# we don't need extra left/right context for TDNN systems. -chunk_left_context=0 -chunk_right_context=0 tdnn_dim=450 -# training options -srand=0 -remove_egs=false -lang_test=lang_unk +lang_decode=lang_unk +if $decode_val; then maybe_val=val; else maybe_val= ; fi # End configuration section. echo "$0 $@" # Print the command line for logging - . ./cmd.sh . ./path.sh . ./utils/parse_options.sh - if ! cuda-compiled; then cat <376 combine=-0.002->-0.002 (over 1) xent:train/valid[13,20,final]=(-1.66,-1.01,-0.865/-1.72,-1.12,-1.01) logprob:train/valid[13,20,final]=(-0.058,-0.019,-0.004/-0.055,-0.027,-0.013) - +# exp/chain/cnn_chainali_1d/: num-iters=42 nj=2..4 num-params=4.0M dim=40->368 combine=-0.018->-0.018 (over 1) xent:train/valid[27,41,final]=(-1.22,-0.847,-0.830/-1.19,-0.880,-0.867) logprob:train/valid[27,41,final]=(-0.045,-0.025,-0.022/-0.026,-0.010,-0.006) set -e -o pipefail stage=0 - nj=30 train_set=train gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it # should have alignments for the specified training data. nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. -affix=_1c_uc #affix for TDNN+LSTM directory e.g. 
"1a" or "1b", in case we change the configuration. +affix=_1d #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. ali=tri3_ali -chain_model_dir=exp/chain${nnet3_affix}/cnn_1a_uc +chain_model_dir=exp/chain${nnet3_affix}/cnn_1a common_egs_dir= reporting_email= # chain options train_stage=-10 xent_regularize=0.1 -frame_subsampling_factor=4 # training chunk-options chunk_width=340,300,200,100 num_leaves=500 -# we don't need extra left/right context for TDNN systems. -chunk_left_context=0 -chunk_right_context=0 tdnn_dim=450 -# training options -srand=0 -remove_egs=false -lang_test=lang_unk +lang_decode=lang_unk +decode_val=true +if $decode_val; then maybe_val=val; else maybe_val= ; fi + # End configuration section. echo "$0 $@" # Print the command line for logging - . ./cmd.sh . ./path.sh . ./utils/parse_options.sh - if ! cuda-compiled; then cat < $dir/configs/network.xconfig input dim=40 name=input - conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 @@ -157,7 +147,6 @@ if [ $stage -le 4 ]; then relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim $tdnn_opts relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts - ## adding the layers for chain branch relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts @@ -192,11 +181,11 @@ if [ $stage -le 5 ]; then --chain.l2-regularize=0.00005 \ --chain.apply-deriv-weights=false \ --chain.lm-opts="--num-extra-lm-states=500" \ - --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.frame-subsampling-factor=4 \ --chain.alignment-subsampling-factor=1 \ --chain.left-tolerance 3 \ --chain.right-tolerance 3 \ - --trainer.srand=$srand \ + --trainer.srand=0 \ --trainer.max-param-change=2.0 \ --trainer.num-epochs=4 \ --trainer.frames-per-iter=1000000 \ @@ -206,15 +195,10 @@ if [ $stage -le 5 ]; then --trainer.optimization.final-effective-lrate=0.0001 \ --trainer.optimization.shrink-value=1.0 \ --trainer.num-chunk-per-minibatch=64,32 \ - --trainer.optimization.momentum=0.0 \ --egs.chunk-width=$chunk_width \ - --egs.chunk-left-context=$chunk_left_context \ - --egs.chunk-right-context=$chunk_right_context \ - --egs.chunk-left-context-initial=0 \ - --egs.chunk-right-context-final=0 \ --egs.dir="$common_egs_dir" \ --egs.opts="--frames-overlap-per-eg 0 --constrained false" \ - --cleanup.remove-egs=$remove_egs \ + --cleanup.remove-egs=false \ --use-gpu=true \ --reporting.email="$reporting_email" \ --feat-dir=$train_data_dir \ @@ -230,20 +214,20 @@ if [ $stage -le 6 ]; then # topology file from the model). So you could give it a different # lang directory, one that contained a wordlist and LM of your choice, # as long as phones.txt was compatible. 
- utils/mkgraph.sh \ - --self-loop-scale 1.0 data/$lang_test \ + --self-loop-scale 1.0 data/$lang_decode \ $dir $dir/graph || exit 1; fi if [ $stage -le 7 ]; then frames_per_chunk=$(echo $chunk_width | cut -d, -f1) - steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context $chunk_left_context \ - --extra-right-context $chunk_right_context \ - --extra-left-context-initial 0 \ - --extra-right-context-final 0 \ - --frames-per-chunk $frames_per_chunk \ - --nj $nj --cmd "$cmd" \ - $dir/graph data/test $dir/decode_test || exit 1; + for decode_set in test $maybe_val; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/$decode_set $dir/decode_$decode_set || exit 1; + done fi + +echo "$0 Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1a.sh b/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1a.sh new file mode 100755 index 00000000000..703d404159a --- /dev/null +++ b/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1a.sh @@ -0,0 +1,229 @@ +#!/bin/bash + +# local/chain/compare_wer.sh exp/chain/cnn_e2eali_1a +# System cnn_e2eali_1a_(dict_50k) cnn_e2eali_1a_(dict_50k + unk model) +# WER 13.30 11.94 +# CER 5.95 5.15 +# WER val 12.85 10.71 +# CER val 5.09 4.03 +# Final train prob -0.0562 +# Final valid prob -0.0634 +# Final train prob (xent) -0.8196 +# Final valid prob (xent) -0.8816 +# Parameters 3.96M + +# steps/info/chain_dir_info.pl exp/chain/cnn_e2eali_1a +# exp/chain/cnn_e2eali_1a: num-iters=42 nj=2..4 num-params=4.0M dim=40->368 combine=-0.058->-0.058 (over 1) xent:train/valid[27,41,final]=(-2.67,-0.841,-0.820/-2.71,-0.892,-0.882) logprob:train/valid[27,41,final]=(-0.240,-0.060,-0.056/-0.245,-0.068,-0.063) + +set -e -o pipefail + +stage=0 +nj=30 +train_set=train +decode_val=true +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +e2echain_model_dir=exp/chain/e2e_cnn_1a +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +frame_subsampling_factor=4 +chunk_width=340,300,200,100 +num_leaves=500 +tdnn_dim=450 +remove_egs=true +lang_decode=lang_unk +if $decode_val; then maybe_val=val; else maybe_val= ; fi +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --acoustic-scale 1.0 \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + ${train_data_dir} data/lang $e2echain_model_dir $lat_dir + echo "" >$lat_dir/splice_opts +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 4 \ + --alignment-subsampling-factor 1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves $train_data_dir \ + $lang $ali_dir $tree_dir +fi + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + cnn_opts="l2-regularize=0.075" + tdnn_opts="l2-regularize=0.075" + output_opts="l2-regularize=0.1" + common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 + relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=500" \ + --chain.frame-subsampling-factor=4 \ + --chain.alignment-subsampling-factor=1 \ + --chain.left-tolerance 3 \ + --chain.right-tolerance 3 \ + --trainer.srand=0 \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=4 \ + --trainer.frames-per-iter=1000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=4 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=64,32 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/$lang_decode \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + for decode_set in test $maybe_val; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/$decode_set $dir/decode_$decode_set || exit 1; + done +fi + +echo "$0 Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1b.sh b/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1b.sh new file mode 100755 index 00000000000..905c4661477 --- /dev/null +++ b/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1b.sh @@ -0,0 +1,221 @@ +#!/bin/bash + +# e2eali_1b is the same as e2eali_1a but uses unconstrained egs +# local/chain/compare_wer.sh exp/chain/cnn_e2eali_1b +# System cnn_e2eali_1b (dict_50k) cnn_e2eali_1b (dict_50k + unk model) +# WER 12.46 11.20 +# CER 5.53 4.76 +# WER val 12.71 10.49 +# CER val 4.97 3.92 +# Final train prob -0.0381 +# Final valid prob -0.0443 +# Final train prob (xent) -0.7860 +# Final valid prob (xent) -0.8290 +# Parameters 3.96M + +# steps/info/chain_dir_info.pl exp/chain/cnn_e2eali_1b +# exp/chain/cnn_e2eali_1b: num-iters=42 nj=2..4 num-params=4.0M dim=40->368 combine=-0.039->-0.039 (over 2) xent:train/valid[27,41,final]=(-1.19,-0.805,-0.786/-1.19,-0.846,-0.829) logprob:train/valid[27,41,final]=(-0.060,-0.041,-0.038/-0.062,-0.048,-0.044) + +set -e -o pipefail +stage=0 +nj=30 +train_set=train +decode_val=true +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1b #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. 
+e2echain_model_dir=exp/chain/e2e_cnn_1a +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +chunk_width=340,300,200,100 +num_leaves=500 +tdnn_dim=450 +lang_decode=lang_unk +if $decode_val; then maybe_val=val; else maybe_val= ; fi +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --acoustic-scale 1.0 \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + $train_data_dir data/lang $e2echain_model_dir $lat_dir + echo "" >$lat_dir/splice_opts +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 4 \ + --alignment-subsampling-factor 1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves $train_data_dir \ + $lang $ali_dir $tree_dir +fi + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + cnn_opts="l2-regularize=0.075" + tdnn_opts="l2-regularize=0.075" + output_opts="l2-regularize=0.1" + common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 + relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy 
objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=500" \ + --chain.frame-subsampling-factor=4 \ + --chain.alignment-subsampling-factor=1 \ + --chain.left-tolerance 3 \ + --chain.right-tolerance 3 \ + --trainer.srand=0 \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=4 \ + --trainer.frames-per-iter=1000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=4 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=64,32 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0 --constrained false" \ + --cleanup.remove-egs=true \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/$lang_decode \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + for decode_set in test $maybe_val; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/$decode_set $dir/decode_$decode_set || exit 1; + done +fi + +echo "$0 Done. Date: $(date). 
Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1c.sh b/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1c.sh new file mode 100755 index 00000000000..26b1aca0929 --- /dev/null +++ b/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1c.sh @@ -0,0 +1,224 @@ +#!/bin/bash + +# e2eali_1c is the same as e2eali_1b but has more CNN layers, different filter size +# smaller lm-opts, minibatch, frams-per-iter, less epochs and more initial/finaljobs. +# local/chain/compare_wer.sh exp/chain/cnn_e2eali_1c +# System cnn_e2eali_1c (dict_50k) cnn_e2eali_1c(dict_50k + unk_model) +# WER 12.10 9.90 +# CER 5.23 4.16 +# WER val 12.15 9.60 +# CER val 4.78 3.56 +# Final train prob -0.0470 +# Final valid prob -0.0657 +# Final train prob (xent) -0.4713 +# Final valid prob (xent) -0.5437 +# Parameters 4.32M + +# steps/info/chain_dir_info.pl exp/chain/cnn_e2eali_1c +# exp/chain/cnn_e2eali_1c: num-iters=30 nj=3..5 num-params=4.3M dim=40->368 combine=-0.051->-0.051 (over 1) xent:train/valid[19,29,final]=(-0.722,-0.500,-0.471/-0.748,-0.568,-0.544) logprob:train/valid[19,29,final]=(-0.090,-0.053,-0.047/-0.106,-0.071,-0.066) +set -e -o pipefail + +stage=0 +nj=30 +train_set=train +decode_val=true +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1c #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +e2echain_model_dir=exp/chain/e2e_cnn_1a +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +chunk_width=340,300,200,100 +num_leaves=500 +tdnn_dim=550 +lang_decode=data/lang_unk +if $decode_val; then maybe_val=val; else maybe_val= ; fi +dropout_schedule='0,0@0.20,0.2@0.50,0' +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --acoustic-scale 1.0 \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + $train_data_dir data/lang $e2echain_model_dir $lat_dir + echo "" >$lat_dir/splice_opts +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 4 \ + --alignment-subsampling-factor 1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves $train_data_dir \ + $lang $ali_dir $tree_dir +fi + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + cnn_opts="l2-regularize=0.03 dropout-proportion=0.0" + tdnn_opts="l2-regularize=0.03" + output_opts="l2-regularize=0.04" + common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + conv-relu-batchnorm-dropout-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-dropout-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-dropout-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-dropout-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-dropout-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common3 height-subsample-out=2 + conv-relu-batchnorm-dropout-layer name=cnn6 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + relu-batchnorm-dropout-layer name=tdnn1 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + relu-batchnorm-dropout-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + relu-batchnorm-dropout-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=true \ + --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=1000" \ + --chain.frame-subsampling-factor=4 \ + --chain.alignment-subsampling-factor=1 \ + --chain.left-tolerance 3 \ + --chain.right-tolerance 3 \ + --trainer.srand=0 \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=5 \ + --trainer.frames-per-iter=1500000 \ + --trainer.optimization.num-jobs-initial=3 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=32,16 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0 --constrained false" \ + --cleanup.remove-egs=true \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + utils/mkgraph.sh \ + --self-loop-scale 1.0 $lang_decode \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + for decode_set in test $maybe_val; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/$decode_set $dir/decode_$decode_set || exit 1; + done +fi + +echo "$0 Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/iam/v1/local/chain/tuning/run_e2e_cnn_1a.sh b/egs/iam/v1/local/chain/tuning/run_e2e_cnn_1a.sh new file mode 100755 index 00000000000..462ad0522de --- /dev/null +++ b/egs/iam/v1/local/chain/tuning/run_e2e_cnn_1a.sh @@ -0,0 +1,154 @@ +#!/bin/bash +# Copyright 2017 Hossein Hadian + +# This script does end2end chain training (i.e. 
from scratch) +# local/chain/compare_wer.sh exp/chain/e2e_cnn_1a/ +# System e2e_cnn_1a (dict_50k) e2e_cnn_1a (dict_50k + unk_model) +# WER 15.21 14.41 +# CER 7.43 6.82 +# WER val 14.84 13.51 +# CER val 6.41 5.60 +# Final train prob -0.0206 +# Final valid prob -0.0393 +# Final train prob (xent) +# Final valid prob (xent) +# Parameters 9.52M + +# steps/info/chain_dir_info.pl exp/chain/e2e_cnn_1a +# exp/chain/e2e_cnn_1a: num-iters=42 nj=2..4 num-params=9.5M dim=40->12640 combine=-0.020->-0.020 (over 1) logprob:train/valid[27,41,final]=(-0.025,-0.021,-0.021/-0.044,-0.040,-0.039) + +set -e +stage=0 +train_stage=-10 +get_egs_stage=-10 +affix=1a +nj=30 + +# training options +tdnn_dim=450 +minibatch_size=150=100,64/300=50,32/600=25,16/1200=16,8 +common_egs_dir= +train_set=train +decode_val=true +lang_decode=data/lang_unk +if $decode_val; then maybe_val=val; else maybe_val= ; fi +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 1 ]; then + steps/nnet3/chain/e2e/prepare_e2e.sh --nj 30 --cmd "$cmd" \ + --shared-phones true \ + --type biphone \ + data/$train_set $lang $treedir + $cmd $treedir/log/make_phone_lm.log \ + cat data/$train_set/text \| \ + steps/nnet3/chain/e2e/text_to_phones.py data/lang \| \ + utils/sym2int.pl -f 2- data/lang/phones.txt \| \ + chain-est-phone-lm --num-extra-lm-states=500 \ + ark:- $treedir/phone_lm.fst +fi + +if [ $stage -le 2 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') + common1="height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="height-offsets=-2,-1,0,1,2 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn4 input=Append(-4,0,4) dim=$tdnn_dim + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $output_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs +fi + +if [ $stage -le 3 ]; then + # no need to store the egs in a shared storage because we always + # remove them. Anyway, it takes only 5 minutes to generate them. 
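+  # The minibatch_size string defined at the top of this script
+  # ("150=100,64/300=50,32/600=25,16/1200=16,8") appears to pair chunk lengths
+  # with allowed minibatch sizes, so that longer chunks are grouped into
+  # smaller minibatches.  A rough, illustrative way to read such a rule string
+  # (a sketch only, not how train_e2e.py parses it internally):
+  #   for rule in ${minibatch_size//\// }; do
+  #     echo "chunk length ~${rule%%=*} -> minibatch sizes ${rule#*=}"
+  #   done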
+ steps/nnet3/chain/e2e/train_e2e.py --stage $train_stage \ + --cmd "$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--num_egs_diagnostic 100 --num_utts_subset 400" \ + --chain.frame-subsampling-factor 4 \ + --chain.alignment-subsampling-factor 4 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1000000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.momentum 0 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 4 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.shrink-value 1.0 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir data/$train_set \ + --tree-dir $treedir \ + --dir $dir || exit 1; +fi + +if [ $stage -le 4 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + utils/mkgraph.sh \ + --self-loop-scale 1.0 $lang_decode \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 5 ]; then + for decode_set in test $maybe_val; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/$decode_set $dir/decode_$decode_set || exit 1; + done +fi + +echo "$0 Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/iam/v1/local/extract_features.sh b/egs/iam/v1/local/extract_features.sh new file mode 100755 index 00000000000..1741ad3f9b2 --- /dev/null +++ b/egs/iam/v1/local/extract_features.sh @@ -0,0 +1,48 @@ +#!/bin/bash + +# Copyright 2017 Yiwen Shao +# 2018 Ashish Arora + +# Apache 2.0 +# This script runs the make features script in parallel. + +nj=4 +cmd=run.pl +feat_dim=40 +augment=false +fliplr=false +echo "$0 $@" + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh || exit 1; + +data=$1 +featdir=$data/data +scp=$data/images.scp +logdir=$data/log + +mkdir -p $logdir +mkdir -p $featdir + +# make $featdir an absolute pathname +featdir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $featdir ${PWD}` + +for n in $(seq $nj); do + split_scps="$split_scps $logdir/images.$n.scp" +done + +# split images.scp +utils/split_scp.pl $scp $split_scps || exit 1; + +$cmd JOB=1:$nj $logdir/extract_features.JOB.log \ + local/make_features.py $logdir/images.JOB.scp \ + --allowed_len_file_path $data/allowed_lengths.txt \ + --feat-dim $feat_dim --fliplr $fliplr --augment $augment \| \ + copy-feats --compress=true --compression-method=7 \ + ark:- ark,scp:$featdir/images.JOB.ark,$featdir/images.JOB.scp + +## aggregates the output scp's to get feats.scp +for n in $(seq $nj); do + cat $featdir/images.$n.scp || exit 1; +done > $data/feats.scp || exit 1 diff --git a/egs/iam/v1/local/gen_topo.py b/egs/iam/v1/local/gen_topo.py new file mode 100755 index 00000000000..6fae276d542 --- /dev/null +++ b/egs/iam/v1/local/gen_topo.py @@ -0,0 +1,92 @@ +#!/usr/bin/env python + +# Copyright 2017 (author: Chun-Chieh Chang) + +# Generate a topology file. 
This allows control of the number of states in the +# non-silence HMMs, and in the silence HMMs. This is a modified version of +# 'utils/gen_topo.pl'. The difference is that this creates two topologies for +# the non-silence HMMs. The number of states for punctuations is different than +# the number of states for other characters. + +from __future__ import print_function +import argparse +import string + +parser = argparse.ArgumentParser(description="Usage: steps/nnet3/chain/gen_topo.py " + " " + "e.g.: steps/nnet3/chain/gen_topo.pl 4:5:6:7:8:9:10 1:2:3\n", + epilog="See egs/swbd/s5c/local/chain/train_tdnn_a.sh for example of usage."); +parser.add_argument("num_nonsil_states", type=int, help="number of states for nonsilence phones"); +parser.add_argument("num_sil_states", type=int, help="number of states for silence phones"); +parser.add_argument("num_punctuation_states", type=int, help="number of states for punctuation"); +parser.add_argument("nonsilence_phones", type=str, + help="List of non-silence phones as integers, separated by colons, e.g. 4:5:6:7:8:9"); +parser.add_argument("silence_phones", type=str, + help="List of silence phones as integers, separated by colons, e.g. 1:2:3"); +parser.add_argument("phone_list", type=str, help="file containing all phones and their corresponding number."); + +args = parser.parse_args() + +silence_phones = [ int(x) for x in args.silence_phones.split(":") ] +nonsilence_phones = [ int(x) for x in args.nonsilence_phones.split(":") ] +all_phones = silence_phones + nonsilence_phones + +punctuation_phones = [] +exclude = set("!(),.?;:'-\"") +with open(args.phone_list) as f: + for line in f: + line = line.strip() + phone = line.split('_')[0] + if len(phone) == 1 and phone in exclude: + punctuation_phones.append(int(line.split(' ')[1])) +# For nonsilence phones that are not punctuations +print("") +print("") +print("") +print(" ".join([str(x) for x in nonsilence_phones if x not in punctuation_phones])) +print("") +for x in range(0, args.num_nonsil_states): + xp1 = x + 1 + print(" " + str(x) + " " + str(x) + " " + str(x) + " 0.75 " + str(xp1) + " 0.25 ") +print(" " + str(args.num_nonsil_states) + " ") +print("") + +# For nonsilence phones that ar punctuations +print("") +print("") +print(" ".join([str(x) for x in nonsilence_phones if x in punctuation_phones])) +print("") +for x in range(0, args.num_punctuation_states): + xp1 = x + 1 + print(" " + str(x) + " " + str(x) + " " + str(x) + " 0.75 " + str(xp1) + " 0.25 ") +print(" " + str(args.num_punctuation_states) + " ") +print("") + +# For silence phones +print("") +print("") +print(" ".join([str(x) for x in silence_phones])) +print("") +if(args.num_sil_states > 1): + transp = 1.0 / (args.num_sil_states - 1) + + state_str = " 0 0 " + for x in range(0, (args.num_sil_states - 1)): + state_str = state_str + " " + str(x) + " " + str(transp) + " " + state_str = state_str + "" + print(state_str) + + for x in range(1, (args.num_sil_states - 1)): + state_str = " " + str(x) + " " + str(x) + " " + for y in range(1, args.num_sil_states): + state_str = state_str + " " + str(y) + " " + str(transp) + " " + state_str = state_str + "" + print(state_str) + second_last = args.num_sil_states - 1 + print(" " + str(second_last) + " " + str(second_last) + " " + str(second_last) + " 0.75 " + str(args.num_sil_states) + " 0.25 ") + print(" " + str(args.num_sil_states) + " ") +else: + print(" 0 0 0 0.75 1 0.25 ") + print(" " + str(args.num_sil_states) + " ") +print("") +print("") diff --git a/egs/iam/v1/local/make_features.py 
b/egs/iam/v1/local/make_features.py index 84e012daedb..3ce501732cf 100755 --- a/egs/iam/v1/local/make_features.py +++ b/egs/iam/v1/local/make_features.py @@ -2,6 +2,7 @@ # Copyright 2017 Chun Chieh Chang # 2017 Ashish Arora +# 2017 Yiwen Shao # 2018 Hossein Hadian """ This script converts images to Kaldi-format feature matrices. The input to @@ -14,20 +15,27 @@ to enforce the images to have the specified length in that file by padding white pixels (the --padding option will be ignored in this case). This relates to end2end chain training. - eg. local/make_features.py data/train --feat-dim 40 """ - +import random import argparse import os import sys +import scipy.io as sio import numpy as np from scipy import misc +from scipy.ndimage.interpolation import affine_transform +import math +from signal import signal, SIGPIPE, SIG_DFL +signal(SIGPIPE, SIG_DFL) parser = argparse.ArgumentParser(description="""Converts images (in 'dir'/images.scp) to features and writes them to standard output in text format.""") -parser.add_argument('dir', type=str, - help='Source data directory (containing images.scp)') +parser.add_argument('images_scp_path', type=str, + help='Path of images.scp file') +parser.add_argument('--allowed_len_file_path', type=str, default=None, + help='If supplied, each images will be padded to reach the ' + 'target length (this overrides --padding).') parser.add_argument('--out-ark', type=str, default='-', help='Where to write the output feature file') parser.add_argument('--feat-dim', type=int, default=40, @@ -35,8 +43,10 @@ parser.add_argument('--padding', type=int, default=5, help='Number of white pixels to pad on the left' 'and right side of the image.') - - +parser.add_argument('--fliplr', type=lambda x: (str(x).lower()=='true'), default=False, + help="Flip the image left-right for right to left languages") +parser.add_argument("--augment", type=lambda x: (str(x).lower()=='true'), default=False, + help="performs image augmentation") args = parser.parse_args() @@ -56,18 +66,12 @@ def write_kaldi_matrix(file_handle, matrix, key): file_handle.write("\n") file_handle.write(" ]\n") -def get_scaled_image(im, allowed_lengths = None): - scale_size = args.feat_dim - sx = im.shape[1] - sy = im.shape[0] - scale = (1.0 * scale_size) / sy - nx = int(scale_size) - ny = int(scale * sx) - im = misc.imresize(im, (nx, ny)) + +def horizontal_pad(im, allowed_lengths = None): if allowed_lengths is None: left_padding = right_padding = args.padding else: # Find an allowed length for the image - imlen = im.shape[1] + imlen = im.shape[1] # width allowed_len = 0 for l in allowed_lengths: if l > imlen: @@ -77,28 +81,153 @@ def get_scaled_image(im, allowed_lengths = None): # No allowed length was found for the image (the image is too long) return None padding = allowed_len - imlen - left_padding = padding // 2 + left_padding = int(padding // 2) right_padding = padding - left_padding - dim_y = im.shape[0] + dim_y = im.shape[0] # height im_pad = np.concatenate((255 * np.ones((dim_y, left_padding), dtype=int), im), axis=1) im_pad1 = np.concatenate((im_pad, 255 * np.ones((dim_y, right_padding), dtype=int)), axis=1) return im_pad1 -### main ### -data_list_path = os.path.join(args.dir, 'images.scp') +def get_scaled_image_aug(im, mode='normal'): + scale_size = args.feat_dim + sx = im.shape[1] + sy = im.shape[0] + scale = (1.0 * scale_size) / sy + nx = int(scale_size) + ny = int(scale * sx) + scale_size = random.randint(10, 30) + scale = (1.0 * scale_size) / sy + down_nx = int(scale_size) + down_ny = int(scale * sx) 
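+    # At this point two target shapes have been computed: (nx, ny) rescales the
+    # image to the feature height (args.feat_dim, 40 by default) while keeping
+    # the aspect ratio, and (down_nx, down_ny) uses a random height between 10
+    # and 30 pixels.  In the else branch below (mode != 'normal') the image is
+    # resized down and then back up, which blurs it and acts as a simple
+    # low-resolution augmentation.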
+ if mode == 'normal': + im = misc.imresize(im, (nx, ny)) + return im + else: + im_scaled_down = misc.imresize(im, (down_nx, down_ny)) + im_scaled_up = misc.imresize(im_scaled_down, (nx, ny)) + return im_scaled_up + return im + +def contrast_normalization(im, low_pct, high_pct): + element_number = im.size + rows = im.shape[0] + cols = im.shape[1] + im_contrast = np.zeros(shape=im.shape) + low_index = int(low_pct * element_number) + high_index = int(high_pct * element_number) + sorted_im = np.sort(im, axis=None) + low_thred = sorted_im[low_index] + high_thred = sorted_im[high_index] + for i in range(rows): + for j in range(cols): + if im[i, j] > high_thred: + im_contrast[i, j] = 255 # lightest to white + elif im[i, j] < low_thred: + im_contrast[i, j] = 0 # darkest to black + else: + # linear normalization + im_contrast[i, j] = (im[i, j] - low_thred) * \ + 255 / (high_thred - low_thred) + return im_contrast + + +def geometric_moment(frame, p, q): + m = 0 + for i in range(frame.shape[1]): + for j in range(frame.shape[0]): + m += (i ** p) * (j ** q) * frame[i][i] + return m + + +def central_moment(frame, p, q): + u = 0 + x_bar = geometric_moment(frame, 1, 0) / \ + geometric_moment(frame, 0, 0) # m10/m00 + y_bar = geometric_moment(frame, 0, 1) / \ + geometric_moment(frame, 0, 0) # m01/m00 + for i in range(frame.shape[1]): + for j in range(frame.shape[0]): + u += ((i - x_bar)**p) * ((j - y_bar)**q) * frame[i][j] + return u + + +def height_normalization(frame, w, h): + frame_normalized = np.zeros(shape=(h, w)) + alpha = 4 + x_bar = geometric_moment(frame, 1, 0) / \ + geometric_moment(frame, 0, 0) # m10/m00 + y_bar = geometric_moment(frame, 0, 1) / \ + geometric_moment(frame, 0, 0) # m01/m00 + sigma_x = (alpha * ((central_moment(frame, 2, 0) / + geometric_moment(frame, 0, 0)) ** .5)) # alpha * sqrt(u20/m00) + sigma_y = (alpha * ((central_moment(frame, 0, 2) / + geometric_moment(frame, 0, 0)) ** .5)) # alpha * sqrt(u02/m00) + for x in range(w): + for y in range(h): + i = int((x / w - 0.5) * sigma_x + x_bar) + j = int((y / h - 0.5) * sigma_y + y_bar) + frame_normalized[x][y] = frame[i][j] + return frame_normalized + +def find_slant_project(im): + rows = im.shape[0] + cols = im.shape[1] + std_max = 0 + alpha_max = 0 + col_disp = np.zeros(90, int) + proj = np.zeros(shape=(90, cols + 2 * rows), dtype=int) + for r in range(rows): + for alpha in range(-45, 45, 1): + col_disp[alpha] = int(r * math.tan(alpha / 180.0 * math.pi)) + for c in range(cols): + if im[r, c] < 100: + for alpha in range(-45, 45, 1): + proj[alpha + 45, c + col_disp[alpha] + rows] += 1 + for alpha in range(-45, 45, 1): + proj_histogram, bin_array = np.histogram(proj[alpha + 45, :], bins=10) + proj_std = np.std(proj_histogram) + if proj_std > std_max: + std_max = proj_std + alpha_max = alpha + proj_std = np.std(proj, axis=1) + return -alpha_max + + +def horizontal_shear(im, degree): + rad = degree / 180.0 * math.pi + padding_x = int(abs(np.tan(rad)) * im.shape[0]) + padding_y = im.shape[0] + if rad > 0: + im_pad = np.concatenate( + (255 * np.ones((padding_y, padding_x), dtype=int), im), axis=1) + elif rad < 0: + im_pad = np.concatenate( + (im, 255 * np.ones((padding_y, padding_x), dtype=int)), axis=1) + else: + im_pad = im + shear_matrix = np.array([[1, 0], + [np.tan(rad), 1]]) + sheared_im = affine_transform(im_pad, shear_matrix, cval=255.0) + return sheared_im + + +### main ### +random.seed(1) +data_list_path = args.images_scp_path if args.out_ark == '-': out_fh = sys.stdout else: - out_fh = open(args.out_ark,'wb') + out_fh = 
open(args.out_ark,'w') allowed_lengths = None -if os.path.isfile(os.path.join(args.dir, 'allowed_lengths.txt')): +allowed_len_handle = args.allowed_len_file_path +if os.path.isfile(allowed_len_handle): print("Found 'allowed_lengths.txt' file...", file=sys.stderr) allowed_lengths = [] - with open(os.path.join(args.dir,'allowed_lengths.txt')) as f: + with open(allowed_len_handle) as f: for line in f: allowed_lengths.append(int(line.strip())) print("Read {} allowed lengths and will apply them to the " @@ -106,6 +235,7 @@ def get_scaled_image(im, allowed_lengths = None): num_fail = 0 num_ok = 0 +aug_setting = ['normal', 'scaled'] with open(data_list_path) as f: for line in f: line = line.strip() @@ -113,15 +243,24 @@ def get_scaled_image(im, allowed_lengths = None): image_id = line_vect[0] image_path = line_vect[1] im = misc.imread(image_path) - im_scaled = get_scaled_image(im, allowed_lengths) - - if im_scaled is None: + if args.fliplr: + im = np.fliplr(im) + if args.augment: + im_aug = get_scaled_image_aug(im, aug_setting[0]) + im_contrast = contrast_normalization(im_aug, 0.05, 0.2) + slant_degree = find_slant_project(im_contrast) + im_sheared = horizontal_shear(im_contrast, slant_degree) + im_aug = im_sheared + else: + im_aug = get_scaled_image_aug(im, aug_setting[0]) + im_horizontal_padded = horizontal_pad(im_aug, allowed_lengths) + if im_horizontal_padded is None: num_fail += 1 continue - data = np.transpose(im_scaled, (1, 0)) + data = np.transpose(im_horizontal_padded, (1, 0)) data = np.divide(data, 255.0) num_ok += 1 write_kaldi_matrix(out_fh, data, image_id) -print('Generated features for {} images. Failed for {} (iamge too ' +print('Generated features for {} images. Failed for {} (image too ' 'long).'.format(num_ok, num_fail), file=sys.stderr) diff --git a/egs/iam/v1/local/prepare_data.sh b/egs/iam/v1/local/prepare_data.sh index 73d711c73f0..dc07f07e318 100755 --- a/egs/iam/v1/local/prepare_data.sh +++ b/egs/iam/v1/local/prepare_data.sh @@ -18,6 +18,7 @@ stage=0 download_dir=data/download +process_aachen_split=false wellington_dir= username= password= # username and password for downloading the IAM database @@ -53,6 +54,8 @@ ascii_url=http://www.fki.inf.unibe.ch/DBs/iamDB/data/ascii/ascii.tgz brown_corpus_url=http://www.sls.hawaii.edu/bley-vroman/brown.txt lob_corpus_url=http://ota.ox.ac.uk/text/0167.zip wellington_corpus_loc=/export/corpora5/Wellington/WWC/ +aachen_split_url=http://www.openslr.org/resources/56/splits.zip +aachen_splits=data/local/aachensplits mkdir -p $download_dir data/local # download and extact images and transcription @@ -144,6 +147,18 @@ else echo "$0: Wellington Corpus not included because wellington_dir not provided" fi +if [ -d $aachen_splits ]; then + echo "$0: Not downloading the Aachen splits as it is already there." +else + if [ ! -f $aachen_splits/splits.zip ]; then + echo "$0: Downloading Aachen splits ..." 
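+    # The Aachen splits (OpenSLR resource 56, see $aachen_split_url above) are
+    # an alternative train/validation/test partition of the IAM lines; when
+    # --process_aachen_split is true they are used instead of the default
+    # largeWriterIndependentTextLineRecognitionTask splits.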
+ mkdir -p $aachen_splits + wget -P $aachen_splits/ $aachen_split_url || exit 1; + fi + unzip $aachen_splits/splits.zip -d $aachen_splits || exit 1; + echo "$0: Done downloading and extracting Aachen splits" +fi + mkdir -p data/{train,test,val} file_name=largeWriterIndependentTextLineRecognitionTask @@ -160,11 +175,17 @@ cat $train_old > $train_new cat $test_old > $test_new cat $val1_old $val2_old > $val_new -if [ $stage -le 0 ]; then - local/process_data.py data/local data/train --dataset train || exit 1 - local/process_data.py data/local data/test --dataset test || exit 1 - local/process_data.py data/local data/val --dataset validation || exit 1 - - utils/utt2spk_to_spk2utt.pl data/train/utt2spk > data/train/spk2utt - utils/utt2spk_to_spk2utt.pl data/test/utt2spk > data/test/spk2utt +if $process_aachen_split; then + local/process_aachen_splits.py data/local $aachen_splits/splits data/train --dataset train || exit 1 + local/process_aachen_splits.py data/local $aachen_splits/splits data/test --dataset test || exit 1 + local/process_aachen_splits.py data/local $aachen_splits/splits data/val --dataset validation || exit 1 +else + local/process_data.py data/local data/train --dataset train || exit 1 + local/process_data.py data/local data/test --dataset test || exit 1 + local/process_data.py data/local data/val --dataset validation || exit 1 fi + +image/fix_data_dir.sh data/train +image/fix_data_dir.sh data/test +image/fix_data_dir.sh data/val + diff --git a/egs/iam/v1/local/prepare_dict.sh b/egs/iam/v1/local/prepare_dict.sh index f691d577fba..7451f6b85f7 100755 --- a/egs/iam/v1/local/prepare_dict.sh +++ b/egs/iam/v1/local/prepare_dict.sh @@ -38,7 +38,7 @@ while(<>){ }' | sort -u > $dir/lexicon.txt -sed -i "s/#//" $dir/nonsilence_phones.txt +perl -i -pe "s/#//" $dir/nonsilence_phones.txt echo ' SIL' >> $dir/lexicon.txt echo ' SIL' >> $dir/lexicon.txt diff --git a/egs/iam/v1/local/process_aachen_splits.py b/egs/iam/v1/local/process_aachen_splits.py new file mode 100755 index 00000000000..cb6a6d4f0d8 --- /dev/null +++ b/egs/iam/v1/local/process_aachen_splits.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python3 + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora + +""" This script reads the extracted IAM database files and creates + the following files (for the data subset selected via --dataset): + text, utt2spk, images.scp. + + Eg. local/process_aachen_splits.py data/local data/train data --dataset train + Eg. text file: 000_a01-000u-00 A MOVE to stop Mr. 
Gaitskell from + utt2spk file: 000_a01-000u-00 000 + images.scp file: 000_a01-000u-00 data/local/lines/a01/a01-000u/a01-000u-00.png +""" + +import argparse +import os +import sys +import xml.dom.minidom as minidom + +parser = argparse.ArgumentParser(description="""Creates text, utt2spk + and images.scp files.""") +parser.add_argument('database_path', type=str, + help='Path to the downloaded (and extracted) IAM data') +parser.add_argument('split_path', type=str, + help='location of the train/test/val set') +parser.add_argument('out_dir', type=str, + help='location to write output files.') +parser.add_argument('--dataset', type=str, default='train', + choices=['train', 'test','validation'], + help='Subset of data to process.') +args = parser.parse_args() + +text_file = os.path.join(args.out_dir + '/', 'text') +text_fh = open(text_file, 'w') + +utt2spk_file = os.path.join(args.out_dir + '/', 'utt2spk') +utt2spk_fh = open(utt2spk_file, 'w') + +image_file = os.path.join(args.out_dir + '/', 'images.scp') +image_fh = open(image_file, 'w') + +dataset_path = os.path.join(args.split_path, + args.dataset + '.uttlist') + +text_file_path = os.path.join(args.database_path, + 'ascii','lines.txt') +text_dict = {} +def process_text_file_for_word_model(): + with open (text_file_path, 'rt') as in_file: + for line in in_file: + if line[0]=='#': + continue + line = line.strip() + utt_id = line.split(' ')[0] + text_vect = line.split(' ')[8:] + text = "".join(text_vect) + text = text.replace("|", " ") + text_dict[utt_id] = text + + +### main ### + +print("Processing '{}' data...".format(args.dataset)) +process_text_file_for_word_model() + +with open(dataset_path) as f: + for line in f: + line = line.strip() + line_vect = line.split('-') + xml_file = line_vect[0] + '-' + line_vect[1] + xml_path = os.path.join(args.database_path, 'xml', xml_file + '.xml') + doc = minidom.parse(xml_path) + form_elements = doc.getElementsByTagName('form')[0] + writer_id = form_elements.getAttribute('writer-id') + outerfolder = form_elements.getAttribute('id')[0:3] + innerfolder = form_elements.getAttribute('id') + lines_path = os.path.join(args.database_path, 'lines', + outerfolder, innerfolder) + for file in os.listdir(lines_path): + if file.endswith(".png"): + image_file_path = os.path.join(lines_path, file) + base_name = os.path.splitext(os.path.basename(image_file_path))[0] + text = text_dict[base_name] + utt_id = writer_id + '_' + base_name + text_fh.write(utt_id + ' ' + text + '\n') + utt2spk_fh.write(utt_id + ' ' + writer_id + '\n') + image_fh.write(utt_id + ' ' + image_file_path + '\n') diff --git a/egs/iam/v1/local/train_lm.sh b/egs/iam/v1/local/train_lm.sh index a15fbea2af3..3e8c838efdb 100755 --- a/egs/iam/v1/local/train_lm.sh +++ b/egs/iam/v1/local/train_lm.sh @@ -58,9 +58,12 @@ if [ $stage -le 0 ]; then rm ${dir}/data/text/* 2>/dev/null || true # Using LOB and brown corpus. - cat data/local/lobcorpus/0167/download/LOB_COCOA/lob.txt | \ - local/remove_test_utterances_from_lob.py data/test/text data/val/text \ - > ${dir}/data/text/lob.txt + if [ ! 
-f data/local/lob-train-only.txt ]; then + cat data/local/lobcorpus/0167/download/LOB_COCOA/lob.txt | \ + local/remove_test_utterances_from_lob.py data/test/text data/val/text \ + > data/local/lob-train-only.txt + fi + cat data/local/lob-train-only.txt > ${dir}/data/text/lob.txt cat data/local/browncorpus/brown.txt > ${dir}/data/text/brown.txt if [ -d "data/local/wellingtoncorpus" ]; then cat data/local/wellingtoncorpus/Wellington_annotation_removed.txt > ${dir}/data/text/wellington.txt diff --git a/egs/iam/v1/local/unk_arc_post_to_transcription.py b/egs/iam/v1/local/unk_arc_post_to_transcription.py index c5ad1235427..1f1404b5165 100755 --- a/egs/iam/v1/local/unk_arc_post_to_transcription.py +++ b/egs/iam/v1/local/unk_arc_post_to_transcription.py @@ -1,88 +1,108 @@ #!/usr/bin/env python3 -# Copyright 2017 Ashish Arora +#Copyright 2017 Ashish Arora +""" This module will be used by scripts for open vocabulary setup. + If the hypothesis transcription contains , then it will replace the + with the word predicted by model by concatenating phones decoded + from the unk-model. It is currently supported only for triphone setup. + Args: + phones: File name of a file that contains the phones.txt, (symbol-table for phones). + phone and phoneID, Eg. a 217, phoneID of 'a' is 217. + words: File name of a file that contains the words.txt, (symbol-table for words). + word and wordID. Eg. ACCOUNTANCY 234, wordID of 'ACCOUNTANCY' is 234. + unk: ID of . Eg. 231. + one-best-arc-post: A file in arc-post format, which is a list of timing info and posterior + of arcs along the one-best path from the lattice. + E.g. 506_m01-049-00 8 12 1 7722 282 272 288 231 + [] + [ ...] + output-text: File containing hypothesis transcription with recognized by the + unk-model. + E.g. A move to stop mr. gaitskell. + + Eg. local/unk_arc_post_to_transcription.py lang/phones.txt lang/words.txt + data/lang/oov.int +""" import argparse +import io +import os import sys - parser = argparse.ArgumentParser(description="""uses phones to convert unk to word""") -parser.add_argument('phones', type=str, help='phones and phonesID') -parser.add_argument('words', type=str, help='word and wordID') -parser.add_argument('unk', type=str, default='-', help='location of unk file') -parser.add_argument('--input-ark', type=str, default='-', help='where to read the input data') -parser.add_argument('--out-ark', type=str, default='-', help='where to write the output data') +parser.add_argument('phones', type=str, help='File name of a file that contains the' + 'symbol-table for phones. Each line must be: ') +parser.add_argument('words', type=str, help='File name of a file that contains the' + 'symbol-table for words. Each line must be: ') +parser.add_argument('unk', type=str, default='-', help='File name of a file that' + 'contains the ID of . The content must be: , e.g. 
231') +parser.add_argument('--one-best-arc-post', type=str, default='-', help='A file in arc-post' + 'format, which is a list of timing info and posterior of arcs' + 'along the one-best path from the lattice') +parser.add_argument('--output-text', type=str, default='-', help='File containing' + 'hypothesis transcription with recognized by the unk-model') args = parser.parse_args() - ### main ### -phone_fh = open(args.phones, 'r', encoding='latin-1') -word_fh = open(args.words, 'r', encoding='latin-1') -unk_fh = open(args.unk, 'r', encoding='latin-1') -if args.input_ark == '-': - input_fh = sys.stdin +phone_handle = open(args.phones, 'r', encoding='utf8') # Create file handles +word_handle = open(args.words, 'r', encoding='utf8') +unk_handle = open(args.unk,'r', encoding='utf8') +if args.one_best_arc_post == '-': + arc_post_handle = io.TextIOWrapper(sys.stdin.buffer, encoding='utf8') else: - input_fh = open(args.input_ark, 'r', encoding='latin-1') -if args.out_ark == '-': - out_fh = sys.stdout + arc_post_handle = open(args.one_best_arc_post, 'r', encoding='utf8') +if args.output_text == '-': + output_text_handle = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8') else: - out_fh = open(args.out_ark, 'w', encoding='latin-1') + output_text_handle = open(args.output_text, 'w', encoding='utf8') -phone_dict = dict() # Stores phoneID and phone mapping -phone_data_vect = phone_fh.read().strip().split("\n") -for key_val in phone_data_vect: +id2phone = dict() # Stores the mapping from phone_id (int) to phone (char) +phones_data = phone_handle.read().strip().split("\n") + +for key_val in phones_data: key_val = key_val.split(" ") - phone_dict[key_val[1]] = key_val[0] + id2phone[key_val[1]] = key_val[0] + word_dict = dict() -word_data_vect = word_fh.read().strip().split("\n") +word_data_vect = word_handle.read().strip().split("\n") + for key_val in word_data_vect: key_val = key_val.split(" ") word_dict[key_val[1]] = key_val[0] -unk_val = unk_fh.read().strip().split(" ")[0] +unk_val = unk_handle.read().strip().split(" ")[0] -utt_word_dict = dict() -utt_phone_dict = dict() # Stores utteranceID and phoneID -unk_word_dict = dict() -count=0 -for line in input_fh: +utt_word_dict = dict() # Dict of list, stores mapping from utteranceID(int) to words(str) +for line in arc_post_handle: line_vect = line.strip().split("\t") - if len(line_vect) < 6: - print("Bad line: '{}' Expecting 6 fields. Skipping...".format(line), + if len(line_vect) < 6: # Check for 1best-arc-post output + print("Error: Bad line: '{}' Expecting 6 fields. Skipping...".format(line), file=sys.stderr) continue - uttID = line_vect[0] + utt_id = line_vect[0] word = line_vect[4] phones = line_vect[5] - if uttID in utt_word_dict.keys(): - utt_word_dict[uttID][count] = word - utt_phone_dict[uttID][count] = phones - else: - count = 0 - utt_word_dict[uttID] = dict() - utt_phone_dict[uttID] = dict() - utt_word_dict[uttID][count] = word - utt_phone_dict[uttID][count] = phones - if word == unk_val: # Get character sequence for unk - phone_key_vect = phones.split(" ") - phone_val_vect = list() - for pkey in phone_key_vect: - phone_val_vect.append(phone_dict[pkey]) + if utt_id not in list(utt_word_dict.keys()): + utt_word_dict[utt_id] = list() + + if word == unk_val: # Get the 1best phone sequence given by the unk-model + phone_id_seq = phones.split(" ") + phone_seq = list() + for pkey in phone_id_seq: + phone_seq.append(id2phone[pkey]) # Convert the phone-id sequence to a phone sequence. 
phone_2_word = list() - for phone_val in phone_val_vect: - phone_2_word.append(phone_val.split('_')[0]) - phone_2_word = ''.join(phone_2_word) - utt_word_dict[uttID][count] = phone_2_word + for phone_val in phone_seq: + phone_2_word.append(phone_val.split('_')[0]) # Removing the world-position markers(e.g. _B) + phone_2_word = ''.join(phone_2_word) # Concatnate phone sequence + utt_word_dict[utt_id].append(phone_2_word) # Store word from unk-model else: - if word == '0': + if word == '0': # Store space/silence word_val = ' ' else: word_val = word_dict[word] - utt_word_dict[uttID][count] = word_val - count += 1 + utt_word_dict[utt_id].append(word_val) # Store word from 1best-arc-post -transcription = "" -for key in sorted(utt_word_dict.keys()): - transcription = key - for index in sorted(utt_word_dict[key].keys()): - value = utt_word_dict[key][index] - transcription = transcription + " " + value - out_fh.write(transcription + '\n') +transcription = "" # Output transcription +for utt_key in sorted(utt_word_dict.keys()): + transcription = utt_key + for word in utt_word_dict[utt_key]: + transcription = transcription + " " + word + output_text_handle.write(transcription + '\n') diff --git a/egs/iam/v1/run.sh b/egs/iam/v1/run.sh index b943870f530..85811b6cb3d 100755 --- a/egs/iam/v1/run.sh +++ b/egs/iam/v1/run.sh @@ -20,6 +20,9 @@ iam_database=/export/corpora5/handwriting_ocr/IAM # This corpus is of written NZ English that can be purchased here: # "https://www.victoria.ac.nz/lals/resources/corpora-default" wellington_database=/export/corpora5/Wellington/WWC/ +train_set=train_aug +process_aachen_split=false +overwrite=false . ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. ## This relates to the queue. @@ -30,39 +33,63 @@ wellington_database=/export/corpora5/Wellington/WWC/ ./local/check_tools.sh if [ $stage -le 0 ]; then + if [ -f data/train/text ] && ! $overwrite; then + echo "$0: Not processing, probably script have run from wrong stage" + echo "Exiting with status 1 to avoid data corruption" + exit 1; + fi + echo "$0: Preparing data..." local/prepare_data.sh --download-dir "$iam_database" \ --wellington-dir "$wellington_database" \ - --username "$username" --password "$password" + --username "$username" --password "$password" \ + --process_aachen_split $process_aachen_split fi -mkdir -p data/{train,test}/data +mkdir -p data/{train,test,val}/data if [ $stage -le 1 ]; then - echo "$0: Preparing the test and train feature files..." - for dataset in train test; do - local/make_features.py data/$dataset --feat-dim 40 | \ - copy-feats --compress=true --compression-method=7 \ - ark:- ark,scp:data/$dataset/data/images.ark,data/$dataset/feats.scp - steps/compute_cmvn_stats.sh data/$dataset + echo "$0: $(date) stage 1: getting allowed image widths for e2e training..." + image/get_image2num_frames.py --feat-dim 40 data/train # This will be needed for the next command + # The next command creates a "allowed_lengths.txt" file in data/train + # which will be used by local/make_features.py to enforce the images to + # have allowed lengths. The allowed lengths will be spaced by 10% difference in length. 
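+  # Roughly speaking (hypothetical numbers; the real list is derived from the
+  # observed image widths by image/get_allowed_lengths.py), each allowed length
+  # is about 10% longer than the previous one and stays divisible by the
+  # frame-subsampling factor of 4, e.g.:
+  #   len=60
+  #   while [ $len -le 100 ]; do
+  #     echo $(( len / 4 * 4 ))     # round down to a multiple of 4
+  #     len=$(echo "print (int($len * 1.1) + 1)" | python)
+  #   done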
+ image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train + echo "$0: $(date) Extracting features, creating feats.scp file" + local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim 40 data/train + steps/compute_cmvn_stats.sh data/train || exit 1; + for set in val test; do + local/extract_features.sh --nj $nj --cmd "$cmd" --augment true \ + --feat-dim 40 data/${set} + steps/compute_cmvn_stats.sh data/${set} || exit 1; done + utils/fix_data_dir.sh data/train fi if [ $stage -le 2 ]; then + for set in train; do + echo "$0: $(date) stage 2: Performing augmentation, it will double training data" + local/augment_data.sh --nj $nj --cmd "$cmd" --feat-dim 40 data/${set} data/${set}_aug data + steps/compute_cmvn_stats.sh data/${set}_aug || exit 1; + done +fi + +if [ $stage -le 3 ]; then echo "$0: Estimating a language model for decoding..." # We do this stage before dict preparation because prepare_dict.sh # generates the lexicon from pocolm's wordlist local/train_lm.sh --vocab-size 50k fi -if [ $stage -le 3 ]; then +if [ $stage -le 4 ]; then echo "$0: Preparing dictionary and lang..." - # This is for training. Use a large vocab size, e.g. 500k to include all the # training words: local/prepare_dict.sh --vocab-size 500k --dir data/local/dict # this is for training utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.95 \ data/local/dict "" data/lang/temp data/lang - + silphonelist=`cat data/lang/phones/silence.csl` + nonsilphonelist=`cat data/lang/phones/nonsilence.csl` + local/gen_topo.py 8 4 4 $nonsilphonelist $silphonelist data/lang/phones.txt >data/lang/topo # This is for decoding. We use a 50k lexicon to be consistent with the papers # reporting WERs on IAM: local/prepare_dict.sh --vocab-size 50k --dir data/local/dict_50k # this is for decoding @@ -77,11 +104,14 @@ if [ $stage -le 3 ]; then utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 \ --unk-fst exp/unk_lang_model/unk_fst.txt \ data/local/dict_50k "" data/lang_unk/temp data/lang_unk + silphonelist=`cat data/lang/phones/silence.csl` + nonsilphonelist=`cat data/lang/phones/nonsilence.csl` + local/gen_topo.py 8 4 4 $nonsilphonelist $silphonelist data/lang_unk/phones.txt >data/lang_unk/topo cp data/lang_test/G.fst data/lang_unk/G.fst fi if [ $stage -le 4 ]; then - steps/train_mono.sh --nj $nj --cmd $cmd --totgauss 10000 data/train \ + steps/train_mono.sh --nj $nj --cmd $cmd --totgauss 10000 data/$train_set \ data/lang exp/mono fi @@ -93,10 +123,10 @@ if [ $stage -le 5 ] && $decode_gmm; then fi if [ $stage -le 6 ]; then - steps/align_si.sh --nj $nj --cmd $cmd data/train data/lang \ + steps/align_si.sh --nj $nj --cmd $cmd data/$train_set data/lang \ exp/mono exp/mono_ali - steps/train_deltas.sh --cmd $cmd 500 20000 data/train data/lang \ + steps/train_deltas.sh --cmd $cmd 500 20000 data/$train_set data/lang \ exp/mono_ali exp/tri fi @@ -108,12 +138,12 @@ if [ $stage -le 7 ] && $decode_gmm; then fi if [ $stage -le 8 ]; then - steps/align_si.sh --nj $nj --cmd $cmd data/train data/lang \ + steps/align_si.sh --nj $nj --cmd $cmd data/$train_set data/lang \ exp/tri exp/tri_ali steps/train_lda_mllt.sh --cmd $cmd \ --splice-opts "--left-context=3 --right-context=3" 500 20000 \ - data/train data/lang exp/tri_ali exp/tri2 + data/$train_set data/lang exp/tri_ali exp/tri2 fi if [ $stage -le 9 ] && $decode_gmm; then @@ -125,10 +155,10 @@ fi if [ $stage -le 10 ]; then steps/align_fmllr.sh --nj $nj --cmd $cmd --use-graphs true \ - data/train data/lang exp/tri2 exp/tri2_ali + data/$train_set data/lang 
exp/tri2 exp/tri2_ali steps/train_sat.sh --cmd $cmd 500 20000 \ - data/train data/lang exp/tri2_ali exp/tri3 + data/$train_set data/lang exp/tri2_ali exp/tri3 fi if [ $stage -le 11 ] && $decode_gmm; then @@ -140,13 +170,13 @@ fi if [ $stage -le 12 ]; then steps/align_fmllr.sh --nj $nj --cmd $cmd --use-graphs true \ - data/train data/lang exp/tri3 exp/tri3_ali + data/$train_set data/lang exp/tri3 exp/tri3_ali fi if [ $stage -le 13 ]; then - local/chain/run_cnn_1a.sh --lang-test lang_unk + local/chain/run_cnn.sh --lang-test lang_unk --train_set $train_set fi if [ $stage -le 14 ]; then - local/chain/run_cnn_chainali_1c.sh --chain-model-dir exp/chain/cnn_1a --stage 2 + local/chain/run_cnn_chainali.sh --chain-model-dir exp/chain/cnn_1a --stage 2 --train_set $train_set fi diff --git a/egs/iam/v1/run_end2end.sh b/egs/iam/v1/run_end2end.sh index 6df93e739f4..0a8b014715f 100755 --- a/egs/iam/v1/run_end2end.sh +++ b/egs/iam/v1/run_end2end.sh @@ -6,6 +6,8 @@ stage=0 nj=20 username= password= +process_aachen_split=false +overwrite=false # iam_database points to the database path on the JHU grid. If you have not # already downloaded the database you can set it to a local directory # like "data/download" and follow the instructions @@ -16,61 +18,78 @@ iam_database=/export/corpora5/handwriting_ocr/IAM # This corpus is of written NZ English that can be purchased here: # "https://www.victoria.ac.nz/lals/resources/corpora-default" wellington_database=/export/corpora5/Wellington/WWC/ - +train_set=train_aug . ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. ## This relates to the queue. . ./path.sh . ./utils/parse_options.sh # e.g. this parses the above options # if supplied. - - ./local/check_tools.sh - if [ $stage -le 0 ]; then + + if [ -f data/train/text ] && ! $overwrite; then + echo "$0: Not processing, probably script have run from wrong stage" + echo "Exiting with status 1 to avoid data corruption" + exit 1; + fi + echo "$0: Preparing data..." local/prepare_data.sh --download-dir "$iam_database" \ --wellington-dir "$wellington_database" \ - --username "$username" --password "$password" + --username "$username" --password "$password" \ + --process_aachen_split $process_aachen_split fi -mkdir -p data/{train,test}/data +mkdir -p data/{train,test,val}/data if [ $stage -le 1 ]; then - image/get_image2num_frames.py data/train # This will be needed for the next command + echo "$0: $(date) stage 1: getting allowed image widths for e2e training..." + image/get_image2num_frames.py --feat-dim 40 data/train # This will be needed for the next command # The next command creates a "allowed_lengths.txt" file in data/train # which will be used by local/make_features.py to enforce the images to # have allowed lengths. The allowed lengths will be spaced by 10% difference in length. image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train - echo "$0: Preparing the test and train feature files..." 
- for dataset in train test; do - local/make_features.py data/$dataset --feat-dim 40 | \ - copy-feats --compress=true --compression-method=7 \ - ark:- ark,scp:data/$dataset/data/images.ark,data/$dataset/feats.scp - steps/compute_cmvn_stats.sh data/$dataset + echo "$0: $(date) Extracting features, creating feats.scp file" + local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim 40 data/train + steps/compute_cmvn_stats.sh data/train || exit 1; + for set in val test; do + local/extract_features.sh --nj $nj --cmd "$cmd" --augment true \ + --feat-dim 40 data/${set} + steps/compute_cmvn_stats.sh data/${set} || exit 1; done utils/fix_data_dir.sh data/train fi if [ $stage -le 2 ]; then + for set in train; do + echo "$0: $(date) stage 2: Performing augmentation, it will double training data" + local/augment_data.sh --nj $nj --cmd "$cmd" --feat-dim 40 data/${set} data/${set}_aug data + steps/compute_cmvn_stats.sh data/${set}_aug || exit 1; + done +fi + +if [ $stage -le 3 ]; then echo "$0: Estimating a language model for decoding..." # We do this stage before dict preparation because prepare_dict.sh # generates the lexicon from pocolm's wordlist local/train_lm.sh --vocab-size 50k fi -if [ $stage -le 3 ]; then +if [ $stage -le 4 ]; then echo "$0: Preparing dictionary and lang..." - # This is for training. Use a large vocab size, e.g. 500k to include all the # training words: local/prepare_dict.sh --vocab-size 500k --dir data/local/dict - utils/prepare_lang.sh --sil-prob 0.95 \ + utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.95 \ data/local/dict "" data/lang/temp data/lang + silphonelist=`cat data/lang/phones/silence.csl` + nonsilphonelist=`cat data/lang/phones/nonsilence.csl` + local/gen_topo.py 8 4 4 $nonsilphonelist $silphonelist data/lang/phones.txt >data/lang/topo # This is for decoding. We use a 50k lexicon to be consistent with the papers # reporting WERs on IAM. local/prepare_dict.sh --vocab-size 50k --dir data/local/dict_50k - utils/prepare_lang.sh --sil-prob 0.95 data/local/dict_50k \ - "" data/lang_test/temp data/lang_test + utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.95 \ + data/local/dict_50k "" data/lang_test/temp data/lang_test utils/format_lm.sh data/lang_test data/local/local_lm/data/arpa/3gram_big.arpa.gz \ data/local/dict_50k/lexicon.txt data/lang_test @@ -79,23 +98,27 @@ if [ $stage -le 3 ]; then data/local/dict_50k exp/unk_lang_model utils/prepare_lang.sh --unk-fst exp/unk_lang_model/unk_fst.txt \ data/local/dict_50k "" data/lang_unk/temp data/lang_unk + + silphonelist=`cat data/lang/phones/silence.csl` + nonsilphonelist=`cat data/lang/phones/nonsilence.csl` + local/gen_topo.py 8 4 4 $nonsilphonelist $silphonelist data/lang_unk/phones.txt >data/lang_unk/topo cp data/lang_test/G.fst data/lang_unk/G.fst fi -if [ $stage -le 4 ]; then +if [ $stage -le 5 ]; then echo "$0: Calling the flat-start chain recipe..." - local/chain/run_flatstart_cnn1a.sh + local/chain/run_e2e_cnn.sh --train_set $train_set fi -if [ $stage -le 5 ]; then +if [ $stage -le 6 ]; then echo "$0: Aligning the training data using the e2e chain model..." 
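The augmentation in stage 2 above doubles the training data by copying the data directory under an utterance/speaker prefix, re-extracting the copy's features with --augment true, and combining the copy with the original. A rough sketch of the bookkeeping only; prefix_scp and combine are illustrative names, and the real work (including text, utt2spk/spk2utt and validation) is done by image/copy_data_dir.sh, local/extract_features.sh and utils/combine_data.sh.

def prefix_scp(in_scp, out_scp, prefix="aug1-"):
    # Give every copied utterance a new ID so it can live alongside the original.
    with open(in_scp) as fin, open(out_scp, "w") as fout:
        for line in fin:
            utt_id, rest = line.rstrip("\n").split(" ", 1)
            fout.write("{}{} {}\n".format(prefix, utt_id, rest))

def combine(out_path, *in_paths):
    # Concatenating the original list and the prefixed copy doubles the data.
    with open(out_path, "w") as fout:
        for path in in_paths:
            with open(path) as fin:
                fout.write(fin.read())

# e.g. prefix_scp("data/train/images.scp", "data/train_copy/images.scp")
#      combine("data/train_aug/images.scp",
#              "data/train/images.scp", "data/train_copy/images.scp")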
steps/nnet3/align.sh --nj 50 --cmd "$cmd" \ --use-gpu false \ --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0 --acoustic-scale=1.0' \ - data/train data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train + data/$train_set data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train fi -if [ $stage -le 6 ]; then +if [ $stage -le 7 ]; then echo "$0: Building a tree and training a regular chain model using the e2e alignments..." - local/chain/run_cnn_e2eali_1a.sh + local/chain/run_cnn_e2eali.sh --train_set $train_set fi diff --git a/egs/iam/v2/cmd.sh b/egs/iam/v2/cmd.sh new file mode 100755 index 00000000000..3c8eb9f93a5 --- /dev/null +++ b/egs/iam/v2/cmd.sh @@ -0,0 +1,13 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export cmd="queue.pl" diff --git a/egs/iam/v2/image b/egs/iam/v2/image new file mode 120000 index 00000000000..1668ee99922 --- /dev/null +++ b/egs/iam/v2/image @@ -0,0 +1 @@ +../../cifar/v1/image/ \ No newline at end of file diff --git a/egs/iam/v2/local/augment_data.sh b/egs/iam/v2/local/augment_data.sh new file mode 100755 index 00000000000..31e4a8217ca --- /dev/null +++ b/egs/iam/v2/local/augment_data.sh @@ -0,0 +1,34 @@ +#!/bin/bash +# Copyright 2018 Hossein Hadian +# 2018 Ashish Arora + +# Apache 2.0 +# This script performs data augmentation. + +nj=4 +cmd=run.pl +feat_dim=40 +echo "$0 $@" + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh || exit 1; + +srcdir=$1 +outdir=$2 +datadir=$3 +aug_set=aug1 +mkdir -p $datadir/augmentations +echo "copying $srcdir to $datadir/augmentations/$aug_set, allowed length, creating feats.scp" + +for set in $aug_set; do + image/copy_data_dir.sh --spk-prefix $set- --utt-prefix $set- \ + $srcdir $datadir/augmentations/$set + cat $srcdir/allowed_lengths.txt > $datadir/augmentations/$set/allowed_lengths.txt + local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim $feat_dim \ + --fliplr false --augment true $datadir/augmentations/$set +done + +echo " combine original data and data from different augmentations" +utils/combine_data.sh --extra-files images.scp $outdir $srcdir $datadir/augmentations/$aug_set +cat $srcdir/allowed_lengths.txt > $outdir/allowed_lengths.txt diff --git a/egs/iam/v2/local/chain/compare_wer.sh b/egs/iam/v2/local/chain/compare_wer.sh new file mode 100755 index 00000000000..2ce14e13694 --- /dev/null +++ b/egs/iam/v2/local/chain/compare_wer.sh @@ -0,0 +1,120 @@ +#!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. local/chain/compare_wer.sh exp/chain/cnn{1a,1b} + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora + +if [ $# == 0 ]; then + echo "Usage: $0: [ ... ]" + echo "e.g.: $0 exp/chain/cnn{1a,1b}" + exit 1 +fi +. 
./path.sh + +echo "# $0 $*" +used_epochs=false + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +echo -n "# WER " +for x in $*; do + wer=$(cat $x/decode_test/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "# WER (rescored) " +for x in $*; do + wer="--" + [ -d $x/decode_test_rescored ] && wer=$(cat $x/decode_test_rescored/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "# CER " +for x in $*; do + cer=$(cat $x/decode_test/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + +echo -n "# CER (rescored) " +for x in $*; do + cer="--" + [ -d $x/decode_test_rescored ] && cer=$(cat $x/decode_test_rescored/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + +echo -n "# WER val " +for x in $*; do + wer=$(cat $x/decode_val/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "# WER (rescored) val " +for x in $*; do + wer="--" + [ -d $x/decode_val_rescored ] && wer=$(cat $x/decode_val_rescored/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "# CER val " +for x in $*; do + cer=$(cat $x/decode_val/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + +echo -n "# CER (rescored) val " +for x in $*; do + cer="--" + [ -d $x/decode_val_rescored ] && cer=$(cat $x/decode_val_rescored/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. +fi + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train prob (xent) " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob (xent) " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Parameters " +for x in $*; do + params=$(nnet3-info $x/final.mdl 2>/dev/null | grep num-parameters | cut -d' ' -f2 | awk '{printf "%0.2fM\n",$1/1000000}') + printf "% 10s" $params +done +echo diff --git a/egs/iam/v2/local/chain/run_cnn_e2eali.sh b/egs/iam/v2/local/chain/run_cnn_e2eali.sh new file mode 120000 index 00000000000..da731bcb0b1 --- /dev/null +++ b/egs/iam/v2/local/chain/run_cnn_e2eali.sh @@ -0,0 +1 @@ +tuning/run_cnn_e2eali_1d.sh \ No newline at end of file diff --git a/egs/iam/v2/local/chain/run_e2e_cnn.sh b/egs/iam/v2/local/chain/run_e2e_cnn.sh new file mode 120000 index 00000000000..7dca9c30e23 --- /dev/null +++ b/egs/iam/v2/local/chain/run_e2e_cnn.sh @@ -0,0 +1 @@ +tuning/run_e2e_cnn_1b.sh \ No newline at end of file diff --git a/egs/iam/v1/local/chain/run_cnn_e2eali_1a.sh b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1a.sh similarity index 91% rename from egs/iam/v1/local/chain/run_cnn_e2eali_1a.sh rename to egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1a.sh index ba28f681708..9a01688ba35 100755 --- a/egs/iam/v1/local/chain/run_cnn_e2eali_1a.sh +++ 
b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1a.sh @@ -22,6 +22,7 @@ stage=0 nj=30 train_set=train +decode_val=true nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. affix=_1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. e2echain_model_dir=exp/chain/e2e_cnn_1a @@ -42,7 +43,9 @@ tdnn_dim=450 # training options srand=0 remove_egs=true -lang_test=lang_unk +lang_decode=data/lang +lang_rescore=data/lang_rescore_6g +if $decode_val; then maybe_val=val; else maybe_val= ; fi # End configuration section. echo "$0 $@" # Print the command line for logging @@ -132,7 +135,7 @@ if [ $stage -le 4 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) cnn_opts="l2-regularize=0.075" tdnn_opts="l2-regularize=0.075" output_opts="l2-regularize=0.1" @@ -228,18 +231,26 @@ if [ $stage -le 6 ]; then # as long as phones.txt was compatible. utils/mkgraph.sh \ - --self-loop-scale 1.0 data/$lang_test \ + --self-loop-scale 1.0 $lang_decode \ $dir $dir/graph || exit 1; fi if [ $stage -le 7 ]; then frames_per_chunk=$(echo $chunk_width | cut -d, -f1) - steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context $chunk_left_context \ - --extra-right-context $chunk_right_context \ - --extra-left-context-initial 0 \ - --extra-right-context-final 0 \ - --frames-per-chunk $frames_per_chunk \ - --nj $nj --cmd "$cmd" \ - $dir/graph data/test $dir/decode_test || exit 1; + for decode_set in test $maybe_val; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/$decode_set $dir/decode_$decode_set || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/$decode_set $dir/decode_${decode_set}{,_rescored} || exit 1 + done fi + +echo "Done. Date: $(date). 
Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/iam/v1/local/chain/run_cnn_e2eali_1b.sh b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1b.sh similarity index 86% rename from egs/iam/v1/local/chain/run_cnn_e2eali_1b.sh rename to egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1b.sh index 6d8cca876bf..28aa246f334 100755 --- a/egs/iam/v1/local/chain/run_cnn_e2eali_1b.sh +++ b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1b.sh @@ -2,15 +2,17 @@ # e2eali_1b is the same as e2eali_1a but uses unconstrained egs -# local/chain/compare_wer.sh /home/hhadian/kaldi-rnnlm/egs/iam/v1/exp/chain/cnn_e2eali_1a exp/chain/cnn_e2eali_1b +# local/chain/compare_wer.sh exp/chain/cnn_e2eali_1a exp/chain/cnn_e2eali_1b # System cnn_e2eali_1a cnn_e2eali_1b -# WER 12.79 12.23 -# CER 5.73 5.48 -# Final train prob -0.0556 -0.0367 -# Final valid prob -0.0795 -0.0592 -# Final train prob (xent) -0.9178 -0.8382 -# Final valid prob (xent) -1.0604 -0.9853 -# Parameters 3.95M 3.95M +# WER 10.40 10.33 +# WER (rescored) 10.02 10.10 +# CER 4.97 5.00 +# CER (rescored) 4.83 4.88 +# Final train prob -0.0612 -0.0428 +# Final valid prob -0.0857 -0.0666 +# Final train prob (xent) -0.8990 -0.9210 +# Final valid prob (xent) -1.0024 -1.0264 +# Parameters 3.98M 3.98M # steps/info/chain_dir_info.pl exp/chain/cnn_e2eali_1b # exp/chain/cnn_e2eali_1b: num-iters=21 nj=2..4 num-params=4.0M dim=40->360 combine=-0.038->-0.038 (over 1) xent:train/valid[13,20,final]=(-1.34,-0.967,-0.838/-1.40,-1.07,-0.985) logprob:train/valid[13,20,final]=(-0.075,-0.054,-0.037/-0.083,-0.072,-0.059) @@ -21,6 +23,7 @@ stage=0 nj=30 train_set=train +decode_val=true nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. affix=_1b #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. e2echain_model_dir=exp/chain/e2e_cnn_1a @@ -41,7 +44,10 @@ tdnn_dim=450 # training options srand=0 remove_egs=true -lang_test=lang_unk +lang_decode=data/lang +lang_rescore=data/lang_rescore_6g +if $decode_val; then maybe_val=val; else maybe_val= ; fi + # End configuration section. echo "$0 $@" # Print the command line for logging @@ -131,7 +137,7 @@ if [ $stage -le 4 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) cnn_opts="l2-regularize=0.075" tdnn_opts="l2-regularize=0.075" output_opts="l2-regularize=0.1" @@ -227,18 +233,26 @@ if [ $stage -le 6 ]; then # as long as phones.txt was compatible. 
utils/mkgraph.sh \ - --self-loop-scale 1.0 data/$lang_test \ + --self-loop-scale 1.0 $lang_decode \ $dir $dir/graph || exit 1; fi if [ $stage -le 7 ]; then frames_per_chunk=$(echo $chunk_width | cut -d, -f1) - steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context $chunk_left_context \ - --extra-right-context $chunk_right_context \ - --extra-left-context-initial 0 \ - --extra-right-context-final 0 \ - --frames-per-chunk $frames_per_chunk \ - --nj $nj --cmd "$cmd" \ - $dir/graph data/test $dir/decode_test || exit 1; + for decode_set in test $maybe_val; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/$decode_set $dir/decode_$decode_set || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/$decode_set $dir/decode_${decode_set}{,_rescored} || exit 1 + done fi + +echo "Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1c.sh b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1c.sh new file mode 100755 index 00000000000..f158317950a --- /dev/null +++ b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1c.sh @@ -0,0 +1,259 @@ +#!/bin/bash + +# e2eali_1c is the same as e2eali_1b but has fewer CNN layers, smaller +# l2-regularize, more epochs and uses dropout. + + +# local/chain/compare_wer.sh exp/chain/cnn_e2eali_1b exp/chain/cnn_e2eali_1c +# System cnn_e2eali_1b cnn_e2eali_1c +# WER 10.33 10.05 +# WER (rescored) 10.10 9.75 +# CER 5.00 4.76 +# CER (rescored) 4.88 4.68 +# Final train prob -0.0428 -0.0317 +# Final valid prob -0.0666 -0.0630 +# Final train prob (xent) -0.9210 -0.5413 +# Final valid prob (xent) -1.0264 -0.7096 +# Parameters 3.98M 5.12M + +# steps/info/chain_dir_info.pl exp/chain/cnn_e2eali_1c +# exp/chain/cnn_e2eali_1c: num-iters=21 nj=2..4 num-params=5.1M dim=40->392 combine=-0.034->-0.034 (over 1) xent:train/valid[13,20,final]=(-0.953,-0.800,-0.541/-1.03,-0.933,-0.710) logprob:train/valid[13,20,final]=(-0.069,-0.048,-0.032/-0.091,-0.078,-0.063) + +set -e -o pipefail + +stage=0 + +nj=30 +train_set=train +decode_val=true +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1c #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +e2echain_model_dir=exp/chain/e2e_cnn_1a +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +frame_subsampling_factor=4 +# training chunk-options +chunk_width=340,300,200,100 +num_leaves=500 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +tdnn_dim=550 +# training options +srand=0 +remove_egs=true +lang_decode=data/lang +lang_rescore=data/lang_rescore_6g +if $decode_val; then maybe_val=val; else maybe_val= ; fi +dropout_schedule='0,0@0.20,0.2@0.50,0' +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). 
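+  # (A lattice of alignments, rather than the single best path, lets the chain
+  #  numerator graph choose among alternative alignments within the configured
+  #  left/right tolerances instead of being pinned to the e2e model's 1-best
+  #  alignment.)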
+ # use the same num-jobs as the alignments + steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --acoustic-scale 1.0 \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + ${train_data_dir} data/lang $e2echain_model_dir $lat_dir + echo "" >$lat_dir/splice_opts +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves ${train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + cnn_opts="l2-regularize=0.03 dropout-proportion=0.0" + tdnn_opts="l2-regularize=0.03" + output_opts="l2-regularize=0.04" + common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + + conv-relu-batchnorm-dropout-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-dropout-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-dropout-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-dropout-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-dropout-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + relu-batchnorm-dropout-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + relu-batchnorm-dropout-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + relu-batchnorm-dropout-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
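+  # For example, with xent_regularize=0.1 the factor is 0.5/0.1 = 5.0, so the
+  # xent output layer trains with 5x the base learning rate, offsetting the
+  # 0.1 weight on its objective.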
+ relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=500" \ + --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.alignment-subsampling-factor=1 \ + --chain.left-tolerance 3 \ + --chain.right-tolerance 3 \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=8 \ + --trainer.frames-per-iter=2000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=4 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=64,32 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0 --constrained false" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 $lang_decode \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + for decode_set in test $maybe_val; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/$decode_set $dir/decode_$decode_set || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/$decode_set $dir/decode_${decode_set}{,_rescored} || exit 1 + done +fi + +echo "Done. Date: $(date). 
Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh new file mode 100755 index 00000000000..1c44057454a --- /dev/null +++ b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh @@ -0,0 +1,259 @@ +#!/bin/bash + +# e2eali_1d is the same as e2eali_1c but has more CNN layers, different filter size +# smaller lm-opts, minibatch, frams-per-iter, less epochs and more initial/finaljobs. + +# local/chain/compare_wer.sh exp/chain/e2e_cnn_1b/ exp/chain/cnn_e2eali_1d +# System e2e_cnn_1b cnn_e2eali_1d +# WER 13.91 8.80 +# WER (rescored) 13.64 8.52 +# CER 7.08 4.06 +# CER (rescored) 6.82 3.98 +# Final train prob 0.0148 -0.0524 +# Final valid prob 0.0105 -0.0713 +# Final train prob (xent) -0.4695 +# Final valid prob (xent) -0.5310 +# Parameters 9.52M 4.36M + +# steps/info/chain_dir_info.pl exp/chain/cnn_e2eali_1d +# exp/chain/cnn_e2eali_1d: num-iters=30 nj=3..5 num-params=4.4M dim=40->400 combine=-0.055->-0.055 (over 1) xent:train/valid[19,29,final]=(-0.683,-0.489,-0.469/-0.703,-0.544,-0.531) logprob:train/valid[19,29,final]=(-0.090,-0.057,-0.052/-0.107,-0.076,-0.071) +set -e -o pipefail + +stage=0 + +nj=30 +train_set=train +decode_val=true +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1d #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +e2echain_model_dir=exp/chain/e2e_cnn_1b +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +frame_subsampling_factor=4 +# training chunk-options +chunk_width=340,300,200,100 +num_leaves=500 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +tdnn_dim=550 +# training options +srand=0 +remove_egs=true +lang_decode=data/lang +lang_rescore=data/lang_rescore_6g +if $decode_val; then maybe_val=val; else maybe_val= ; fi +dropout_schedule='0,0@0.20,0.2@0.50,0' +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --acoustic-scale 1.0 \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + ${train_data_dir} data/lang $e2echain_model_dir $lat_dir + echo "" >$lat_dir/splice_opts +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves ${train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + cnn_opts="l2-regularize=0.03 dropout-proportion=0.0" + tdnn_opts="l2-regularize=0.03" + output_opts="l2-regularize=0.04" + common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + + conv-relu-batchnorm-dropout-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-dropout-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-dropout-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-dropout-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-dropout-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common3 height-subsample-out=2 + conv-relu-batchnorm-dropout-layer name=cnn6 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + relu-batchnorm-dropout-layer name=tdnn1 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + relu-batchnorm-dropout-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + relu-batchnorm-dropout-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=true \ + --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=1000" \ + --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.alignment-subsampling-factor=1 \ + --chain.left-tolerance 3 \ + --chain.right-tolerance 3 \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=5 \ + --trainer.frames-per-iter=1500000 \ + --trainer.optimization.num-jobs-initial=3 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=32,16 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0 --constrained false" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 $lang_decode \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + for decode_set in test $maybe_val; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/$decode_set $dir/decode_$decode_set || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/$decode_set $dir/decode_${decode_set}{,_rescored} || exit 1 + done +fi + + +echo "Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/iam/v1/local/chain/run_flatstart_cnn1a.sh b/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1a.sh similarity index 84% rename from egs/iam/v1/local/chain/run_flatstart_cnn1a.sh rename to egs/iam/v2/local/chain/tuning/run_e2e_cnn_1a.sh index 56c897137f4..cb2bfa0a82d 100755 --- a/egs/iam/v1/local/chain/run_flatstart_cnn1a.sh +++ b/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1a.sh @@ -2,19 +2,21 @@ # Copyright 2017 Hossein Hadian # This script does end2end chain training (i.e. 
from scratch) - -# local/chain/compare_wer.sh exp/chain/cnn_1a exp/chain/cnn_chainali_1c exp/chain/e2e_cnn_1a -# System cnn_1a cnn_chainali_1c e2e_cnn_1a -# WER 18.52 12.72 13.87 -# CER 10.07 5.99 6.54 -# Final train prob -0.0077 -0.0291 -0.0371 -# Final valid prob -0.0970 -0.0359 -0.0636 -# Final train prob (xent) -0.5484 -0.9781 -# Final valid prob (xent) -0.9643 -1.1544 -# Parameters 4.36M 3.96M 9.13M +# ./local/chain/compare_wer.sh exp/chain/e2e_cnn_1a/ +# System e2e_cnn_1a +# WER 11.24 +# WER (rescored) 10.80 +# CER 5.32 +# CER (rescored) 5.24 +# Final train prob 0.0568 +# Final valid prob 0.0381 +# Final train prob (xent) +# Final valid prob (xent) +# Parameters 9.13M # steps/info/chain_dir_info.pl exp/chain/e2e_cnn_1a -# exp/chain/e2e_cnn_1a: num-iters=21 nj=2..4 num-params=9.1M dim=40->12640 combine=-0.033->-0.033 (over 1) logprob:train/valid[13,20,final]=(-0.058,-0.042,-0.035/-0.070,-0.064,-0.059) +# exp/chain/e2e_cnn_1a: num-iters=42 nj=2..4 num-params=9.1M dim=40->12640 combine=0.049->0.049 (over 1) logprob:train/valid[27,41,final]=(0.035,0.055,0.057/0.016,0.037,0.038) + set -e @@ -23,6 +25,7 @@ stage=0 train_stage=-10 get_egs_stage=-10 affix=1a +nj=30 # training options tdnn_dim=450 @@ -35,7 +38,9 @@ l2_regularize=0.00005 frames_per_iter=1000000 cmvn_opts="--norm-means=true --norm-vars=true" train_set=train -lang_test=lang_unk +decode_val=true +lang_decode=data/lang +lang_rescore=data/lang_rescore_6g # End configuration section. echo "$0 $@" # Print the command line for logging @@ -95,7 +100,6 @@ if [ $stage -le 2 ]; then mkdir -p $dir/configs cat < $dir/configs/network.xconfig input dim=40 name=input - conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 @@ -106,7 +110,6 @@ if [ $stage -le 2 ]; then relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim $tdnn_opts relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts - ## adding the layers for chain branch relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $output_opts output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts @@ -155,15 +158,19 @@ if [ $stage -le 4 ]; then # as long as phones.txt was compatible. utils/mkgraph.sh \ - --self-loop-scale 1.0 data/$lang_test \ + --self-loop-scale 1.0 $lang_decode \ $dir $dir/graph || exit 1; fi if [ $stage -le 5 ]; then - frames_per_chunk=$(echo $chunk_width | cut -d, -f1) - steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --nj 30 --cmd "$cmd" \ - $dir/graph data/test $dir/decode_test || exit 1; + for decode_set in test $maybe_val; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/$decode_set $dir/decode_$decode_set || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/$decode_set $dir/decode_${decode_set}{,_rescored} || exit 1 + done fi echo "Done. Date: $(date). 
Results:" diff --git a/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1b.sh b/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1b.sh new file mode 100755 index 00000000000..d5f79602695 --- /dev/null +++ b/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1b.sh @@ -0,0 +1,163 @@ +#!/bin/bash +# Copyright 2017 Hossein Hadian + +# This script does end2end chain training (i.e. from scratch) +# ./local/chain/compare_wer.sh exp/chain/e2e_cnn_1b/ +# System e2e_cnn_1b +# WER 13.59 +# WER (rescored) 13.27 +# CER 6.92 +# CER (rescored) 6.71 +# Final train prob 0.0345 +# Final valid prob 0.0269 +# Final train prob (xent) +# Final valid prob (xent) +# Parameters 9.52M + +# steps/info/chain_dir_info.pl exp/chain/e2e_cnn_1b +# exp/chain/e2e_cnn_1b: num-iters=42 nj=2..4 num-params=9.5M dim=40->12640 combine=0.041->0.041 (over 2) logprob:train/valid[27,41,final]=(0.032,0.035,0.035/0.025,0.026,0.027) +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +affix=1b +nj=30 + +# training options +tdnn_dim=450 +minibatch_size=150=100,64/300=50,32/600=25,16/1200=16,8 +common_egs_dir= +train_set=train +decode_val=true +lang_decode=data/lang +lang_rescore=data/lang_rescore_6g +if $decode_val; then maybe_val=val; else maybe_val= ; fi +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 1 ]; then + steps/nnet3/chain/e2e/prepare_e2e.sh --nj 30 --cmd "$cmd" \ + --shared-phones true \ + --type biphone \ + data/$train_set $lang $treedir + $cmd $treedir/log/make_phone_lm.log \ + cat data/$train_set/text \| \ + steps/nnet3/chain/e2e/text_to_phones.py data/lang \| \ + utils/sym2int.pl -f 2- data/lang/phones.txt \| \ + chain-est-phone-lm --num-extra-lm-states=500 \ + ark:- $treedir/phone_lm.fst +fi + +if [ $stage -le 2 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') + common1="height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="height-offsets=-2,-1,0,1,2 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn4 input=Append(-4,0,4) dim=$tdnn_dim + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $output_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs +fi + +if [ $stage -le 3 ]; then + # no need to store the egs in a shared storage because we always + # remove them. Anyway, it takes only 5 minutes to generate them. 
+ + steps/nnet3/chain/e2e/train_e2e.py --stage $train_stage \ + --cmd "$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--num_egs_diagnostic 100 --num_utts_subset 400" \ + --chain.frame-subsampling-factor 4 \ + --chain.alignment-subsampling-factor 4 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1000000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.momentum 0 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 4 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.shrink-value 1.0 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir data/${train_set} \ + --tree-dir $treedir \ + --dir $dir || exit 1; +fi + +if [ $stage -le 4 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 $lang_decode \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 5 ]; then + for decode_set in test $maybe_val; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/$decode_set $dir/decode_$decode_set || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/$decode_set $dir/decode_${decode_set}{,_rescored} || exit 1 + done +fi + +echo "Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/iam/v2/local/check_tools.sh b/egs/iam/v2/local/check_tools.sh new file mode 100755 index 00000000000..5b4d3107d3b --- /dev/null +++ b/egs/iam/v2/local/check_tools.sh @@ -0,0 +1,43 @@ +#!/bin/bash -u + +# Copyright 2015 (c) Johns Hopkins University (Jan Trmal ) + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + +[ -f ./path.sh ] && . ./path.sh +set +e + +command -v python3 >&/dev/null \ + || { echo >&2 "python3 not found on PATH. You will have to install Python3, preferably >= 3.6"; exit 1; } + +python3 -c "import numpy" +if [ $? -ne 0 ] ; then + echo >&2 "This recipe needs numpy installed." + exit 1 +fi + +python3 -c "import scipy" +if [ $? -ne 0 ] ; then + echo >&2 "This recipe needs scipy installed." + exit 1 +fi + +python3 -c "import scipy.misc; scipy.misc.__dict__['imread']" +if [ $? -ne 0 ] ; then + echo >&2 "This recipe needs scipy-image and Pillow installed." 
+ exit 1 +fi + + +exit 0 diff --git a/egs/iam/v2/local/extract_features.sh b/egs/iam/v2/local/extract_features.sh new file mode 100755 index 00000000000..1741ad3f9b2 --- /dev/null +++ b/egs/iam/v2/local/extract_features.sh @@ -0,0 +1,48 @@ +#!/bin/bash + +# Copyright 2017 Yiwen Shao +# 2018 Ashish Arora + +# Apache 2.0 +# This script runs the make features script in parallel. + +nj=4 +cmd=run.pl +feat_dim=40 +augment=false +fliplr=false +echo "$0 $@" + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh || exit 1; + +data=$1 +featdir=$data/data +scp=$data/images.scp +logdir=$data/log + +mkdir -p $logdir +mkdir -p $featdir + +# make $featdir an absolute pathname +featdir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $featdir ${PWD}` + +for n in $(seq $nj); do + split_scps="$split_scps $logdir/images.$n.scp" +done + +# split images.scp +utils/split_scp.pl $scp $split_scps || exit 1; + +$cmd JOB=1:$nj $logdir/extract_features.JOB.log \ + local/make_features.py $logdir/images.JOB.scp \ + --allowed_len_file_path $data/allowed_lengths.txt \ + --feat-dim $feat_dim --fliplr $fliplr --augment $augment \| \ + copy-feats --compress=true --compression-method=7 \ + ark:- ark,scp:$featdir/images.JOB.ark,$featdir/images.JOB.scp + +## aggregates the output scp's to get feats.scp +for n in $(seq $nj); do + cat $featdir/images.$n.scp || exit 1; +done > $data/feats.scp || exit 1 diff --git a/egs/iam/v2/local/gen_topo.py b/egs/iam/v2/local/gen_topo.py new file mode 100755 index 00000000000..8ffc59c5788 --- /dev/null +++ b/egs/iam/v2/local/gen_topo.py @@ -0,0 +1,93 @@ +#!/usr/bin/env python + +# Copyright 2017 (author: Chun-Chieh Chang) + +# Generate a topology file. This allows control of the number of states in the +# non-silence HMMs, and in the silence HMMs. This is a modified version of +# 'utils/gen_topo.pl'. The difference is that this creates two topologies for +# the non-silence HMMs. The number of states for punctuations is different than +# the number of states for other characters. + +from __future__ import print_function +from __future__ import division +import argparse +import string + +parser = argparse.ArgumentParser(description="Usage: steps/nnet3/chain/gen_topo.py " + " " + "e.g.: steps/nnet3/chain/gen_topo.pl 4:5:6:7:8:9:10 1:2:3\n", + epilog="See egs/swbd/s5c/local/chain/train_tdnn_a.sh for example of usage."); +parser.add_argument("num_nonsil_states", type=int, help="number of states for nonsilence phones"); +parser.add_argument("num_sil_states", type=int, help="number of states for silence phones"); +parser.add_argument("num_punctuation_states", type=int, help="number of states for punctuation"); +parser.add_argument("nonsilence_phones", + help="List of non-silence phones as integers, separated by colons, e.g. 4:5:6:7:8:9"); +parser.add_argument("silence_phones", + help="List of silence phones as integers, separated by colons, e.g. 
1:2:3"); +parser.add_argument("phone_list", help="file containing all phones and their corresponding number."); + +args = parser.parse_args() + +silence_phones = [ int(x) for x in args.silence_phones.split(":") ] +nonsilence_phones = [ int(x) for x in args.nonsilence_phones.split(":") ] +all_phones = silence_phones + nonsilence_phones + +punctuation_phones = [] +exclude = set("!(),.?;:'-\"") +with open(args.phone_list) as f: + for line in f: + line = line.strip() + phone = line.split(' ')[0] + if len(phone) == 1 and phone in exclude: + punctuation_phones.append(int(line.split(' ')[1])) +# For nonsilence phones that are not punctuations +print("") +print("") +print("") +print(" ".join([str(x) for x in nonsilence_phones if x not in punctuation_phones])) +print("") +for x in range(0, args.num_nonsil_states): + xp1 = x + 1 + print(" {0} {0} {0} 0.75 {1} 0.25 ".format(x, xp1)) +print(" {} ".format(args.num_nonsil_states)) +print("") + +# For nonsilence phones that ar punctuations +print("") +print("") +print(" ".join([str(x) for x in nonsilence_phones if x in punctuation_phones])) +print("") +for x in range(0, args.num_punctuation_states): + xp1 = x + 1 + print(" {0} {0} {0} 0.75 {1} 0.25 ".format(x, xp1)) +print(" {} ".format(args.num_punctuation_states)) +print("") + +# For silence phones +print("") +print("") +print(" ".join([str(x) for x in silence_phones])) +print("") +if(args.num_sil_states > 1): + transp = 1.0 / (args.num_sil_states - 1) + + state_str = " 0 0 " + for x in range(0, (args.num_sil_states - 1)): + state_str = "{} {} {} ".format(state_str, x, transp)) + state_str = state_str + "" + print(state_str) + + for x in range(1, (args.num_sil_states - 1)): + state_str = " {0} " + print(state_str) + second_last = args.num_sil_states - 1 + print(" {0} {0} {0} 0.75 {1} 0.25 ".format(second_last, args.num_sil_states)) + print(" {} ".format(args.num_sil_states)) +else: + print(" 0 0 0 0.75 1 0.25 ") + print(" {} ".format(args.num_sil_states)) +print("") +print("") diff --git a/egs/iam/v2/local/make_features.py b/egs/iam/v2/local/make_features.py new file mode 100755 index 00000000000..3ce501732cf --- /dev/null +++ b/egs/iam/v2/local/make_features.py @@ -0,0 +1,266 @@ +#!/usr/bin/env python3 + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora +# 2017 Yiwen Shao +# 2018 Hossein Hadian + +""" This script converts images to Kaldi-format feature matrices. The input to + this script is the path to a data directory, e.g. "data/train". This script + reads the images listed in images.scp and writes them to standard output + (by default) as Kaldi-formatted matrices (in text form). It also scales the + images so they have the same height (via --feat-dim). It can optionally pad + the images (on left/right sides) with white pixels. + If an 'image2num_frames' file is found in the data dir, it will be used + to enforce the images to have the specified length in that file by padding + white pixels (the --padding option will be ignored in this case). This relates + to end2end chain training. + eg. 
local/make_features.py data/train --feat-dim 40 +""" +import random +import argparse +import os +import sys +import scipy.io as sio +import numpy as np +from scipy import misc +from scipy.ndimage.interpolation import affine_transform +import math +from signal import signal, SIGPIPE, SIG_DFL +signal(SIGPIPE, SIG_DFL) + +parser = argparse.ArgumentParser(description="""Converts images (in 'dir'/images.scp) to features and + writes them to standard output in text format.""") +parser.add_argument('images_scp_path', type=str, + help='Path of images.scp file') +parser.add_argument('--allowed_len_file_path', type=str, default=None, + help='If supplied, each images will be padded to reach the ' + 'target length (this overrides --padding).') +parser.add_argument('--out-ark', type=str, default='-', + help='Where to write the output feature file') +parser.add_argument('--feat-dim', type=int, default=40, + help='Size to scale the height of all images') +parser.add_argument('--padding', type=int, default=5, + help='Number of white pixels to pad on the left' + 'and right side of the image.') +parser.add_argument('--fliplr', type=lambda x: (str(x).lower()=='true'), default=False, + help="Flip the image left-right for right to left languages") +parser.add_argument("--augment", type=lambda x: (str(x).lower()=='true'), default=False, + help="performs image augmentation") +args = parser.parse_args() + + +def write_kaldi_matrix(file_handle, matrix, key): + file_handle.write(key + " [ ") + num_rows = len(matrix) + if num_rows == 0: + raise Exception("Matrix is empty") + num_cols = len(matrix[0]) + + for row_index in range(len(matrix)): + if num_cols != len(matrix[row_index]): + raise Exception("All the rows of a matrix are expected to " + "have the same length") + file_handle.write(" ".join(map(lambda x: str(x), matrix[row_index]))) + if row_index != num_rows - 1: + file_handle.write("\n") + file_handle.write(" ]\n") + + +def horizontal_pad(im, allowed_lengths = None): + if allowed_lengths is None: + left_padding = right_padding = args.padding + else: # Find an allowed length for the image + imlen = im.shape[1] # width + allowed_len = 0 + for l in allowed_lengths: + if l > imlen: + allowed_len = l + break + if allowed_len == 0: + # No allowed length was found for the image (the image is too long) + return None + padding = allowed_len - imlen + left_padding = int(padding // 2) + right_padding = padding - left_padding + dim_y = im.shape[0] # height + im_pad = np.concatenate((255 * np.ones((dim_y, left_padding), + dtype=int), im), axis=1) + im_pad1 = np.concatenate((im_pad, 255 * np.ones((dim_y, right_padding), + dtype=int)), axis=1) + return im_pad1 + +def get_scaled_image_aug(im, mode='normal'): + scale_size = args.feat_dim + sx = im.shape[1] + sy = im.shape[0] + scale = (1.0 * scale_size) / sy + nx = int(scale_size) + ny = int(scale * sx) + scale_size = random.randint(10, 30) + scale = (1.0 * scale_size) / sy + down_nx = int(scale_size) + down_ny = int(scale * sx) + if mode == 'normal': + im = misc.imresize(im, (nx, ny)) + return im + else: + im_scaled_down = misc.imresize(im, (down_nx, down_ny)) + im_scaled_up = misc.imresize(im_scaled_down, (nx, ny)) + return im_scaled_up + return im + +def contrast_normalization(im, low_pct, high_pct): + element_number = im.size + rows = im.shape[0] + cols = im.shape[1] + im_contrast = np.zeros(shape=im.shape) + low_index = int(low_pct * element_number) + high_index = int(high_pct * element_number) + sorted_im = np.sort(im, axis=None) + low_thred = sorted_im[low_index] + 
high_thred = sorted_im[high_index] + for i in range(rows): + for j in range(cols): + if im[i, j] > high_thred: + im_contrast[i, j] = 255 # lightest to white + elif im[i, j] < low_thred: + im_contrast[i, j] = 0 # darkest to black + else: + # linear normalization + im_contrast[i, j] = (im[i, j] - low_thred) * \ + 255 / (high_thred - low_thred) + return im_contrast + + +def geometric_moment(frame, p, q): + m = 0 + for i in range(frame.shape[1]): + for j in range(frame.shape[0]): + m += (i ** p) * (j ** q) * frame[i][i] + return m + + +def central_moment(frame, p, q): + u = 0 + x_bar = geometric_moment(frame, 1, 0) / \ + geometric_moment(frame, 0, 0) # m10/m00 + y_bar = geometric_moment(frame, 0, 1) / \ + geometric_moment(frame, 0, 0) # m01/m00 + for i in range(frame.shape[1]): + for j in range(frame.shape[0]): + u += ((i - x_bar)**p) * ((j - y_bar)**q) * frame[i][j] + return u + + +def height_normalization(frame, w, h): + frame_normalized = np.zeros(shape=(h, w)) + alpha = 4 + x_bar = geometric_moment(frame, 1, 0) / \ + geometric_moment(frame, 0, 0) # m10/m00 + y_bar = geometric_moment(frame, 0, 1) / \ + geometric_moment(frame, 0, 0) # m01/m00 + sigma_x = (alpha * ((central_moment(frame, 2, 0) / + geometric_moment(frame, 0, 0)) ** .5)) # alpha * sqrt(u20/m00) + sigma_y = (alpha * ((central_moment(frame, 0, 2) / + geometric_moment(frame, 0, 0)) ** .5)) # alpha * sqrt(u02/m00) + for x in range(w): + for y in range(h): + i = int((x / w - 0.5) * sigma_x + x_bar) + j = int((y / h - 0.5) * sigma_y + y_bar) + frame_normalized[x][y] = frame[i][j] + return frame_normalized + + +def find_slant_project(im): + rows = im.shape[0] + cols = im.shape[1] + std_max = 0 + alpha_max = 0 + col_disp = np.zeros(90, int) + proj = np.zeros(shape=(90, cols + 2 * rows), dtype=int) + for r in range(rows): + for alpha in range(-45, 45, 1): + col_disp[alpha] = int(r * math.tan(alpha / 180.0 * math.pi)) + for c in range(cols): + if im[r, c] < 100: + for alpha in range(-45, 45, 1): + proj[alpha + 45, c + col_disp[alpha] + rows] += 1 + for alpha in range(-45, 45, 1): + proj_histogram, bin_array = np.histogram(proj[alpha + 45, :], bins=10) + proj_std = np.std(proj_histogram) + if proj_std > std_max: + std_max = proj_std + alpha_max = alpha + proj_std = np.std(proj, axis=1) + return -alpha_max + + +def horizontal_shear(im, degree): + rad = degree / 180.0 * math.pi + padding_x = int(abs(np.tan(rad)) * im.shape[0]) + padding_y = im.shape[0] + if rad > 0: + im_pad = np.concatenate( + (255 * np.ones((padding_y, padding_x), dtype=int), im), axis=1) + elif rad < 0: + im_pad = np.concatenate( + (im, 255 * np.ones((padding_y, padding_x), dtype=int)), axis=1) + else: + im_pad = im + shear_matrix = np.array([[1, 0], + [np.tan(rad), 1]]) + sheared_im = affine_transform(im_pad, shear_matrix, cval=255.0) + return sheared_im + + +### main ### +random.seed(1) +data_list_path = args.images_scp_path +if args.out_ark == '-': + out_fh = sys.stdout +else: + out_fh = open(args.out_ark,'w') + +allowed_lengths = None +allowed_len_handle = args.allowed_len_file_path +if os.path.isfile(allowed_len_handle): + print("Found 'allowed_lengths.txt' file...", file=sys.stderr) + allowed_lengths = [] + with open(allowed_len_handle) as f: + for line in f: + allowed_lengths.append(int(line.strip())) + print("Read {} allowed lengths and will apply them to the " + "features.".format(len(allowed_lengths)), file=sys.stderr) + +num_fail = 0 +num_ok = 0 +aug_setting = ['normal', 'scaled'] +with open(data_list_path) as f: + for line in f: + line = line.strip() + 
line_vect = line.split(' ') + image_id = line_vect[0] + image_path = line_vect[1] + im = misc.imread(image_path) + if args.fliplr: + im = np.fliplr(im) + if args.augment: + im_aug = get_scaled_image_aug(im, aug_setting[0]) + im_contrast = contrast_normalization(im_aug, 0.05, 0.2) + slant_degree = find_slant_project(im_contrast) + im_sheared = horizontal_shear(im_contrast, slant_degree) + im_aug = im_sheared + else: + im_aug = get_scaled_image_aug(im, aug_setting[0]) + im_horizontal_padded = horizontal_pad(im_aug, allowed_lengths) + if im_horizontal_padded is None: + num_fail += 1 + continue + data = np.transpose(im_horizontal_padded, (1, 0)) + data = np.divide(data, 255.0) + num_ok += 1 + write_kaldi_matrix(out_fh, data, image_id) + +print('Generated features for {} images. Failed for {} (image too ' + 'long).'.format(num_ok, num_fail), file=sys.stderr) diff --git a/egs/iam/v2/local/prepare_data.sh b/egs/iam/v2/local/prepare_data.sh new file mode 100755 index 00000000000..cf729d9a939 --- /dev/null +++ b/egs/iam/v2/local/prepare_data.sh @@ -0,0 +1,191 @@ +#!/bin/bash + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora +# 2017 Hossein Hadian +# Apache 2.0 + +# This script downloads the IAM handwriting database and prepares the training +# and test data (i.e text, images.scp, utt2spk and spk2utt) by calling process_data.py. +# It also downloads the LOB and Brown text corpora. It downloads the database files +# only if they do not already exist in download directory. + +# Eg. local/prepare_data.sh +# Eg. text file: 000_a01-000u-00 A MOVE to stop Mr. Gaitskell from +# utt2spk file: 000_a01-000u-00 000 +# images.scp file: 000_a01-000u-00 data/local/lines/a01/a01-000u/a01-000u-00.png +# spk2utt file: 000 000_a01-000u-00 000_a01-000u-01 000_a01-000u-02 000_a01-000u-03 + +stage=0 +download_dir=data/download +process_aachen_split=false +wellington_dir= +username= +password= # username and password for downloading the IAM database + # if you have not already downloaded the database, please + # register at http://www.fki.inf.unibe.ch/databases/iam-handwriting-database + # and provide this script with your username and password. + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh || exit 1; + +if [[ ! -f $download_dir/lines.tgz && -z $username ]]; then + echo "$0: Warning: Couldn't find lines.tgz in $download_dir. Unless the extracted dataset files" + echo "exist in your data/local directory this script will fail because the required files" + echo "can't be downloaded automatically (it needs registration)." + echo "Please register at http://www.fki.inf.unibe.ch/databases/iam-handwriting-database" + echo "... 
and then call this script again with --username --password " + echo "" + exit 1 +fi + +lines=data/local/lines +xml=data/local/xml +ascii=data/local/ascii +bcorpus=data/local/browncorpus +lobcorpus=data/local/lobcorpus +wcorpus=data/local/wellingtoncorpus +data_split_info=data/local/largeWriterIndependentTextLineRecognitionTask +lines_url=http://www.fki.inf.unibe.ch/DBs/iamDB/data/lines/lines.tgz +xml_url=http://www.fki.inf.unibe.ch/DBs/iamDB/data/xml/xml.tgz +data_split_info_url=http://www.fki.inf.unibe.ch/DBs/iamDB/tasks/largeWriterIndependentTextLineRecognitionTask.zip +ascii_url=http://www.fki.inf.unibe.ch/DBs/iamDB/data/ascii/ascii.tgz +brown_corpus_url=http://www.sls.hawaii.edu/bley-vroman/brown.txt +lob_corpus_url=http://ota.ox.ac.uk/text/0167.zip +wellington_corpus_loc=/export/corpora5/Wellington/WWC/ +aachen_split_url=http://www.openslr.org/resources/56/splits.zip +aachen_splits=data/local/aachensplits +mkdir -p $download_dir data/local + +# download and extact images and transcription +if [ -d $lines ]; then + echo "$0: Not downloading lines images as it is already there." +else + if [ ! -f $download_dir/lines.tgz ]; then + echo "$0: Trying to download lines images..." + wget -P $download_dir --user "$username" --password "$password" $lines_url || exit 1; + fi + mkdir -p $lines + tar -xzf $download_dir/lines.tgz -C $lines || exit 1; + echo "$0: Done downloading and extracting lines images" +fi + +if [ -d $xml ]; then + echo "$0: Not downloading transcriptions as it is already there." +else + if [ ! -f $download_dir/xml.tgz ]; then + echo "$0: Trying to download transcriptions..." + wget -P $download_dir --user "$username" --password "$password" $xml_url || exit 1; + fi + mkdir -p $xml + tar -xzf $download_dir/xml.tgz -C $xml || exit 1; + echo "$0: Done downloading and extracting transcriptions." +fi + +if [ -d $data_split_info ]; then + echo "$0: Not downloading data split information as it is already there." +else + if [ ! -f $download_dir/largeWriterIndependentTextLineRecognitionTask.zip ]; then + echo "$0: Trying to download training and testing data split information..." + wget -P $download_dir --user "$username" --password "$password" $data_split_info_url || exit 1; + fi + mkdir -p $data_split_info + unzip $download_dir/largeWriterIndependentTextLineRecognitionTask.zip -d $data_split_info || exit 1; + echo "$0: Done downloading and extracting training and testing data split information" +fi + +if [ -d $ascii ]; then + echo "$0: Not downloading ascii.tgz as it is already there." +else + if [ ! -f $download_dir/ascii.tgz ]; then + echo "$0: trying to download ascii.tgz..." + wget -P $download_dir --user "$username" --password "$password" $ascii_url || exit 1; + fi + mkdir -p $ascii + tar -xzf $download_dir/ascii.tgz -C $ascii || exit 1; + echo "$0: Done downloading and extracting ascii.tgz" +fi + +if [ -d $lobcorpus ]; then + echo "$0: Not downloading the LOB text corpus as it is already there." +else + if [ ! -f $lobcorpus/0167.zip ]; then + echo "$0: Downloading the LOB text corpus ..." + mkdir -p $lobcorpus + wget -P $lobcorpus/ $lob_corpus_url || exit 1; + fi + unzip $lobcorpus/0167.zip -d $lobcorpus || exit 1; + echo "$0: Done downloading and extracting LOB corpus" +fi + +if [ -d $bcorpus ]; then + echo "$0: Not downloading the Brown corpus as it is already there." +else + if [ ! -f $bcorpus/brown.txt ]; then + mkdir -p $bcorpus + echo "$0: Downloading the Brown text corpus..." 
+ wget -P $bcorpus $brown_corpus_url || exit 1; + fi + echo "$0: Done downloading the Brown text corpus" +fi + +if [ -d $wcorpus ]; then + echo "$0: Not copying Wellington corpus as it is already there." +elif [ ! -z $wellington_dir ]; then + mkdir -p $wcorpus + cp -r $wellington_dir/. $wcorpus + + # Combine Wellington corpora and replace some of their annotations + cat data/local/wellingtoncorpus/Section{A,B,C,D,E,F,G,H,J,K,L}.txt | \ + cut -d' ' -f3- | sed "s/^[ \t]*//" > data/local/wellingtoncorpus/Wellington_annotated.txt + + cat data/local/wellingtoncorpus/Wellington_annotated.txt | local/remove_wellington_annotations.py > data/local/wellingtoncorpus/Wellington_annotation_removed.txt + + echo "$0: Done copying Wellington corpus" +else + echo "$0: Wellington Corpus not included because wellington_dir not provided" +fi + +if [ -d $aachen_splits ]; then + echo "$0: Not downloading the Aachen splits as it is already there." +else + if [ ! -f $aachen_splits/splits.zip ]; then + echo "$0: Downloading Aachen splits ..." + mkdir -p $aachen_splits + wget -P $aachen_splits/ $aachen_split_url || exit 1; + fi + unzip $aachen_splits/splits.zip -d $aachen_splits || exit 1; + echo "$0: Done downloading and extracting Aachen splits" +fi + + +mkdir -p data/{train,test,val} +file_name=largeWriterIndependentTextLineRecognitionTask + +train_old="data/local/$file_name/trainset.txt" +test_old="data/local/$file_name/testset.txt" +val1_old="data/local/$file_name/validationset1.txt" +val2_old="data/local/$file_name/validationset2.txt" + +train_new="data/local/train.uttlist" +test_new="data/local/test.uttlist" +val_new="data/local/validation.uttlist" + +cat $train_old > $train_new +cat $test_old > $test_new +cat $val1_old $val2_old > $val_new + +if $process_aachen_split; then + local/process_aachen_splits.py data/local $aachen_splits/splits data/train --dataset train || exit 1 + local/process_aachen_splits.py data/local $aachen_splits/splits data/test --dataset test || exit 1 + local/process_aachen_splits.py data/local $aachen_splits/splits data/val --dataset validation || exit 1 +else + local/process_data.py data/local data/train --dataset train || exit 1 + local/process_data.py data/local data/test --dataset test || exit 1 + local/process_data.py data/local data/val --dataset validation || exit 1 +fi + +image/fix_data_dir.sh data/train +image/fix_data_dir.sh data/test +image/fix_data_dir.sh data/val diff --git a/egs/iam/v2/local/prepare_dict.sh b/egs/iam/v2/local/prepare_dict.sh new file mode 100755 index 00000000000..714b5b51788 --- /dev/null +++ b/egs/iam/v2/local/prepare_dict.sh @@ -0,0 +1,50 @@ +#!/usr/bin/env bash + +# Copyright 2017 Hossein Hadian +# 2017 Chun Chieh Chang +# 2017 Ashish Arora + +# This script prepares the dictionary. + +set -e +dir=data/local/dict +vocab_size=50000 +. ./utils/parse_options.sh + +mkdir -p $dir + +# First get the set of all letters that occur in data/train/text +cat data/train/text | \ + perl -ne '@A = split; shift @A; for(@A) {print join("\n", split(//)), "\n";}' | \ + sort -u | grep -v "|" > $dir/nonsilence_phones.txt + +# Now use the pocolm's wordlist which is the most N frequent words in +# in data/train/text and LOB+Brown corpora (dev and test excluded) with their comprising +# letters as their transcription. Only include words that use the above letters. 
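As an illustration of what the pipeline below produces (hypothetical entries; the real ones depend on the BPE wordlist), each surviving word is spelled out letter by letter, with '|' mapped to SIL:

# e.g. lexicon.txt lines of the form:
#   MOVE   M O V E
#   |the   SIL t h e
echo "MOVE" | perl -ne 'chomp; print "$_ ", join(" ", split(//)), "\n";'   # prints: MOVE M O V E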
+# (Letter # is replaced with ) + +export letters=$(cat $dir/nonsilence_phones.txt | tr -d "\n") + +head -n $vocab_size data/local/local_lm/data/word_count | awk '{print $2}' | \ + perl -e '$letters=$ENV{letters}; $letters=$letters . "|"; +while(<>){ + chop; + $w = $_; + if($w =~ m/^[$letters]+$/){ + $trans = join(" ", split(//, $w)); + $trans =~ s/#//g; + $trans =~ s/\|/SIL/g; + print "$w $trans\n"; + } +}' | sort -u > $dir/lexicon.txt + + +perl -i -pe "s/#//" $dir/nonsilence_phones.txt + +echo ' SIL' >> $dir/lexicon.txt + +echo SIL > $dir/silence_phones.txt + +echo SIL >$dir/optional_silence.txt + +echo -n "" >$dir/extra_questions.txt diff --git a/egs/iam/v2/local/process_aachen_splits.py b/egs/iam/v2/local/process_aachen_splits.py new file mode 100755 index 00000000000..cb6a6d4f0d8 --- /dev/null +++ b/egs/iam/v2/local/process_aachen_splits.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python3 + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora + +""" This script reads the extracted IAM database files and creates + the following files (for the data subset selected via --dataset): + text, utt2spk, images.scp. + + Eg. local/process_aachen_splits.py data/local data/train data --dataset train + Eg. text file: 000_a01-000u-00 A MOVE to stop Mr. Gaitskell from + utt2spk file: 000_a01-000u-00 000 + images.scp file: 000_a01-000u-00 data/local/lines/a01/a01-000u/a01-000u-00.png +""" + +import argparse +import os +import sys +import xml.dom.minidom as minidom + +parser = argparse.ArgumentParser(description="""Creates text, utt2spk + and images.scp files.""") +parser.add_argument('database_path', type=str, + help='Path to the downloaded (and extracted) IAM data') +parser.add_argument('split_path', type=str, + help='location of the train/test/val set') +parser.add_argument('out_dir', type=str, + help='location to write output files.') +parser.add_argument('--dataset', type=str, default='train', + choices=['train', 'test','validation'], + help='Subset of data to process.') +args = parser.parse_args() + +text_file = os.path.join(args.out_dir + '/', 'text') +text_fh = open(text_file, 'w') + +utt2spk_file = os.path.join(args.out_dir + '/', 'utt2spk') +utt2spk_fh = open(utt2spk_file, 'w') + +image_file = os.path.join(args.out_dir + '/', 'images.scp') +image_fh = open(image_file, 'w') + +dataset_path = os.path.join(args.split_path, + args.dataset + '.uttlist') + +text_file_path = os.path.join(args.database_path, + 'ascii','lines.txt') +text_dict = {} +def process_text_file_for_word_model(): + with open (text_file_path, 'rt') as in_file: + for line in in_file: + if line[0]=='#': + continue + line = line.strip() + utt_id = line.split(' ')[0] + text_vect = line.split(' ')[8:] + text = "".join(text_vect) + text = text.replace("|", " ") + text_dict[utt_id] = text + + +### main ### + +print("Processing '{}' data...".format(args.dataset)) +process_text_file_for_word_model() + +with open(dataset_path) as f: + for line in f: + line = line.strip() + line_vect = line.split('-') + xml_file = line_vect[0] + '-' + line_vect[1] + xml_path = os.path.join(args.database_path, 'xml', xml_file + '.xml') + doc = minidom.parse(xml_path) + form_elements = doc.getElementsByTagName('form')[0] + writer_id = form_elements.getAttribute('writer-id') + outerfolder = form_elements.getAttribute('id')[0:3] + innerfolder = form_elements.getAttribute('id') + lines_path = os.path.join(args.database_path, 'lines', + outerfolder, innerfolder) + for file in os.listdir(lines_path): + if file.endswith(".png"): + image_file_path = 
os.path.join(lines_path, file) + base_name = os.path.splitext(os.path.basename(image_file_path))[0] + text = text_dict[base_name] + utt_id = writer_id + '_' + base_name + text_fh.write(utt_id + ' ' + text + '\n') + utt2spk_fh.write(utt_id + ' ' + writer_id + '\n') + image_fh.write(utt_id + ' ' + image_file_path + '\n') diff --git a/egs/iam/v2/local/process_data.py b/egs/iam/v2/local/process_data.py new file mode 100755 index 00000000000..2adae7bf7be --- /dev/null +++ b/egs/iam/v2/local/process_data.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python3 + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora + +""" This script reads the extracted IAM database files and creates + the following files (for the data subset selected via --dataset): + text, utt2spk, images.scp. + + Eg. local/process_data.py data/local data/train data --dataset train + Eg. text file: 000_a01-000u-00 A MOVE to stop Mr. Gaitskell from + utt2spk file: 000_a01-000u-00 000 + images.scp file: 000_a01-000u-00 data/local/lines/a01/a01-000u/a01-000u-00.png +""" + +import argparse +import os +import sys +import xml.dom.minidom as minidom + +parser = argparse.ArgumentParser(description="""Creates text, utt2spk + and images.scp files.""") +parser.add_argument('database_path', type=str, + help='Path to the downloaded (and extracted) IAM data') +parser.add_argument('out_dir', type=str, + help='Where to write output files.') +parser.add_argument('--dataset', type=str, default='train', + choices=['train', 'test','validation'], + help='Subset of data to process.') +args = parser.parse_args() + +text_file = os.path.join(args.out_dir + '/', 'text') +text_fh = open(text_file, 'w') + +utt2spk_file = os.path.join(args.out_dir + '/', 'utt2spk') +utt2spk_fh = open(utt2spk_file, 'w') + +image_file = os.path.join(args.out_dir + '/', 'images.scp') +image_fh = open(image_file, 'w') + +dataset_path = os.path.join(args.database_path, + args.dataset + '.uttlist') + +text_file_path = os.path.join(args.database_path, + 'ascii','lines.txt') +text_dict = {} +def process_text_file_for_word_model(): + with open (text_file_path, 'rt') as in_file: + for line in in_file: + if line[0]=='#': + continue + line = line.strip() + utt_id = line.split(' ')[0] + text_vect = line.split(' ')[8:] + text = "".join(text_vect) + text = text.replace("|", " ") + text_dict[utt_id] = text + +print("Processing '{}' data...".format(args.dataset)) +process_text_file_for_word_model() + +with open(dataset_path) as f: + for line in f: + line = line.strip() + line_vect = line.split('-') + xml_file = line_vect[0] + '-' + line_vect[1] + xml_path = os.path.join(args.database_path, 'xml', xml_file + '.xml') + img_num = line[-3:] + doc = minidom.parse(xml_path) + form_elements = doc.getElementsByTagName('form')[0] + writer_id = form_elements.getAttribute('writer-id') + outerfolder = form_elements.getAttribute('id')[0:3] + innerfolder = form_elements.getAttribute('id') + lines_path = os.path.join(args.database_path, 'lines', + outerfolder, innerfolder, innerfolder) + image_file_path = lines_path + img_num + '.png' + text = text_dict[line] + utt_id = writer_id + '_' + line + text_fh.write(utt_id + ' ' + text + '\n') + utt2spk_fh.write(utt_id + ' ' + writer_id + '\n') + image_fh.write(utt_id + ' ' + image_file_path + '\n') diff --git a/egs/iam/v2/local/remove_test_utterances_from_lob.py b/egs/iam/v2/local/remove_test_utterances_from_lob.py new file mode 100755 index 00000000000..5e5dac52818 --- /dev/null +++ b/egs/iam/v2/local/remove_test_utterances_from_lob.py @@ -0,0 +1,142 @@ +#!/usr/bin/env 
python3 +# Copyright 2018 Ashish Arora + +import argparse +import os +import numpy as np +import sys +import re + +parser = argparse.ArgumentParser(description="""Removes dev/test set lines + from the LOB corpus. Reads the + corpus from stdin, and writes it to stdout.""") +parser.add_argument('dev_text', type=str, + help='dev transcription location.') +parser.add_argument('test_text', type=str, + help='test transcription location.') +args = parser.parse_args() + +def remove_punctuations(transcript): + char_list = [] + for char in transcript: + if char.isdigit() or char == '+' or char == '~' or char == '?': + continue + if char == '#' or char == '=' or char == '-' or char == '!': + continue + if char == ',' or char == '.' or char == ')' or char == '\'': + continue + if char == '(' or char == ':' or char == ';' or char == '"': + continue + if char == '*': + continue + char_list.append(char) + return char_list + + +def remove_special_words(words): + word_list = [] + for word in words: + if word == '' or word == '#': + continue + word_list.append(word) + return word_list + + +# process and add dev/eval transcript in a list +# remove special words, punctuations, spaces between words +# lowercase the characters +def read_utterances(text_file_path): + with open(text_file_path, 'rt') as in_file: + for line in in_file: + words = line.strip().split() + words_wo_sw = remove_special_words(words) + transcript = ''.join(words_wo_sw[1:]) + transcript = transcript.lower() + trans_wo_punct = remove_punctuations(transcript) + transcript = ''.join(trans_wo_punct) + utterance_dict[words_wo_sw[0]] = transcript + + +### main ### + +# read utterances and add it to utterance_dict +utterance_dict = dict() +read_utterances(args.dev_text) +read_utterances(args.test_text) + +# read corpus and add it to below lists +corpus_text_lowercase_wo_sc = list() +corpus_text_wo_sc = list() +original_corpus_text = list() +for line in sys.stdin: + original_corpus_text.append(line) + words = line.strip().split() + words_wo_sw = remove_special_words(words) + + transcript = ''.join(words_wo_sw) + transcript = transcript.lower() + trans_wo_punct = remove_punctuations(transcript) + transcript = ''.join(trans_wo_punct) + corpus_text_lowercase_wo_sc.append(transcript) + + transcript = ''.join(words_wo_sw) + trans_wo_punct = remove_punctuations(transcript) + transcript = ''.join(trans_wo_punct) + corpus_text_wo_sc.append(transcript) + +# find majority of utterances below +# for utterances which were not found +# add them to remaining_utterances +row_to_keep = [True for i in range(len(original_corpus_text))] +remaining_utterances = dict() +for line_id, line_to_find in utterance_dict.items(): + found_line = False + # avoiding very small utterance, it causes removing + # complete lob text + if len(line_to_find) < 10: + remaining_utterances[line_id] = line_to_find + else: + for i in range(1, (len(corpus_text_lowercase_wo_sc) - 2)): + # Combine 3 consecutive lines of the corpus into a single line + prev_words = corpus_text_lowercase_wo_sc[i - 1].strip() + curr_words = corpus_text_lowercase_wo_sc[i].strip() + next_words = corpus_text_lowercase_wo_sc[i + 1].strip() + new_line = prev_words + curr_words + next_words + transcript = ''.join(new_line) + if line_to_find in transcript: + found_line = True + row_to_keep[i-1] = False + row_to_keep[i] = False + row_to_keep[i+1] = False + if not found_line: + remaining_utterances[line_id] = line_to_find + +# removing long utterances not found above +row_to_keep[87530] = False; row_to_keep[87531] = False; 
row_to_keep[87532] = False; +row_to_keep[31724] = False; row_to_keep[31725] = False; row_to_keep[31726] = False; +row_to_keep[16704] = False; row_to_keep[16705] = False; row_to_keep[16706] = False; +row_to_keep[94181] = False; row_to_keep[94182] = False; row_to_keep[94183] = False; +row_to_keep[20171] = False; row_to_keep[20172] = False; row_to_keep[20173] = False; +row_to_keep[16734] = False; row_to_keep[16733] = False; row_to_keep[16732] = False; +row_to_keep[20576] = False; row_to_keep[20577] = False; row_to_keep[20578] = False; +row_to_keep[31715] = False; row_to_keep[31716] = False; row_to_keep[31717] = False; +row_to_keep[31808] = False; row_to_keep[31809] = False; row_to_keep[31810] = False; +row_to_keep[31822] = False; row_to_keep[31823] = False; row_to_keep[31824] = False; +row_to_keep[88791] = False; row_to_keep[88792] = False; row_to_keep[88793] = False; +row_to_keep[31745] = False; row_to_keep[31746] = False; row_to_keep[31825] = False; +row_to_keep[94256] = False; row_to_keep[94257] = False; row_to_keep[88794] = False; +row_to_keep[88665] = False; row_to_keep[17093] = False; row_to_keep[17094] = False; +row_to_keep[20586] = False; row_to_keep[87228] = False; row_to_keep[87229] = False; +row_to_keep[16744] = False; row_to_keep[87905] = False; row_to_keep[87906] = False; +row_to_keep[16669] = False; row_to_keep[16670] = False; row_to_keep[16719] = False; +row_to_keep[87515] = False; row_to_keep[20090] = False; row_to_keep[31748] = False; +for i in range(len(original_corpus_text)): + transcript = original_corpus_text[i].strip() + if row_to_keep[i]: + print(transcript) + +print('Sentences not removed from LOB: {}'.format(remaining_utterances), file=sys.stderr) +print('Total test+dev sentences: {}'.format(len(utterance_dict)), file=sys.stderr) +print('Number of sentences not removed from LOB: {}'. 
format(len(remaining_utterances)), file=sys.stderr) +print('LOB lines: Before: {} After: {}'.format(len(original_corpus_text), + row_to_keep.count(True)), file=sys.stderr) diff --git a/egs/iam/v2/local/remove_wellington_annotations.py b/egs/iam/v2/local/remove_wellington_annotations.py new file mode 100755 index 00000000000..260a3542985 --- /dev/null +++ b/egs/iam/v2/local/remove_wellington_annotations.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python3 +# Copyright 2018 Chun-Chieh Chang + +import sys +import io +import re +from collections import OrderedDict + +sys.stdin = io.TextIOWrapper(sys.stdin.buffer, encoding="utf8"); +sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf8"); + +prev2_line = " "; +prev_line = " "; +for line in sys.stdin: + line = line.strip() + pattern = re.compile("\\*\\*\\[.*?\\*\\*\\]|\\*[0-9]|\\\\[0-9]{0,2}|\\*\\*?[\|,\?,\#,\=,\;,\:,\<,\>]|\||\^") + line_fixed = pattern.sub("", line) + dict=OrderedDict([("*+$","$"), ("*+","£"), ("*-","-"), ("*/","*"), ("*{","{"), ("*}","}"), + ("**\"","\""), ("*\"","\""), ("**'","'"), ("*'","'"), ("*@","°")]) + pattern = re.compile("|".join(re.escape(key) for key in dict.keys())); + line_fixed = pattern.sub(lambda x: dict[x.group()], line_fixed) + + line_fixed = prev2_line + "\n" + prev_line + "\n" + line_fixed + + pattern = re.compile("\{[0-9]{0,2}(.*?)\}", re.DOTALL) + line_fixed = pattern.sub(lambda x: x.group(1), line_fixed) + + output, prev2_line, prev_line = line_fixed.split("\n") + + sys.stdout.write(output + "\n") +sys.stdout.write(prev2_line + "\n") +sys.stdout.write(prev_line + "\n") diff --git a/egs/iam/v2/local/score.sh b/egs/iam/v2/local/score.sh new file mode 100755 index 00000000000..1d84815fc69 --- /dev/null +++ b/egs/iam/v2/local/score.sh @@ -0,0 +1,6 @@ + +#!/bin/bash + + +steps/scoring/score_kaldi_wer.sh "$@" +steps/scoring/score_kaldi_cer.sh --stage 2 "$@" diff --git a/egs/iam/v2/local/train_lm.sh b/egs/iam/v2/local/train_lm.sh new file mode 100755 index 00000000000..cc0119eb748 --- /dev/null +++ b/egs/iam/v2/local/train_lm.sh @@ -0,0 +1,156 @@ +#!/bin/bash + +# Copyright 2016 Vincent Nguyen +# 2016 Johns Hopkins University (author: Daniel Povey) +# 2017 Ashish Arora +# 2017 Hossein Hadian +# Apache 2.0 +# +# This script trains an LM on the LOB+Brown text data and IAM training transcriptions. +# It is based on the example scripts distributed with PocoLM + +# It will check if pocolm is installed and if not will proceed with installation + +set -e +stage=0 +vocab_size=50000 + +echo "$0 $@" # Print the command line for logging +. ./utils/parse_options.sh || exit 1; + +dir=data/local/local_lm +lm_dir=${dir}/data + + +mkdir -p $dir +. ./path.sh || exit 1; # for KALDI_ROOT +export PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH +( # First make sure the pocolm toolkit is installed. + cd $KALDI_ROOT/tools || exit 1; + if [ -d pocolm ]; then + echo Not installing the pocolm toolkit since it is already there. + else + echo "$0: Please install the PocoLM toolkit with: " + echo " cd ../../../tools; extras/install_pocolm.sh; cd -" + exit 1; + fi +) || exit 1; + +bypass_metaparam_optim_opt= +# If you want to bypass the metaparameter optimization steps with specific metaparameters +# un-comment the following line, and change the numbers to some appropriate values. +# You can find the values from output log of train_lm.py. +# These example numbers of metaparameters is for 4-gram model (with min-counts) +# running with train_lm.py. +# The dev perplexity should be close to the non-bypassed model. 
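For reference, the bypass line uses pocolm's --bypass-metaparameter-optimization option; the numbers below are placeholders only (copy the real ones from the metaparameter line in your own train_lm.py log), which is why the line is left commented out:

# bypass_metaparam_optim_opt="--bypass-metaparameter-optimization=0.854,0.144,0.599,0.542,0.172,0.094,0.612,0.362,0.184,0.089"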
+#bypass_metaparam_optim_opt= +# Note: to use these example parameters, you may need to remove the .done files +# to make sure the make_lm_dir.py be called and tain only 3-gram model +#for order in 3; do +#rm -f ${lm_dir}/${num_word}_${order}.pocolm/.done + +if [ $stage -le 0 ]; then + mkdir -p ${dir}/data + mkdir -p ${dir}/data/text + + echo "$0: Getting the Data sources" + + rm ${dir}/data/text/* 2>/dev/null || true + + # Using LOB and brown corpus. + if [ ! -f data/local/lob-train-only.txt ]; then + cat data/local/lobcorpus/0167/download/LOB_COCOA/lob.txt | \ + local/remove_test_utterances_from_lob.py data/test/text.old data/val/text.old \ + > data/local/lob-train-only.txt + fi + cat data/local/lob-train-only.txt | \ + utils/lang/bpe/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ + | sed 's/@@//g' > ${dir}/data/text/lob.txt + cat data/local/browncorpus/brown.txt | \ + utils/lang/bpe/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ + | sed 's/@@//g' > ${dir}/brown.txt + tail -n +5000 ${dir}/brown.txt > ${dir}/data/text/brown.txt + if [ -d "data/local/wellingtoncorpus" ]; then + cat data/local/wellingtoncorpus/Wellington_annotation_removed.txt | \ + utils/lang/bpe/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ + | sed 's/@@//g' > ${dir}/data/text/wellington.txt + fi + + # use the validation data as the dev set. + # Note: the name 'dev' is treated specially by pocolm, it automatically + # becomes the dev set. + head -5000 ${dir}/brown.txt > ${dir}/data/text/dev.txt + + # use the training data as an additional data source. + # we can later fold the dev data into this. + cat data/train/text | cut -d " " -f 2- > ${dir}/data/text/iam.txt + + # for reporting perplexities, we'll use the "real" dev set. + # (the validation data is used as ${dir}/data/text/dev.txt to work + # out interpolation weights.) + # note, we can't put it in ${dir}/data/text/, because then pocolm would use + # it as one of the data sources. + cut -d " " -f 2- < data/test/text > ${dir}/data/real_dev_set.txt + + # get the wordlist from IAM text + if [ -d "data/local/wellingtoncorpus" ]; then + cat ${dir}/data/text/{iam,lob,brown,wellington}.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count + else + echo "$0: Wellington Corpus not found. Proceeding without using that corpus." + cat ${dir}/data/text/{iam,lob,brown}.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count + fi + head -n $vocab_size ${dir}/data/word_count | awk '{print $2}' > ${dir}/data/wordlist +fi + +order=6 + +if [ $stage -le 1 ]; then + # decide on the vocabulary. + # Note: you'd use --wordlist if you had a previously determined word-list + # that you wanted to use. 
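A quick, optional sanity check (a sketch only, not part of the recipe): estimate the dev OOV rate of the wordlist chosen in stage 0 before training the LM:

awk 'NR==FNR {vocab[$1]=1; next}
     {for (i=1; i<=NF; i++) {total++; if (!($i in vocab)) oov++}}
     END {printf("OOV tokens: %d/%d (%.2f%%)\n", oov, total, 100*oov/total)}' \
  data/local/local_lm/data/wordlist data/local/local_lm/data/text/dev.txt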
+ # Note: if you have more than one order, use a certain amount of words as the + # vocab and want to restrict max memory for 'sort', + echo "$0: training the unpruned LM" + min_counts='brown=2 lob=2 iam=1' + wordlist=${dir}/data/wordlist + + lm_name="`basename ${wordlist}`_${order}" + if [ -n "${min_counts}" ]; then + lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`" + fi + unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm + + train_lm.py --wordlist=${wordlist} --num-splits=10 --warm-start-ratio=20 \ + --limit-unk-history=true \ + ${bypass_metaparam_optim_opt} \ + ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir} + + mkdir -p ${dir}/data/arpa + format_arpa_lm.py ${unpruned_lm_dir} | gzip -c > ${dir}/data/arpa/${order}gram_unpruned.arpa.gz + + get_data_prob.py ${dir}/data/real_dev_set.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity' +fi + +if [ $stage -le 2 ]; then + echo "$0: pruning the LM (to larger size)" + # Using 1 million n-grams for a big LM for rescoring purposes. + size=1000000 + prune_lm_dir.py --target-num-ngrams=$size --initial-threshold=0.02 ${unpruned_lm_dir} ${dir}/data/lm_${order}_prune_big + + get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_big 2>&1 | grep -F '[perplexity' + + mkdir -p ${dir}/data/arpa + format_arpa_lm.py ${dir}/data/lm_${order}_prune_big | gzip -c > ${dir}/data/arpa/${order}gram_big.arpa.gz +fi + +if [ $stage -le 3 ]; then + echo "$0: pruning the LM (to smaller size)" + # Using 500,000 n-grams for a smaller LM for graph building. Prune from the + # bigger-pruned LM, it'll be faster. + size=500000 + prune_lm_dir.py --target-num-ngrams=$size ${dir}/data/lm_${order}_prune_big ${dir}/data/lm_${order}_prune_small + + get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_small 2>&1 | grep -F '[perplexity' + + format_arpa_lm.py ${dir}/data/lm_${order}_prune_small | gzip -c > ${dir}/data/arpa/${order}gram_small.arpa.gz +fi diff --git a/egs/iam/v2/local/wer_output_filter b/egs/iam/v2/local/wer_output_filter new file mode 100755 index 00000000000..24691a160a9 --- /dev/null +++ b/egs/iam/v2/local/wer_output_filter @@ -0,0 +1,31 @@ +#!/usr/bin/env python3 +# Copyright 2017 Hossein Hadian + +# This is a filter used in scoring. It separates all +# punctuations from words. For e.g. this sentence: + +# "They have come!" he said reverently, gripping his +# hands. "Isn't it a glorious thing! Long awaited." + +# is converted to this: + +# " They have come ! " he said reverently , gripping his +# hands . " Isn ' t it a glorious thing ! Long awaited . " + +# Sample BPE-based output: +# |He |ro se |from |his |b re ak f as t - s ch oo l |b en ch + +import sys +import re + +punctuations = "!(),.?;:'-\"" +escaped_punctuations = re.escape(punctuations) + +for line in sys.stdin: + words = line.strip().split() + uttid = words[0] + transcript = ''.join(words[1:]) + transcript = transcript.replace('|', ' ') + split_transcript = " ".join(re.split("([{}])".format(escaped_punctuations), + transcript)).strip() + print("{} {}".format(uttid, split_transcript)) diff --git a/egs/iam/v2/path.sh b/egs/iam/v2/path.sh new file mode 100755 index 00000000000..7e458144624 --- /dev/null +++ b/egs/iam/v2/path.sh @@ -0,0 +1,9 @@ +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! 
-f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh + +export LD_LIBRARY_PATH=$KALDI_ROOT/tools/openfst/lib:$LD_LIBRARY_PATH +export LD_LIBRARY_PATH=/home/dpovey/libs:$LD_LIBRARY_PATH +export LC_ALL=C diff --git a/egs/iam/v2/run_end2end.sh b/egs/iam/v2/run_end2end.sh new file mode 100755 index 00000000000..c515c85fc72 --- /dev/null +++ b/egs/iam/v2/run_end2end.sh @@ -0,0 +1,146 @@ +#!/bin/bash +# Copyright 2017 Hossein Hadian + +set -e +stage=0 +nj=20 +username= +password= +process_aachen_split=false +overwrite=false +# iam_database points to the database path on the JHU grid. If you have not +# already downloaded the database you can set it to a local directory +# like "data/download" and follow the instructions +# in "local/prepare_data.sh" to download the database: +iam_database=/export/corpora5/handwriting_ocr/IAM +# wellington_database points to the database path on the JHU grid. The Wellington +# corpus contains two directories WWC and WSC (Wellington Written and Spoken Corpus). +# This corpus is of written NZ English that can be purchased here: +# "https://www.victoria.ac.nz/lals/resources/corpora-default" +wellington_database=/export/corpora5/Wellington/WWC/ + +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. +. ./path.sh +. ./utils/parse_options.sh # e.g. this parses the above options + # if supplied. + + +./local/check_tools.sh + +if [ $stage -le 0 ]; then + + if [ -f data/train/text ] && ! $overwrite; then + echo "$0: Not processing, probably script have run from wrong stage" + echo "Exiting with status 1 to avoid data corruption" + exit 1; + fi + + echo "$0: Preparing data..." + local/prepare_data.sh --download-dir "$iam_database" \ + --wellington-dir "$wellington_database" \ + --username "$username" --password "$password" \ + --process_aachen_split $process_aachen_split +fi + +mkdir -p data/{train,test}/data +if [ $stage -le 1 ]; then + echo "$(date) stage 1: getting allowed image widths for e2e training..." + image/get_image2num_frames.py --feat-dim 40 data/train # This will be needed for the next command + # The next command creates a "allowed_lengths.txt" file in data/train + # which will be used by local/make_features.py to enforce the images to + # have allowed lengths. The allowed lengths will be spaced by 10% difference in length. + image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train + echo "$(date) Extracting features, creating feats.scp file" + local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim 40 data/train + steps/compute_cmvn_stats.sh data/train || exit 1; + for set in val test; do + local/extract_features.sh --nj $nj --cmd "$cmd" --augment true \ + --feat-dim 40 data/${set} + steps/compute_cmvn_stats.sh data/${set} || exit 1; + done + utils/fix_data_dir.sh data/train +fi + +if [ $stage -le 2 ]; then + for set in train; do + echo "$(date) stage 2: Performing augmentation, it will double training data" + local/augment_data.sh --nj $nj --cmd "$cmd" --feat-dim 40 data/${set} data/${set}_aug data + steps/compute_cmvn_stats.sh data/${set}_aug || exit 1; + done +fi + +if [ $stage -le 3 ]; then + echo "$0: Preparing BPE..." + # getting non-silence phones. 
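The inline python3 block that follows simply collects the set of unique characters in the training transcripts. A rough shell-only equivalent is sketched below for clarity; note that fold is byte-based, so the Python version is the safe choice for multi-byte UTF-8 text:

cut -d' ' -f2- data/train/text | tr -d ' ' | fold -w1 | sort -u | head   # inspect the character set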
+ cut -d' ' -f2- data/train/text | \ +python3 <( +cat << "END" +import os, sys, io; +infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8'); +output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8'); +phone_dict = dict(); +for line in infile: + line_vect = line.strip().split(); + for word in line_vect: + for phone in word: + phone_dict[phone] = phone; +for phone in phone_dict.keys(): + output.write(phone+ '\n'); +END + ) > data/local/phones.txt + + cut -d' ' -f2- data/train/text > data/local/train_data.txt + cat data/local/phones.txt data/local/train_data.txt | \ + utils/lang/bpe/prepend_words.py | \ + utils/lang/bpe/learn_bpe.py -s 700 > data/local/bpe.txt + for set in test train val train_aug; do + cut -d' ' -f1 data/$set/text > data/$set/ids + cut -d' ' -f2- data/$set/text | \ + utils/lang/bpe/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ + | sed 's/@@//g' > data/$set/bpe_text + mv data/$set/text data/$set/text.old + paste -d' ' data/$set/ids data/$set/bpe_text > data/$set/text + done +fi + +if [ $stage -le 4 ]; then + echo "$0: Estimating a language model for decoding..." + local/train_lm.sh +fi + +if [ $stage -le 5 ]; then + echo "$0: Preparing dictionary and lang..." + local/prepare_dict.sh + # This recipe uses byte-pair encoding, the silences are part of the words' pronunciations. + # So we set --sil-prob to 0.0 + utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.0 --position-dependent-phones false \ + data/local/dict "" data/lang/temp data/lang + silphonelist=`cat data/lang/phones/silence.csl` + nonsilphonelist=`cat data/lang/phones/nonsilence.csl` + local/gen_topo.py 8 4 4 $nonsilphonelist $silphonelist data/lang/phones.txt >data/lang/topo + utils/lang/bpe/add_final_optional_silence.sh --final-sil-prob 0.5 data/lang + + utils/format_lm.sh data/lang data/local/local_lm/data/arpa/6gram_big.arpa.gz \ + data/local/dict/lexicon.txt data/lang + utils/build_const_arpa_lm.sh data/local/local_lm/data/arpa/6gram_unpruned.arpa.gz \ + data/lang data/lang_rescore_6g +fi + +if [ $stage -le 6 ]; then + echo "$0: Calling the flat-start chain recipe..." + local/chain/run_e2e_cnn.sh --train_set train_aug +fi + +if [ $stage -le 7 ]; then + echo "$0: Aligning the training data using the e2e chain model..." + steps/nnet3/align.sh --nj 50 --cmd "$cmd" \ + --use-gpu false \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0 --acoustic-scale=1.0' \ + data/train_aug data/lang exp/chain/e2e_cnn_1b exp/chain/e2e_ali_train +fi + +if [ $stage -le 8 ]; then + echo "$0: Building a tree and training a regular chain model using the e2e alignments..." 
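Once this final stage has finished, the results can be summarized with something like the sketch below (the decode directory name depends on the affix used inside run_cnn_e2eali.sh and is hypothetical here):

for d in exp/chain/cnn_e2eali_1b/decode_test*; do grep WER $d/wer_* | utils/best_wer.sh; done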
+ local/chain/run_cnn_e2eali.sh --train_set train_aug +fi diff --git a/egs/iam/v2/steps b/egs/iam/v2/steps new file mode 120000 index 00000000000..6e99bf5b5ad --- /dev/null +++ b/egs/iam/v2/steps @@ -0,0 +1 @@ +../../wsj/s5/steps \ No newline at end of file diff --git a/egs/iam/v2/utils b/egs/iam/v2/utils new file mode 120000 index 00000000000..b240885218f --- /dev/null +++ b/egs/iam/v2/utils @@ -0,0 +1 @@ +../../wsj/s5/utils \ No newline at end of file diff --git a/egs/iban/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/iban/s5/local/chain/tuning/run_tdnn_1a.sh index d320f49d3aa..10650a18269 100755 --- a/egs/iban/s5/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/iban/s5/local/chain/tuning/run_tdnn_1a.sh @@ -136,7 +136,7 @@ if [ $stage -le 12 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) opts="l2-regularize=0.08 dropout-per-dim-continuous=true" output_opts="l2-regularize=0.02 bottleneck-dim=256" diff --git a/egs/iban/s5/local/chain/tuning/run_tdnn_1b.sh b/egs/iban/s5/local/chain/tuning/run_tdnn_1b.sh index 56f5255288c..db62e6f8a55 100755 --- a/egs/iban/s5/local/chain/tuning/run_tdnn_1b.sh +++ b/egs/iban/s5/local/chain/tuning/run_tdnn_1b.sh @@ -136,7 +136,7 @@ if [ $stage -le 12 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) opts="l2-regularize=0.08 dropout-per-dim=true dropout-per-dim-continuous=true" linear_opts="orthonormal-constraint=-1.0" output_opts="l2-regularize=0.04" diff --git a/egs/iban/s5/run.sh b/egs/iban/s5/run.sh index 991d32505bf..278a8177c0e 100755 --- a/egs/iban/s5/run.sh +++ b/egs/iban/s5/run.sh @@ -68,7 +68,7 @@ if [ $stage -le 4 ]; then echo "Starting triphone training." steps/align_si.sh --nj $nj --cmd "$train_cmd" \ data/train data/lang exp/mono exp/mono_ali - steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" \ + steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" \ 3200 30000 data/train data/lang exp/mono_ali exp/tri1 echo "Triphone training done." @@ -78,7 +78,7 @@ if [ $stage -le 4 ]; then steps/decode.sh --nj $dev_nj --cmd "$decode_cmd" \ exp/tri1/graph data/dev exp/tri1/decode_dev - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ data/lang_test/ data/lang_big/ data/dev \ exp/tri1/decode_dev exp/tri1/decode_dev.rescored echo "Triphone decoding done." @@ -89,7 +89,7 @@ if [ $stage -le 5 ]; then ## Triphones + delta delta # Training echo "Starting (larger) triphone training." - steps/align_si.sh --nj $nj --cmd "$train_cmd" --use-graphs true \ + steps/align_si.sh --nj $nj --cmd "$train_cmd" --use-graphs true \ data/train data/lang exp/tri1 exp/tri1_ali steps/train_deltas.sh --cmd "$train_cmd" \ 4200 40000 data/train data/lang exp/tri1_ali exp/tri2a @@ -97,11 +97,11 @@ if [ $stage -le 5 ]; then ( echo "Decoding the dev set using triphone(large) models." 
- utils/mkgraph.sh data/lang_test exp/tri2a exp/tri2a/graph + utils/mkgraph.sh data/lang_test exp/tri2a exp/tri2a/graph steps/decode.sh --nj $dev_nj --cmd "$decode_cmd" \ - exp/tri2a/graph data/dev exp/tri2a/decode_dev + exp/tri2a/graph data/dev exp/tri2a/decode_dev - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ data/lang_test/ data/lang_big/ data/dev \ exp/tri2a/decode_dev exp/tri2a/decode_dev.rescored echo "Triphone(large) decoding done." @@ -112,21 +112,21 @@ if [ $stage -le 6 ]; then ### Triphone + LDA and MLLT # Training echo "Starting LDA+MLLT training." - steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ data/train data/lang exp/tri2a exp/tri2a_ali - steps/train_lda_mllt.sh --cmd "$train_cmd" \ + steps/train_lda_mllt.sh --cmd "$train_cmd" \ --splice-opts "--left-context=3 --right-context=3" \ - 4200 40000 data/train data/lang exp/tri2a_ali exp/tri2b + 4200 40000 data/train data/lang exp/tri2a_ali exp/tri2b echo "LDA+MLLT training done." ( echo "Decoding the dev set using LDA+MLLT models." - utils/mkgraph.sh data/lang_test exp/tri2b exp/tri2b/graph - steps/decode.sh --nj $dev_nj --cmd "$decode_cmd" \ - exp/tri2b/graph data/dev exp/tri2b/decode_dev + utils/mkgraph.sh data/lang_test exp/tri2b exp/tri2b/graph + steps/decode.sh --nj $dev_nj --cmd "$decode_cmd" \ + exp/tri2b/graph data/dev exp/tri2b/decode_dev - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ data/lang_test/ data/lang_big/ data/dev \ exp/tri2b/decode_dev exp/tri2b/decode_dev.rescored echo "LDA+MLLT decoding done." @@ -138,7 +138,7 @@ if [ $stage -le 7 ]; then ### Triphone + LDA and MLLT + SAT and FMLLR # Training echo "Starting SAT+FMLLR training." - steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ --use-graphs true data/train data/lang exp/tri2b exp/tri2b_ali steps/train_sat.sh --cmd "$train_cmd" 4200 40000 \ data/train data/lang exp/tri2b_ali exp/tri3b @@ -150,7 +150,7 @@ if [ $stage -le 7 ]; then steps/decode_fmllr.sh --nj $dev_nj --cmd "$decode_cmd" \ exp/tri3b/graph data/dev exp/tri3b/decode_dev - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ data/lang_test/ data/lang_big/ data/dev \ exp/tri3b/decode_dev exp/tri3b/decode_dev.rescored echo "SAT+FMLLR decoding done." @@ -163,10 +163,10 @@ if [ $stage -le 8 ]; then steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ data/train data/lang exp/tri3b exp/tri3b_ali - steps/train_ubm.sh --cmd "$train_cmd" \ + steps/train_ubm.sh --cmd "$train_cmd" \ 600 data/train data/lang exp/tri3b_ali exp/ubm5b2 - steps/train_sgmm2.sh --cmd "$train_cmd" \ + steps/train_sgmm2.sh --cmd "$train_cmd" \ 5200 12000 data/train data/lang exp/tri3b_ali exp/ubm5b2/final.ubm exp/sgmm2_5b2 echo "SGMM training done." 
@@ -180,7 +180,7 @@ if [ $stage -le 8 ]; then --transform-dir exp/tri3b/decode_dev \ exp/sgmm2_5b2/graph data/dev exp/sgmm2_5b2/decode_dev - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ data/lang_test/ data/lang_big/ data/dev \ exp/sgmm2_5b2/decode_dev exp/sgmm2_5b2/decode_dev.rescored diff --git a/egs/ifnenit/v1/README.txt b/egs/ifnenit/README.txt similarity index 100% rename from egs/ifnenit/v1/README.txt rename to egs/ifnenit/README.txt diff --git a/egs/ifnenit/v1/local/chain/run_cnn_1a.sh b/egs/ifnenit/v1/local/chain/run_cnn_1a.sh index b0e147d157b..b0ecd547741 100755 --- a/egs/ifnenit/v1/local/chain/run_cnn_1a.sh +++ b/egs/ifnenit/v1/local/chain/run_cnn_1a.sh @@ -123,7 +123,7 @@ if [ $stage -le 4 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) common1="required-time-offsets=0 height-offsets=-2,-1,0,1,2 num-filters-out=36" common2="required-time-offsets=0 height-offsets=-2,-1,0,1,2 num-filters-out=70" mkdir -p $dir/configs diff --git a/egs/ifnenit/v1/local/chain/run_cnn_chainali_1a.sh b/egs/ifnenit/v1/local/chain/run_cnn_chainali_1a.sh index b1f33b41a0c..7f3132d657e 100755 --- a/egs/ifnenit/v1/local/chain/run_cnn_chainali_1a.sh +++ b/egs/ifnenit/v1/local/chain/run_cnn_chainali_1a.sh @@ -128,7 +128,7 @@ if [ $stage -le 4 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) common1="required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" common2="required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" common3="required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" diff --git a/egs/ifnenit/v1/local/make_features.py b/egs/ifnenit/v1/local/make_features.py index 3a485e32eb1..87afa37c00a 100755 --- a/egs/ifnenit/v1/local/make_features.py +++ b/egs/ifnenit/v1/local/make_features.py @@ -10,7 +10,7 @@ eg. 
local/make_features.py data/train --feat-dim 40 """ - +from __future__ import division import argparse import os @@ -24,8 +24,8 @@ signal(SIGPIPE,SIG_DFL) parser = argparse.ArgumentParser(description="""Generates and saves the feature vectors""") -parser.add_argument('dir', type=str, help='directory of images.scp and is also output directory') -parser.add_argument('--out-ark', type=str, default='-', help='where to write the output feature file') +parser.add_argument('dir', help='directory of images.scp and is also output directory') +parser.add_argument('--out-ark', default='-', help='where to write the output feature file') parser.add_argument('--feat-dim', type=int, default=40, help='size to scale the height of all images') parser.add_argument('--padding', type=int, default=5, help='size to scale the height of all images') args = parser.parse_args() @@ -42,7 +42,7 @@ def write_kaldi_matrix(file_handle, matrix, key): if num_cols != len(matrix[row_index]): raise Exception("All the rows of a matrix are expected to " "have the same length") - file_handle.write(" ".join(map(lambda x: str(x), matrix[row_index]))) + file_handle.write(" ".join([str(x) for x in matrix[row_index]])) if row_index != num_rows - 1: file_handle.write("\n") file_handle.write(" ]\n") @@ -51,7 +51,7 @@ def get_scaled_image(im): scale_size = args.feat_dim sx = im.shape[1] sy = im.shape[0] - scale = (1.0 * scale_size) / sy + scale = (1.0 * scale_size)/ sy nx = int(scale_size) ny = int(scale * sx) im = misc.imresize(im, (nx, ny)) diff --git a/egs/librispeech/s5/RESULTS b/egs/librispeech/s5/RESULTS index 32b39b2c634..dbf54b9384d 100644 --- a/egs/librispeech/s5/RESULTS +++ b/egs/librispeech/s5/RESULTS @@ -1,6 +1,6 @@ # In the results below, "tgsmall" is the pruned 3-gram LM, which is used for lattice generation. 
# The following language models are then used for rescoring: -# a) tgmed- slightly less pruned 3-gram LM +# a) tgmed- slightly less pruned 3-gram LM # b) tglarge- the full, non-pruned 3-gram LM # c) fglarge- non-pruned 4-gram LM # @@ -8,7 +8,7 @@ # whereas "dev-other" and "test-other" sets contain more challenging speech ### SAT GMM model trained on the "train-clean-100" set (100 hours "clean" speech) -### for test in dev_clean test_clean dev_other test_other; do for lm in fglarge tglarge tgmed tgsmall; do grep WER exp/tri4b/decode_${lm}_${test}/wer* | best_wer.sh; done; echo; done +### for test in dev_clean test_clean dev_other test_other; do for lm in fglarge tglarge tgmed tgsmall; do grep WER exp/tri4b/decode_${lm}_${test}/wer* | utils/best_wer.sh; done; echo; done %WER 8.20 [ 4459 / 54402, 695 ins, 427 del, 3337 sub ] exp/tri4b/decode_fglarge_dev_clean/wer_14_0.5 %WER 8.60 [ 4677 / 54402, 763 ins, 399 del, 3515 sub ] exp/tri4b/decode_tglarge_dev_clean/wer_16_0.0 %WER 10.39 [ 5655 / 54402, 711 ins, 648 del, 4296 sub ] exp/tri4b/decode_tgmed_dev_clean/wer_16_0.0 @@ -31,7 +31,7 @@ ### SAT GMM model trained on the combined "train-clean-100" + "train-clean-360" set (460 hours "clean" speech) -### for test in dev_clean test_clean dev_other test_other; do for lm in fglarge tglarge tgmed tgsmall; do grep WER exp/tri5b/decode_${lm}_${test}/wer* | best_wer.sh; done; echo; done +### for test in dev_clean test_clean dev_other test_other; do for lm in fglarge tglarge tgmed tgsmall; do grep WER exp/tri5b/decode_${lm}_${test}/wer* | utils/best_wer.sh; done; echo; done %WER 7.05 [ 3835 / 54402, 588 ins, 370 del, 2877 sub ] exp/tri5b/decode_fglarge_dev_clean/wer_15_0.5 %WER 7.49 [ 4077 / 54402, 623 ins, 376 del, 3078 sub ] exp/tri5b/decode_tglarge_dev_clean/wer_14_0.5 %WER 9.38 [ 5104 / 54402, 701 ins, 533 del, 3870 sub ] exp/tri5b/decode_tgmed_dev_clean/wer_15_0.0 @@ -54,7 +54,7 @@ ### SAT GMM model trained on the combined "train-clean-100" + "train-clean-360" + "train-other-500" set (960 hours) -### for test in dev_clean test_clean dev_other test_other; do for lm in fglarge tglarge tgmed tgsmall; do grep WER exp/tri6b/decode_${lm}_${test}/wer* | best_wer.sh; done; echo; done +### for test in dev_clean test_clean dev_other test_other; do for lm in fglarge tglarge tgmed tgsmall; do grep WER exp/tri6b/decode_${lm}_${test}/wer* | utils/best_wer.sh; done; echo; done %WER 7.02 [ 3819 / 54402, 516 ins, 424 del, 2879 sub ] exp/tri6b/decode_fglarge_dev_clean/wer_14_1.0 %WER 7.33 [ 3988 / 54402, 506 ins, 468 del, 3014 sub ] exp/tri6b/decode_tglarge_dev_clean/wer_15_1.0 %WER 9.23 [ 5024 / 54402, 744 ins, 481 del, 3799 sub ] exp/tri6b/decode_tgmed_dev_clean/wer_13_0.0 @@ -337,7 +337,7 @@ %WER 4.39 [ 2387 / 54402, 377 ins, 199 del, 1811 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch3_dev_clean_tglarge/wer_14 %WER 5.36 [ 2918 / 54402, 328 ins, 338 del, 2252 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch3_dev_clean_tgmed/wer_17 %WER 6.08 [ 3305 / 54402, 369 ins, 396 del, 2540 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch3_dev_clean_tgsmall/wer_15 -%WER 4.40 [ 2395 / 54402, 375 ins, 200 del, 1820 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch4_dev_clean_tglarge/wer_14 +%WER 4.40 [ 2395 / 54402, 375 ins, 200 del, 1820 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch4_dev_clean_tglarge/wer_14 %WER 5.35 [ 2909 / 54402, 328 ins, 339 del, 2242 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch4_dev_clean_tgmed/wer_17 %WER 6.05 [ 3291 / 54402, 384 
ins, 381 del, 2526 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch4_dev_clean_tgsmall/wer_14 %WER 13.45 [ 6850 / 50948, 808 ins, 876 del, 5166 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch0_dev_other_tglarge/wer_15 @@ -423,7 +423,7 @@ %WER 17.64 [ 9231 / 52343, 764 ins, 1662 del, 6805 sub ] exp/nnet2_online/nnet_ms_a_online/decode_pp_test_other_tgsmall_utt_offline/wer_14 # Results with nnet3 tdnn -# local/nnet3/run_tdnn.sh +# local/nnet3/run_tdnn.sh (with old configs, now moved to local/nnet3/tuning/run_tdnn_1a.sh) # (4 epoch training on speed-perturbed data) # num_params=19.3M %WER 4.43 [ 2410 / 54402, 306 ins, 278 del, 1826 sub ] exp/nnet3/tdnn_sp/decode_dev_clean_fglarge/wer_13_1.0 @@ -444,7 +444,7 @@ %WER 16.29 [ 8528 / 52343, 828 ins, 1320 del, 6380 sub ] exp/nnet3/tdnn_sp/decode_test_other_tgsmall/wer_14_0.0 # Results with nnet3 tdnn -# local/nnet3/run_tdnn.sh +# local/nnet3/run_tdnn.sh (with old configs, now moved to local/nnet3/tuning/run_tdnn_1a.sh) # (4 epoch training on speed-perturbed and volumn-perturbed "cleaned" data) # num_params=19.3M, average training time=68.8s per job(on Tesla K80), real-time factor=1.23161 # for x in exp/nnet3_cleaned/tdnn_sp/decode_*; do grep WER $x/wer_* | utils/best_wer.sh ; done @@ -465,6 +465,24 @@ %WER 14.78 [ 7737 / 52343, 807 ins, 1115 del, 5815 sub ] exp/nnet3_cleaned/tdnn_sp/decode_test_other_tgmed/wer_15_0.0 %WER 16.28 [ 8521 / 52343, 843 ins, 1258 del, 6420 sub ] exp/nnet3_cleaned/tdnn_sp/decode_test_other_tgsmall/wer_14_0.0 +# Results with nnet3 tdnn with new configs, a.k.a. xconfig +# local/nnet3/run_tdnn.sh (linked to local/nnet3/tuning/run_tdnn_1b.sh) +%WER 4.60 [ 2502 / 54402, 324 ins, 286 del, 1892 sub ] exp/nnet3_cleaned/tdnn_sp/decode_dev_clean_fglarge/wer_13_1.0 +%WER 4.80 [ 2612 / 54402, 350 ins, 285 del, 1977 sub ] exp/nnet3_cleaned/tdnn_sp/decode_dev_clean_tglarge/wer_11_1.0 +%WER 5.97 [ 3248 / 54402, 460 ins, 310 del, 2478 sub ] exp/nnet3_cleaned/tdnn_sp/decode_dev_clean_tgmed/wer_11_0.0 +%WER 6.66 [ 3625 / 54402, 479 ins, 392 del, 2754 sub ] exp/nnet3_cleaned/tdnn_sp/decode_dev_clean_tgsmall/wer_11_0.0 +%WER 12.29 [ 6262 / 50948, 863 ins, 665 del, 4734 sub ] exp/nnet3_cleaned/tdnn_sp/decode_dev_other_fglarge/wer_15_0.0 +%WER 12.89 [ 6565 / 50948, 773 ins, 853 del, 4939 sub ] exp/nnet3_cleaned/tdnn_sp/decode_dev_other_tglarge/wer_14_0.5 +%WER 15.41 [ 7849 / 50948, 894 ins, 1083 del, 5872 sub ] exp/nnet3_cleaned/tdnn_sp/decode_dev_other_tgmed/wer_15_0.0 +%WER 16.81 [ 8562 / 50948, 896 ins, 1215 del, 6451 sub ] exp/nnet3_cleaned/tdnn_sp/decode_dev_other_tgsmall/wer_14_0.0 +%WER 4.99 [ 2624 / 52576, 393 ins, 253 del, 1978 sub ] exp/nnet3_cleaned/tdnn_sp/decode_test_clean_fglarge/wer_13_0.5 +%WER 5.16 [ 2715 / 52576, 359 ins, 319 del, 2037 sub ] exp/nnet3_cleaned/tdnn_sp/decode_test_clean_tglarge/wer_12_1.0 +%WER 6.29 [ 3307 / 52576, 471 ins, 341 del, 2495 sub ] exp/nnet3_cleaned/tdnn_sp/decode_test_clean_tgmed/wer_12_0.0 +%WER 7.13 [ 3750 / 52576, 473 ins, 452 del, 2825 sub ] exp/nnet3_cleaned/tdnn_sp/decode_test_clean_tgsmall/wer_13_0.0 +%WER 12.73 [ 6665 / 52343, 894 ins, 711 del, 5060 sub ] exp/nnet3_cleaned/tdnn_sp/decode_test_other_fglarge/wer_14_0.0 +%WER 13.33 [ 6979 / 52343, 920 ins, 796 del, 5263 sub ] exp/nnet3_cleaned/tdnn_sp/decode_test_other_tglarge/wer_14_0.0 +%WER 15.90 [ 8323 / 52343, 921 ins, 1126 del, 6276 sub ] exp/nnet3_cleaned/tdnn_sp/decode_test_other_tgmed/wer_13_0.0 +%WER 17.28 [ 9044 / 52343, 894 ins, 1372 del, 6778 sub ] 
exp/nnet3_cleaned/tdnn_sp/decode_test_other_tgsmall/wer_14_0.0 # Results with nnet3 tdnn+sMBR # local/nnet3/run_tdnn_discriminative.sh diff --git a/egs/librispeech/s5/local/chain/run_cnn_tdnn.sh b/egs/librispeech/s5/local/chain/run_cnn_tdnn.sh new file mode 100755 index 00000000000..cd8f38d8309 --- /dev/null +++ b/egs/librispeech/s5/local/chain/run_cnn_tdnn.sh @@ -0,0 +1 @@ +tuning/run_cnn_tdnn_1a.sh diff --git a/egs/librispeech/s5/local/chain/run_tdnn_discriminative.sh b/egs/librispeech/s5/local/chain/run_tdnn_discriminative.sh index 6bf3a139ad1..ac3b74ed0c5 100755 --- a/egs/librispeech/s5/local/chain/run_tdnn_discriminative.sh +++ b/egs/librispeech/s5/local/chain/run_tdnn_discriminative.sh @@ -95,8 +95,8 @@ if [ $frame_subsampling_factor -ne 1 ]; then data_dirs= for x in `seq -$[frame_subsampling_factor/2] $[frame_subsampling_factor/2]`; do - steps/shift_feats.sh --cmd "$train_cmd --max-jobs-run 40" --nj 350 \ - $x $train_data_dir exp/shift_hires mfcc_hires + utils/data/shift_feats.sh \ + $x $train_data_dir ${train_data_dir}_fs$x utils/fix_data_dir.sh ${train_data_dir}_fs$x data_dirs="$data_dirs ${train_data_dir}_fs$x" awk -v nfs=$x '{print "fs"nfs"-"$0}' $train_ivector_dir/ivector_online.scp >> ${train_ivector_dir}_fs/ivector_online.scp diff --git a/egs/librispeech/s5/local/chain/run_tdnn_lstm.sh b/egs/librispeech/s5/local/chain/run_tdnn_lstm.sh new file mode 120000 index 00000000000..a4fa11e0908 --- /dev/null +++ b/egs/librispeech/s5/local/chain/run_tdnn_lstm.sh @@ -0,0 +1 @@ +tuning/run_tdnn_lstm_1b.sh \ No newline at end of file diff --git a/egs/librispeech/s5/local/chain/tuning/run_cnn_tdnn_1a.sh b/egs/librispeech/s5/local/chain/tuning/run_cnn_tdnn_1a.sh new file mode 100755 index 00000000000..db17a35be64 --- /dev/null +++ b/egs/librispeech/s5/local/chain/tuning/run_cnn_tdnn_1a.sh @@ -0,0 +1,274 @@ +#!/bin/bash + +# This is based on tdnn_1d_sp, but adding cnn as the front-end. +# The cnn-tdnn-f (tdnn_cnn_1a_sp) outperforms the tdnn-f (tdnn_1d_sp). + +# bash local/chain/compare_wer.sh exp/chain_cleaned/tdnn_1d_sp exp/chain_cleaned/tdnn_cnn_1a_sp/ +# System tdnn_1d_sp tdnn_cnn_1a_sp +# WER on dev(fglarge) 3.29 3.34 +# WER on dev(tglarge) 3.44 3.39 +# WER on dev(tgmed) 4.22 4.29 +# WER on dev(tgsmall) 4.72 4.77 +# WER on dev_other(fglarge) 8.71 8.62 +# WER on dev_other(tglarge) 9.05 9.00 +# WER on dev_other(tgmed) 11.09 10.93 +# WER on dev_other(tgsmall) 12.13 12.02 +# WER on test(fglarge) 3.80 3.69 +# WER on test(tglarge) 3.89 3.80 +# WER on test(tgmed) 4.72 4.64 +# WER on test(tgsmall) 5.19 5.16 +# WER on test_other(fglarge) 8.76 8.71 +# WER on test_other(tglarge) 9.19 9.11 +# WER on test_other(tgmed) 11.22 11.00 +# WER on test_other(tgsmall) 12.24 12.16 +# Final train prob -0.0378 -0.0420 +# Final valid prob -0.0374 -0.0400 +# Final train prob (xent) -0.6099 -0.6881 +# Final valid prob (xent) -0.6353 -0.7180 +# Num-parameters 22623456 18100736 + + +set -e + +# configs for 'chain' +stage=0 +decode_nj=50 +train_set=train_960_cleaned +gmm=tri6b_cleaned +nnet3_affix=_cleaned + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=cnn_1a +tree_affix= +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +# TDNN options +frames_per_eg=150,110,100 +remove_egs=true +common_egs_dir= +xent_regularize=0.1 +dropout_schedule='0,0@0.20,0.5@0.50,0' + +test_online_decoding=true # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. 
./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # MFCC to filterbank + idct-layer name=idct input=input dim=40 cepstral-lifter=22 affine-transform-file=$dir/configs/idct.mat + + linear-component name=ivector-linear $ivector_affine_opts dim=200 input=ReplaceIndex(ivector, t, 0) + batchnorm-component name=ivector-batchnorm target-rms=0.025 + batchnorm-component name=idct-batchnorm input=idct + + combine-feature-maps-layer name=combine_inputs input=Append(idct-batchnorm, ivector-batchnorm) num-filters1=1 num-filters2=5 height=40 + conv-relu-batchnorm-layer name=cnn1 $cnn_opts height-in=40 height-out=40 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=64 + conv-relu-batchnorm-layer name=cnn2 $cnn_opts height-in=40 height-out=40 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=64 + conv-relu-batchnorm-layer name=cnn3 $cnn_opts height-in=40 height-out=20 height-subsample-out=2 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=128 + conv-relu-batchnorm-layer name=cnn4 $cnn_opts height-in=20 height-out=20 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=128 + conv-relu-batchnorm-layer name=cnn5 $cnn_opts height-in=20 height-out=10 height-subsample-out=2 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=256 + conv-relu-batchnorm-layer name=cnn6 $cnn_opts height-in=10 height-out=10 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=256 + + # the first TDNN-F layer has no bypass + tdnnf-layer name=tdnnf7 $tdnnf_first_opts dim=1536 bottleneck-dim=256 time-stride=0 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf14 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf15 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf16 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf17 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf18 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + linear-component name=prefinal-l dim=256 $linear_opts + + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 15 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b{09,10,11,12}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --use-gpu "wait" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --constrained false" \ + --egs.chunk-width $frames_per_eg \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 2500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.00015 \ + --trainer.optimization.final-effective-lrate 0.000015 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; + +fi + +graph_dir=$dir/graph_tgsmall +if [ $stage -le 16 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 --remove-oov data/lang_test_tgsmall $dir $graph_dir +fi + +iter_opts= +if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " +fi +if [ $stage -le 17 ]; then + rm $dir/.error 2>/dev/null || true + for decode_set in test_clean test_other dev_clean dev_other; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $decode_nj --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_tgsmall || exit 1 + steps/lmrescore.sh --cmd "$decode_cmd" --self-loop-scale 1.0 data/lang_test_{tgsmall,tgmed} \ + data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_{tgsmall,tgmed} || exit 1 + steps/lmrescore_const_arpa.sh \ + --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \ + data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_{tgsmall,tglarge} || exit 1 + steps/lmrescore_const_arpa.sh \ + --cmd "$decode_cmd" data/lang_test_{tgsmall,fglarge} \ + data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_{tgsmall,fglarge} || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + +if $test_online_decoding && [ $stage -le 18 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. 
+ steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3${nnet3_affix}/extractor $dir ${dir}_online + + rm $dir/.error 2>/dev/null || true + for data in test_clean test_other dev_clean dev_other; do + ( + nspk=$(wc -l from the graph - fstrmsymbols --apply-to-output=true --remove-arcs=true "echo 3|" $graph_dir/HCLG.fst $graph_dir/HCLG.fst fi diff --git a/egs/librispeech/s5/local/chain/tuning/run_tdnn_1b.sh b/egs/librispeech/s5/local/chain/tuning/run_tdnn_1b.sh index 7129827fe19..48d6ddb804f 100755 --- a/egs/librispeech/s5/local/chain/tuning/run_tdnn_1b.sh +++ b/egs/librispeech/s5/local/chain/tuning/run_tdnn_1b.sh @@ -122,7 +122,7 @@ if [ $stage -le 14 ]; then # create the config files for nnet initialization num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig @@ -206,10 +206,6 @@ if [ $stage -le 16 ]; then # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. utils/mkgraph.sh --self-loop-scale 1.0 --remove-oov data/lang_test_tgsmall $dir $graph_dir - # remove from the graph, and convert back to const-FST. - fstrmsymbols --apply-to-output=true --remove-arcs=true "echo 3|" $graph_dir/HCLG.fst - | \ - fstconvert --fst_type=const > $graph_dir/temp.fst - mv $graph_dir/temp.fst $graph_dir/HCLG.fst fi diff --git a/egs/librispeech/s5/local/chain/tuning/run_tdnn_1c.sh b/egs/librispeech/s5/local/chain/tuning/run_tdnn_1c.sh index 29ebe62ddde..101fd6a4c15 100755 --- a/egs/librispeech/s5/local/chain/tuning/run_tdnn_1c.sh +++ b/egs/librispeech/s5/local/chain/tuning/run_tdnn_1c.sh @@ -112,7 +112,7 @@ if [ $stage -le 14 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) opts="l2-regularize=0.002" linear_opts="orthonormal-constraint=1.0" output_opts="l2-regularize=0.0005 bottleneck-dim=256" @@ -197,10 +197,6 @@ if [ $stage -le 16 ]; then # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. utils/mkgraph.sh --self-loop-scale 1.0 --remove-oov data/lang_test_tgsmall $dir $graph_dir - # remove from the graph, and convert back to const-FST. 
- fstrmsymbols --apply-to-output=true --remove-arcs=true "echo 3|" $graph_dir/HCLG.fst - | \ - fstconvert --fst_type=const > $graph_dir/temp.fst - mv $graph_dir/temp.fst $graph_dir/HCLG.fst fi iter_opts= diff --git a/egs/librispeech/s5/local/chain/tuning/run_tdnn_1d.sh b/egs/librispeech/s5/local/chain/tuning/run_tdnn_1d.sh index 81b621ef86f..865b10dea0c 100755 --- a/egs/librispeech/s5/local/chain/tuning/run_tdnn_1d.sh +++ b/egs/librispeech/s5/local/chain/tuning/run_tdnn_1d.sh @@ -207,7 +207,7 @@ if [ $stage -le 14 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) affine_opts="l2-regularize=0.008 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true" tdnnf_opts="l2-regularize=0.008 dropout-proportion=0.0 bypass-scale=0.75" linear_opts="l2-regularize=0.008 orthonormal-constraint=-1.0" @@ -297,10 +297,6 @@ if [ $stage -le 16 ]; then # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. utils/mkgraph.sh --self-loop-scale 1.0 --remove-oov data/lang_test_tgsmall $dir $graph_dir - # remove from the graph, and convert back to const-FST. - fstrmsymbols --apply-to-output=true --remove-arcs=true "echo 3|" $graph_dir/HCLG.fst - | \ - fstconvert --fst_type=const > $graph_dir/temp.fst - mv $graph_dir/temp.fst $graph_dir/HCLG.fst fi iter_opts= diff --git a/egs/librispeech/s5/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/librispeech/s5/local/chain/tuning/run_tdnn_lstm_1a.sh new file mode 100755 index 00000000000..0e97e46194d --- /dev/null +++ b/egs/librispeech/s5/local/chain/tuning/run_tdnn_lstm_1a.sh @@ -0,0 +1,218 @@ +#!/bin/bash +# this is the tdnn-lstmp based on the run_tdnn_lstm_1n.sh under Switchboard. + +# training acoustic model and decoding: +# local/chain/tuning/run_tdnn_lstm_1a.sh +# System tdnn_lstm1a_sp +# WER on dev(fglarge) 3.44 +# WER on dev(tglarge) 3.55 +# WER on dev_other(fglarge) 8.63 +# WER on dev_other(tglarge) 9.09 +# WER on test(fglarge) 3.78 +# WER on test(tglarge) 3.94 +# WER on test_other(fglarge) 8.83 +# WER on test_other(tglarge) 9.09 +# Final train prob -0.0452 +# Final valid prob -0.0477 +# Final train prob (xent) -0.7874 +# Final valid prob (xent) -0.8150 +# Num-parameters 27790288 +# exp/chain_cleaned/tdnn_lstm1a_sp/: num-iters=1303 nj=3..16 num-params=27.8M dim=40+100->6056 combine=-0.041->-0.040 (over 9) xent:train/valid[867,1302,final]=(-1.15,-0.782,-0.787/-1.18,-0.810,-0.815) logprob:train/valid[867,1302,final]=(-0.063,-0.047,-0.045/-0.062,-0.049,-0.048) + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +affix=1a +decode_iter= +decode_nj=50 + +# LSTM training options +frames_per_chunk=140,100,160 +frames_per_chunk_primary=$(echo $frames_per_chunk | cut -d, -f1) +chunk_left_context=40 +chunk_right_context=0 +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 +# decode options +extra_left_context=50 +extra_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' + +remove_egs=false +common_egs_dir= +nnet3_affix=_cleaned +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 $opts dim=1280 + linear-component name=tdnn2l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-layer name=tdnn2 $opts input=Append(0,1) dim=1280 + linear-component name=tdnn3l dim=256 $linear_opts + relu-batchnorm-layer name=tdnn3 $opts dim=1280 + linear-component name=tdnn4l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-layer name=tdnn4 $opts input=Append(0,1) dim=1280 + linear-component name=tdnn5l dim=256 $linear_opts + relu-batchnorm-layer name=tdnn5 $opts dim=1280 input=Append(tdnn5l, tdnn3l) + linear-component name=tdnn6l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn6 $opts input=Append(0,3) dim=1280 + linear-component name=lstm1l dim=256 $linear_opts input=Append(-3,0) + fast-lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=128 delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn7 $opts input=Append(0,3,tdnn6l,tdnn4l,tdnn2l) dim=1280 + linear-component name=tdnn8l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn8 $opts input=Append(0,3) dim=1280 + linear-component name=lstm2l dim=256 $linear_opts input=Append(-3,0) + fast-lstmp-layer name=lstm2 cell-dim=1280 recurrent-projection-dim=256 non-recurrent-projection-dim=128 delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn9 $opts input=Append(0,3,tdnn8l,tdnn6l,tdnn4l) dim=1280 + linear-component name=tdnn10l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn10 $opts input=Append(0,3) dim=1280 + linear-component name=lstm3l dim=256 $linear_opts input=Append(-3,0) + fast-lstmp-layer name=lstm3 cell-dim=1280 recurrent-projection-dim=256 non-recurrent-projection-dim=128 delay=-3 dropout-proportion=0.0 $lstm_opts + + output-layer name=output input=lstm3 include-log-softmax=false $output_opts + + output-layer name=output-xent input=lstm3 learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/c0{1,2,5,7}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 6 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_chunk \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + + +graph_dir=$dir/graph_tgsmall +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 --remove-oov data/lang_test_tgsmall $dir $graph_dir +fi + + +iter_opts= +if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " +fi +if [ $stage -le 15 ]; then + rm $dir/.error 2>/dev/null || true + for decode_set in test_clean test_other dev_clean dev_other; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $decode_nj --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_tgsmall || exit 1 + steps/lmrescore.sh --cmd "$decode_cmd" --self-loop-scale 1.0 data/lang_test_{tgsmall,tgmed} \ + data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_{tgsmall,tgmed} || exit 1 + steps/lmrescore_const_arpa.sh \ + --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \ + data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_{tgsmall,tglarge} || exit 1 + steps/lmrescore_const_arpa.sh \ + --cmd "$decode_cmd" data/lang_test_{tgsmall,fglarge} \ + data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_{tgsmall,fglarge} || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi diff --git a/egs/librispeech/s5/local/chain/tuning/run_tdnn_lstm_1b.sh b/egs/librispeech/s5/local/chain/tuning/run_tdnn_lstm_1b.sh new file mode 100755 index 00000000000..0da813267fc --- /dev/null +++ b/egs/librispeech/s5/local/chain/tuning/run_tdnn_lstm_1b.sh @@ -0,0 +1,253 @@ +#!/bin/bash +# this is the tdnn-lstmp based on the run_tdnn_lstm_1a.sh under Librispeech but with larger model size. 
+ +# training acoustic model and decoding: +# local/chain/tuning/run_tdnn_lstm_1b.sh +# local/chain/compare_wer.sh exp/chain_cleaned/tdnn_lstm1a_sp exp/chain_cleaned/tdnn_lstm1b_sp +# System tdnn_lstm1a_sp tdnn_lstm1b_sp +# WER on dev(fglarge) 3.44 3.36 +# WER on dev(tglarge) 3.55 3.48 +# WER on dev(tgmed) 4.41 4.26 +# WER on dev(tgsmall) 4.82 4.71 +# WER on dev_other(fglarge) 8.63 8.43 +# WER on dev_other(tglarge) 9.09 8.94 +# WER on dev_other(tgmed) 10.99 10.65 +# WER on dev_other(tgsmall) 11.95 11.51 +# WER on test(fglarge) 3.78 3.83 +# WER on test(tglarge) 3.94 3.93 +# WER on test(tgmed) 4.68 4.72 +# WER on test(tgsmall) 5.11 5.10 +# WER on test_other(fglarge) 8.83 8.69 +# WER on test_other(tglarge) 9.09 9.10 +# WER on test_other(tgmed) 11.05 10.86 +# WER on test_other(tgsmall) 12.18 11.83 +# Final train prob -0.0452 -0.0417 +# Final valid prob -0.0477 -0.0459 +# Final train prob (xent) -0.7874 -0.7488 +# Final valid prob (xent) -0.8150 -0.7757 +# Num-parameters 27790288 45245520 + +# rnn-lm rescoring: +# local/rnnlm/tuning/run_tdnn_lstm_1a.sh --ac-model-dir exp/chain_cleaned/tdnn_lstm1b_sp/ +# System tdnn_lstm1b_sp +# WER on dev(fglarge_nbe_rnnlm) 2.73 +# WER on dev(fglarge_lat_rnnlm) 2.83 +# WER on dev(fglarge) 3.36 +# WER on dev(tglarge) 3.48 +# WER on dev_other(fglarge_nbe_rnnlm) 7.20 +# WER on dev_other(fglarge_lat_rnnlm) 7.23 +# WER on dev_other(fglarge) 8.43 +# WER on dev_other(tglarge) 8.94 +# WER on test(fglarge_nbe_rnnlm) 3.10 +# WER on test(fglarge_lat_rnnlm) 3.22 +# WER on test(fglarge) 3.83 +# WER on test(tglarge) 3.93 +# WER on test_other(fglarge_nbe_rnnlm) 7.54 +# WER on test_other(fglarge_lat_rnnlm) 7.65 +# WER on test_other(fglarge) 8.69 +# WER on test_other(tglarge) 9.10 +# Final train prob -0.0417 +# Final valid prob -0.0459 +# Final train prob (xent) -0.7488 +# Final valid prob (xent) -0.7757 +# Num-parameters 45245520 + + + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +affix=1b +decode_iter= +decode_nj=50 + +# LSTM training options +frames_per_chunk=140,100,160 +frames_per_chunk_primary=$(echo $frames_per_chunk | cut -d, -f1) +chunk_left_context=40 +chunk_right_context=0 +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 +# decode options +extra_left_context=50 +extra_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' + +remove_egs=false +common_egs_dir= +nnet3_affix=_cleaned +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 $opts dim=1280 + linear-component name=tdnn2l dim=320 $linear_opts input=Append(-1,0) + relu-batchnorm-layer name=tdnn2 $opts input=Append(0,1) dim=1280 + linear-component name=tdnn3l dim=320 $linear_opts + relu-batchnorm-layer name=tdnn3 $opts dim=1280 + linear-component name=tdnn4l dim=320 $linear_opts input=Append(-1,0) + relu-batchnorm-layer name=tdnn4 $opts input=Append(0,1) dim=1280 + linear-component name=tdnn5l dim=320 $linear_opts + relu-batchnorm-layer name=tdnn5 $opts dim=1280 input=Append(tdnn5l, tdnn3l) + linear-component name=tdnn6l dim=320 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn6 $opts input=Append(0,3) dim=1280 + linear-component name=lstm1l dim=320 $linear_opts input=Append(-3,0) + fast-lstmp-layer name=lstm1 cell-dim=1536 recurrent-projection-dim=384 non-recurrent-projection-dim=384 delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn7 $opts input=Append(0,3,tdnn6l,tdnn4l,tdnn2l) dim=1280 + linear-component name=tdnn8l dim=320 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn8 $opts input=Append(0,3) dim=1280 + linear-component name=lstm2l dim=320 $linear_opts input=Append(-3,0) + fast-lstmp-layer name=lstm2 cell-dim=1536 recurrent-projection-dim=384 non-recurrent-projection-dim=384 delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn9 $opts input=Append(0,3,tdnn8l,tdnn6l,tdnn4l) dim=1280 + linear-component name=tdnn10l dim=320 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn10 $opts input=Append(0,3) dim=1280 + linear-component name=lstm3l dim=320 $linear_opts input=Append(-3,0) + fast-lstmp-layer name=lstm3 cell-dim=1536 recurrent-projection-dim=384 non-recurrent-projection-dim=384: delay=-3 dropout-proportion=0.0 $lstm_opts + + output-layer name=output input=lstm3 include-log-softmax=false $output_opts + + output-layer name=output-xent input=lstm3 learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/c0{1,2,5,7}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 6 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_chunk \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + + +graph_dir=$dir/graph_tgsmall +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 --remove-oov data/lang_test_tgsmall $dir $graph_dir +fi + + +iter_opts= +if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " +fi +if [ $stage -le 15 ]; then + rm $dir/.error 2>/dev/null || true + for decode_set in test_clean test_other dev_clean dev_other; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $decode_nj --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_tgsmall || exit 1 + steps/lmrescore.sh --cmd "$decode_cmd" --self-loop-scale 1.0 data/lang_test_{tgsmall,tgmed} \ + data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_{tgsmall,tgmed} || exit 1 + steps/lmrescore_const_arpa.sh \ + --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \ + data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_{tgsmall,tglarge} || exit 1 + steps/lmrescore_const_arpa.sh \ + --cmd "$decode_cmd" data/lang_test_{tgsmall,fglarge} \ + data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_{tgsmall,fglarge} || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi diff --git a/egs/librispeech/s5/local/data_prep.sh b/egs/librispeech/s5/local/data_prep.sh index dea93525e28..20c5697d61f 100755 --- a/egs/librispeech/s5/local/data_prep.sh +++ b/egs/librispeech/s5/local/data_prep.sh @@ -31,7 +31,6 @@ wav_scp=$dst/wav.scp; [[ -f "$wav_scp" ]] && rm $wav_scp trans=$dst/text; [[ -f "$trans" ]] && rm $trans utt2spk=$dst/utt2spk; [[ -f "$utt2spk" ]] && rm $utt2spk spk2gender=$dst/spk2gender; [[ -f $spk2gender ]] && rm $spk2gender -utt2dur=$dst/utt2dur; [[ -f "$utt2dur" ]] && rm $utt2dur for reader_dir in $(find -L $src -mindepth 1 -maxdepth 1 -type d | sort); do reader=$(basename $reader_dir) @@ -79,8 +78,6 @@ nutt2spk=$(wc -l <$utt2spk) ! [ "$ntrans" -eq "$nutt2spk" ] && \ echo "Inconsistent #transcripts($ntrans) and #utt2spk($nutt2spk)" && exit 1; -utils/data/get_utt2dur.sh $dst 1>&2 || exit 1 - utils/validate_data_dir.sh --no-feats $dst || exit 1; echo "$0: successfully prepared data in $dst" diff --git a/egs/librispeech/s5/local/download_and_untar.sh b/egs/librispeech/s5/local/download_and_untar.sh index d01e681fed7..1bb6d909edc 100755 --- a/egs/librispeech/s5/local/download_and_untar.sh +++ b/egs/librispeech/s5/local/download_and_untar.sh @@ -67,7 +67,9 @@ if [ -f $data/$part.tar.gz ]; then fi fi -if [ ! -f $data/$part.tar.gz ]; then +pushd $data + +if [ ! -f $part.tar.gz ]; then if ! which wget >/dev/null; then echo "$0: wget is not installed." exit 1; @@ -75,20 +77,19 @@ if [ ! -f $data/$part.tar.gz ]; then full_url=$url/$part.tar.gz echo "$0: downloading data from $full_url. This may take some time, please be patient." - cd $data if ! wget --no-check-certificate $full_url; then echo "$0: error executing wget $full_url" exit 1; fi fi -cd $data - if ! 
tar -xvzf $part.tar.gz; then echo "$0: error un-tarring archive $data/$part.tar.gz" exit 1; fi +popd >&/dev/null + touch $data/LibriSpeech/$part/.complete echo "$0: Successfully downloaded and un-tarred $data/$part.tar.gz" diff --git a/egs/librispeech/s5/local/lm/python/text_post_process.py b/egs/librispeech/s5/local/lm/python/text_post_process.py index 4ffbbe04b1f..344c1b291bd 100755 --- a/egs/librispeech/s5/local/lm/python/text_post_process.py +++ b/egs/librispeech/s5/local/lm/python/text_post_process.py @@ -21,10 +21,10 @@ def parse_args(): parser.add_argument('--abort-long-sent', type=bool, default=False, help='If True and a sentence longer than "max-sent-len" detected' +\ 'exit with error code 1. If False, just split the long sentences.') - parser.add_argument('--sent-end-marker', type=str, default="DOTDOTDOT") - parser.add_argument("in_text", type=str, help="Input text") - parser.add_argument("out_text", type=str, help="Output text") - parser.add_argument("sent_bounds", type=str, + parser.add_argument('--sent-end-marker', default="DOTDOTDOT") + parser.add_argument("in_text", help="Input text") + parser.add_argument("out_text", help="Output text") + parser.add_argument("sent_bounds", help="A file that will contain a comma separated list of numbers, s.t. if" + "i is in this list, then there is a sententence break after token i") return parser.parse_args() @@ -66,7 +66,7 @@ def parse_args(): n_tokens += 1 start_scan = 4 current_line.append('SUN') - for i in xrange(start_scan, len(opl_tokens)): + for i in range(start_scan, len(opl_tokens)): m = re.match("^[A-Z]+\'?[A-Z\']*$", opl_tokens[i]) if m is not None: n_tokens += 1 diff --git a/egs/librispeech/s5/local/lm/python/text_pre_process.py b/egs/librispeech/s5/local/lm/python/text_pre_process.py index 6228079b3a3..b75d0711d13 100755 --- a/egs/librispeech/s5/local/lm/python/text_pre_process.py +++ b/egs/librispeech/s5/local/lm/python/text_pre_process.py @@ -20,13 +20,13 @@ def parse_args(): parser = argparse.ArgumentParser(description="Pre-process a book's text") - parser.add_argument("--in-encoding", type=str, default="utf-8", + parser.add_argument("--in-encoding", default="utf-8", help="Encoding to use when reading the input text") - parser.add_argument("--out-encoding", type=str, default="ascii", + parser.add_argument("--out-encoding", default="ascii", help="Encoding to use when writing the output text") - parser.add_argument('--sent-end-marker', type=str, default="DOTDOTDOT") - parser.add_argument("in_text", type=str, help="Input text") - parser.add_argument("out_text", type=str, help="Output text") + parser.add_argument('--sent-end-marker', default="DOTDOTDOT") + parser.add_argument("in_text", help="Input text") + parser.add_argument("out_text", help="Output text") return parser.parse_args() # http://rosettacode.org/wiki/Roman_numerals/Decode#Python diff --git a/egs/librispeech/s5/local/lm/train_lm.sh b/egs/librispeech/s5/local/lm/train_lm.sh index 04badd95b26..6e6ae5970fb 100755 --- a/egs/librispeech/s5/local/lm/train_lm.sh +++ b/egs/librispeech/s5/local/lm/train_lm.sh @@ -50,7 +50,7 @@ if [ "$stage" -le 1 ]; then split_files=$(eval "echo $split_prefix-{$(seq -s',' $normjobs | sed 's/,$//')}") find $corpus_dir -mindepth 1 -maxdepth 1 -type d |\ tee $tmp_dir/all_texts.txt |\ - utils/split_scp.pl - $split_files + utils/split_scp.pl /dev/stdin $split_files echo "Checking the splits ..." 
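The "print (0.5/$xent_regularize)" and xrange -> range edits in the hunks above are Python 3 compatibility fixes. A minimal, self-contained sketch of what they guard against (the value 0.1 is just the default xent_regularize used by these recipes):

# Under Python 3, print is a function, so the unparenthesized Python 2 form
# "print 0.5/x" is a syntax error, and xrange() no longer exists.  The
# parenthesized call and range() behave the same under both interpreters.
xent_regularize = 0.1
learning_rate_factor = 0.5 / xent_regularize
print(learning_rate_factor)   # 5.0, which is what the shell substitution captures
for i in range(4, 10):        # replaces the old xrange(start_scan, len(opl_tokens))
    pass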
total_count=$(wc -l <$tmp_dir/all_texts.txt) split_count=$(cat $split_files | wc -l | awk 'BEGIN{c=0} {c+=$1;} END{print c}') diff --git a/egs/librispeech/s5/local/nnet3/run_tdnn.sh b/egs/librispeech/s5/local/nnet3/run_tdnn.sh deleted file mode 100755 index 28ee2b92004..00000000000 --- a/egs/librispeech/s5/local/nnet3/run_tdnn.sh +++ /dev/null @@ -1,127 +0,0 @@ -#!/bin/bash - -# this is the standard "tdnn" system, built in nnet3; it's what we use to -# call multi-splice. - -# without cleanup: -# local/nnet3/run_tdnn.sh --train-set train960 --gmm tri6b --nnet3-affix "" & - - -# At this script level we don't support not running on GPU, as it would be painfully slow. -# If you want to run without GPU you'd have to call train_tdnn.sh with --gpu false, -# --num-threads 16 and --minibatch-size 128. - -# First the options that are passed through to run_ivector_common.sh -# (some of which are also used in this script directly). -stage=0 -decode_nj=30 -train_set=train_960_cleaned -gmm=tri6b_cleaned # this is the source gmm-dir for the data-type of interest; it - # should have alignments for the specified training data. -nnet3_affix=_cleaned - -# Options which are not passed through to run_ivector_common.sh -affix= -train_stage=-10 -common_egs_dir= -reporting_email= -remove_egs=true - -. ./cmd.sh -. ./path.sh -. ./utils/parse_options.sh - - -if ! cuda-compiled; then - cat </dev/null || true - for test in test_clean test_other dev_clean dev_other; do - ( - steps/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" \ - --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${test}_hires \ - ${graph_dir} data/${test}_hires $dir/decode_${test}_tgsmall || exit 1 - steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \ - data/${test}_hires $dir/decode_${test}_{tgsmall,tgmed} || exit 1 - steps/lmrescore_const_arpa.sh \ - --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \ - data/${test}_hires $dir/decode_${test}_{tgsmall,tglarge} || exit 1 - steps/lmrescore_const_arpa.sh \ - --cmd "$decode_cmd" data/lang_test_{tgsmall,fglarge} \ - data/${test}_hires $dir/decode_${test}_{tgsmall,fglarge} || exit 1 - ) || touch $dir/.error & - done - wait - [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 -fi - -exit 0; diff --git a/egs/librispeech/s5/local/nnet3/run_tdnn.sh b/egs/librispeech/s5/local/nnet3/run_tdnn.sh new file mode 120000 index 00000000000..61f8f499182 --- /dev/null +++ b/egs/librispeech/s5/local/nnet3/run_tdnn.sh @@ -0,0 +1 @@ +tuning/run_tdnn_1b.sh \ No newline at end of file diff --git a/egs/librispeech/s5/local/nnet3/tuning/run_tdnn_1a.sh b/egs/librispeech/s5/local/nnet3/tuning/run_tdnn_1a.sh new file mode 100755 index 00000000000..28ee2b92004 --- /dev/null +++ b/egs/librispeech/s5/local/nnet3/tuning/run_tdnn_1a.sh @@ -0,0 +1,127 @@ +#!/bin/bash + +# this is the standard "tdnn" system, built in nnet3; it's what we use to +# call multi-splice. + +# without cleanup: +# local/nnet3/run_tdnn.sh --train-set train960 --gmm tri6b --nnet3-affix "" & + + +# At this script level we don't support not running on GPU, as it would be painfully slow. +# If you want to run without GPU you'd have to call train_tdnn.sh with --gpu false, +# --num-threads 16 and --minibatch-size 128. + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). 
+stage=0 +decode_nj=30 +train_set=train_960_cleaned +gmm=tri6b_cleaned # this is the source gmm-dir for the data-type of interest; it + # should have alignments for the specified training data. +nnet3_affix=_cleaned + +# Options which are not passed through to run_ivector_common.sh +affix= +train_stage=-10 +common_egs_dir= +reporting_email= +remove_egs=true + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat </dev/null || true + for test in test_clean test_other dev_clean dev_other; do + ( + steps/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${test}_hires \ + ${graph_dir} data/${test}_hires $dir/decode_${test}_tgsmall || exit 1 + steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \ + data/${test}_hires $dir/decode_${test}_{tgsmall,tgmed} || exit 1 + steps/lmrescore_const_arpa.sh \ + --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \ + data/${test}_hires $dir/decode_${test}_{tgsmall,tglarge} || exit 1 + steps/lmrescore_const_arpa.sh \ + --cmd "$decode_cmd" data/lang_test_{tgsmall,fglarge} \ + data/${test}_hires $dir/decode_${test}_{tgsmall,fglarge} || exit 1 + ) || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 +fi + +exit 0; diff --git a/egs/librispeech/s5/local/nnet3/tuning/run_tdnn_1b.sh b/egs/librispeech/s5/local/nnet3/tuning/run_tdnn_1b.sh new file mode 100755 index 00000000000..a96a1b33e6c --- /dev/null +++ b/egs/librispeech/s5/local/nnet3/tuning/run_tdnn_1b.sh @@ -0,0 +1,135 @@ +#!/bin/bash + +# 1b is as 1a but uses xconfigs. + +# this is the standard "tdnn" system, built in nnet3; it's what we use to +# call multi-splice. + +# without cleanup: +# local/nnet3/run_tdnn.sh --train-set train960 --gmm tri6b --nnet3-affix "" & + + +# At this script level we don't support not running on GPU, as it would be painfully slow. +# If you want to run without GPU you'd have to call train_tdnn.sh with --gpu false, +# --num-threads 16 and --minibatch-size 128. + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +decode_nj=30 +train_set=train_960_cleaned +gmm=tri6b_cleaned # this is the source gmm-dir for the data-type of interest; it + # should have alignments for the specified training data. +nnet3_affix=_cleaned + +# Options which are not passed through to run_ivector_common.sh +affix= +train_stage=-10 +common_egs_dir= +reporting_email= +remove_egs=true + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + relu-batchnorm-layer name=tdnn0 dim=1280 + relu-batchnorm-layer name=tdnn1 dim=1280 input=Append(-1,2) + relu-batchnorm-layer name=tdnn2 dim=1280 input=Append(-3,3) + relu-batchnorm-layer name=tdnn3 dim=1280 input=Append(-7,2) + relu-batchnorm-layer name=tdnn4 dim=1280 + output-layer name=output input=tdnn4 dim=$num_targets max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig \ + --config-dir $dir/configs || exit 1; +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/train_dnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0017 \ + --trainer.optimization.final-effective-lrate 0.00017 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --cleanup.preserve-model-interval 100 \ + --feat-dir=$train_data_dir \ + --ali-dir $ali_dir \ + --lang data/lang \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; + +fi + +if [ $stage -le 13 ]; then + # this does offline decoding that should give about the same results as the + # real online decoding (the one with --per-utt true) + rm $dir/.error 2>/dev/null || true + for test in test_clean test_other dev_clean dev_other; do + ( + steps/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${test}_hires \ + ${graph_dir} data/${test}_hires $dir/decode_${test}_tgsmall || exit 1 + steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \ + data/${test}_hires $dir/decode_${test}_{tgsmall,tgmed} || exit 1 + steps/lmrescore_const_arpa.sh \ + --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \ + data/${test}_hires $dir/decode_${test}_{tgsmall,tglarge} || exit 1 + steps/lmrescore_const_arpa.sh \ + --cmd "$decode_cmd" data/lang_test_{tgsmall,fglarge} \ + data/${test}_hires $dir/decode_${test}_{tgsmall,fglarge} || exit 1 + ) || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 +fi + +exit 0; diff --git a/egs/librispeech/s5/local/prepare_dict.sh b/egs/librispeech/s5/local/prepare_dict.sh index f798a804355..f9efb2ee46b 100755 --- a/egs/librispeech/s5/local/prepare_dict.sh +++ b/egs/librispeech/s5/local/prepare_dict.sh @@ -75,7 +75,7 @@ if [ $stage -le 1 ]; then auto_vocab_splits=$(eval "echo $auto_vocab_prefix.{$(seq -s',' $nj | sed 's/,$//')}") awk 'NR==FNR{a[$1] = 1; next} !($1 in a)' $cmudict_plain $vocab |\ sort | tee $g2p_dir/vocab_autogen.full |\ - utils/split_scp.pl - $auto_vocab_splits || exit 1 + utils/split_scp.pl /dev/stdin $auto_vocab_splits || exit 1 echo "Autogenerating pronunciations for the words in $auto_vocab_prefix.* ..." $cmd JOB=1:$nj $g2p_dir/log/g2p.JOB.log \ local/g2p.sh $auto_vocab_prefix.JOB $g2p_model_dir $auto_lexicon_prefix.JOB || exit 1 diff --git a/egs/librispeech/s5/local/rnnlm/tuning/run_tdnn_lstm_1a.sh b/egs/librispeech/s5/local/rnnlm/tuning/run_tdnn_lstm_1a.sh new file mode 100755 index 00000000000..137a972f3d9 --- /dev/null +++ b/egs/librispeech/s5/local/rnnlm/tuning/run_tdnn_lstm_1a.sh @@ -0,0 +1,166 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (author: Daniel Povey) +# 2018 Ke Li + +# This script trains LMs on the librispeech-lm-norm.txt.gz. + +# rnnlm/train_rnnlm.sh: best iteration (out of 143) was 142, linking it to final iteration. +# rnnlm/train_rnnlm.sh: train/dev perplexity was 109.2 / 110.7. 
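The train/dev perplexity of 109.2 / 110.7 quoted above can be read off the objective values listed below, assuming the objf is the average natural-log probability per word (so perplexity = exp(-objf)); a quick check against the final values:

import math

final_train_objf = -4.68   # last value of the "Train objf" trace below
final_dev_objf = -4.71     # last value of the "Dev objf" trace below
print(math.exp(-final_train_objf))   # ~107.8, in line with the reported 109.2
print(math.exp(-final_dev_objf))     # ~111.1, in line with the reported 110.7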
+# Train objf: -5.74 -5.54 -5.44 -5.37 -5.32 -5.28 -5.25 -5.23 -5.20 -5.18 -5.15 -5.14 -5.12 -5.10 -5.09 -5.08 -5.07 -5.05 -5.04 -5.04 -5.03 -5.02 -5.01 -5.00 -4.99 -4.99 -4.98 -4.97 -4.96 -4.96 -4.95 -4.95 -4.94 -4.93 -4.93 -4.92 -4.92 -4.92 -4.91 -4.90 -4.90 -4.89 -4.89 -4.89 -4.88 -4.88 -4.87 -4.87 -4.87 -4.86 -4.86 -4.86 -4.85 -4.85 -4.84 -4.84 -4.84 -4.84 -4.84 -4.83 -4.83 -4.83 -4.82 -4.82 -4.82 -4.82 -4.81 -4.81 -4.81 -4.81 -4.80 -4.80 -4.80 -4.79 -4.79 -4.79 -4.79 -4.78 -4.79 -4.78 -4.78 -4.78 -4.78 -4.77 -4.77 -4.77 -4.77 -4.77 -4.76 -4.76 -4.76 -4.76 -4.76 -4.75 -4.75 -4.75 -4.75 -4.75 -4.74 -4.74 -4.74 -4.74 -4.74 -4.74 -4.73 -4.74 -4.74 -4.73 -4.73 -4.73 -4.73 -4.73 -4.72 -4.73 -4.73 -4.73 -4.72 -4.72 -4.72 -4.72 -4.72 -4.72 -4.72 -4.72 -4.71 -4.71 -4.71 -4.71 -4.71 -4.70 -4.70 -4.70 -4.70 -4.70 -4.69 -4.69 -4.69 -4.69 -4.69 -4.69 -4.68 -4.68 +# Dev objf: -5.99 -5.65 -5.53 -5.44 -5.38 -5.34 -5.30 -5.27 -5.22 -5.20 -5.18 -5.16 -5.14 -5.12 -5.11 -5.10 -5.09 -5.08 -5.07 -5.05 -5.04 -5.04 -5.03 -5.01 -5.00 -4.99 -4.99 -4.98 -4.97 -4.97 0.00 -4.96 -4.95 -4.95 -4.94 -4.93 -4.93 -4.92 -4.92 -4.91 -4.91 -4.90 -4.90 -4.89 -4.89 -4.89 -4.88 -4.88 -4.88 -4.87 -4.87 -4.87 -4.86 -4.86 -4.85 -4.85 -4.87 -4.84 -4.84 -4.84 -4.83 -4.91 -4.83 -4.83 -4.83 -4.82 -4.82 -4.82 -4.82 -4.81 -4.81 -4.81 -4.80 -4.80 -4.80 -4.80 -4.80 -4.79 -4.79 -4.79 -4.79 -4.79 -4.79 -4.78 -4.78 -4.79 -4.78 -4.77 -4.77 -4.77 -4.77 -4.77 -4.77 -4.77 -4.76 -4.76 -4.76 -4.76 -4.76 -4.75 -4.75 -4.75 -4.75 -4.75 -4.75 -4.75 -4.75 -4.75 -4.75 -4.75 -4.75 -4.74 -4.74 -4.74 -4.74 -4.74 -4.74 -4.74 -4.73 -4.74 -4.73 -4.73 -4.73 -4.73 -4.73 -4.73 -4.72 -4.72 -4.72 -4.72 -4.72 -4.72 -4.72 -4.72 -4.71 -4.71 -4.71 -4.71 -4.71 -4.71 -4.71 -4.71 + +# WER summary on dev and test sets +# System tdnn_1d_sp +lattice_rescore +nbest_rescore +# WER on dev(fglarge) 3.34 2.71 2.62 +# WER on dev(tglarge) 3.44 2.75 2.66 +# WER on dev_other(fglarge) 8.70 7.37 7.55 +# WER on dev_other(tglarge) 9.25 7.56 7.73 +# WER on test(fglarge) 3.77 3.12 3.06 +# WER on test(tglarge) 3.85 3.18 3.11 +# WER on test_other(fglarge) 8.91 7.63 7.68 +# WER on test_other(tglarge) 9.31 7.83 7.95 + +# command to get the WERs above: +# tdnn_1d_sp +# for test in dev_clean test_clean dev_other test_other; do for lm in fglarge tglarge; do grep WER exp/chain_cleaned/tdnn_1d_sp/decode_${test}_${lm}/wer* | best_wer.sh; done; done +# tdnn_1d_sp with lattice rescoring +# for test in dev_clean test_clean dev_other test_other; do for lm in fglarge tglarge; do grep WER exp/chain_cleaned/tdnn_1d_sp/decode_${test}_${lm}_rnnlm_1a_rescore/wer* | best_wer.sh; done; done +# tdnn_1d_sp with nbest rescoring +# for test in dev_clean test_clean dev_other test_other; do for lm in fglarge tglarge; do grep WER exp/chain_cleaned/tdnn_1d_sp/decode_${test}_${lm}_rnnlm_1a_nbest_rescore/wer* | best_wer.sh; done; done + +# Begin configuration section. + +dir=exp/rnnlm_lstm_1a +embedding_dim=1024 +lstm_rpd=256 +lstm_nrpd=256 +stage=-10 +train_stage=-10 +epochs=4 + +# variables for lattice rescoring +run_lat_rescore=true +run_nbest_rescore=true +run_backward_rnnlm=false +ac_model_dir=exp/chain_cleaned/tdnn_1d_sp +decode_dir_suffix=rnnlm_1a +ngram_order=4 # approximate the lattice-rescoring by limiting the max-ngram-order + # if it's set, it merges histories in the lattice if they share + # the same ngram history and this prevents the lattice from + # exploding exponentially +pruned_rescore=true + +. ./cmd.sh +. 
./utils/parse_options.sh + +text=data/local/lm/librispeech-lm-norm.txt.gz +lexicon=data/lang_nosp/words.txt +text_dir=data/rnnlm/text +mkdir -p $dir/config +set -e + +for f in $lexicon; do + [ ! -f $f ] && \ + echo "$0: expected file $f to exist; search for run.sh in run.sh" && exit 1 +done + +if [ $stage -le 0 ]; then + mkdir -p $text_dir + if [ ! -f $text ]; then + wget http://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz -P data/local/lm + fi + echo -n >$text_dir/dev.txt + # hold out one in every 2000 lines as dev data. + gunzip -c $text | cut -d ' ' -f2- | awk -v text_dir=$text_dir '{if(NR%2000 == 0) { print >text_dir"/dev.txt"; } else {print;}}' >$text_dir/librispeech.txt +fi + +if [ $stage -le 1 ]; then + cp $lexicon $dir/config/ + n=`cat $dir/config/words.txt | wc -l` + echo " $n" >> $dir/config/words.txt + + # words that are not present in words.txt but are in the training or dev data, will be + # mapped to during training. + echo "" >$dir/config/oov.txt + + cat > $dir/config/data_weights.txt <$dir/config/unigram_probs.txt + + # choose features + rnnlm/choose_features.py --unigram-probs=$dir/config/unigram_probs.txt \ + --top-word-features=5000 \ + --use-constant-feature=true \ + --special-words=',,,,' \ + $dir/config/words.txt > $dir/config/features.txt + + cat >$dir/config/xconfig <$lat_dir/splice_opts - fi if [ $stage -le 3 ]; then @@ -133,7 +129,7 @@ if [ $stage -le 4 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) common1="required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" common2="required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" common3="required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" @@ -185,7 +181,7 @@ if [ $stage -le 5 ]; then --chain.leaky-hmm-coefficient=0.1 \ --chain.l2-regularize=0.00005 \ --chain.apply-deriv-weights=false \ - --chain.lm-opts="--num-extra-lm-states=500" \ + --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=1000" \ --chain.frame-subsampling-factor=$frame_subsampling_factor \ --chain.alignment-subsampling-factor=1 \ --chain.left-tolerance 3 \ @@ -193,7 +189,7 @@ if [ $stage -le 5 ]; then --trainer.srand=$srand \ --trainer.max-param-change=2.0 \ --trainer.num-epochs=4 \ - --trainer.frames-per-iter=1000000 \ + --trainer.frames-per-iter=2000000 \ --trainer.optimization.num-jobs-initial=3 \ --trainer.optimization.num-jobs-final=16 \ --trainer.optimization.initial-effective-lrate=0.001 \ @@ -201,11 +197,8 @@ if [ $stage -le 5 ]; then --trainer.optimization.shrink-value=1.0 \ --trainer.num-chunk-per-minibatch=64,32 \ --trainer.optimization.momentum=0.0 \ + --trainer.add-option="--optimization.memory-compression-level=2" \ --egs.chunk-width=$chunk_width \ - --egs.chunk-left-context=$chunk_left_context \ - --egs.chunk-right-context=$chunk_right_context \ - --egs.chunk-left-context-initial=0 \ - --egs.chunk-right-context-final=0 \ --egs.dir="$common_egs_dir" \ --egs.opts="--frames-overlap-per-eg 0 --constrained false" \ --cleanup.remove-egs=$remove_egs \ @@ -226,18 +219,20 @@ if [ $stage -le 6 ]; then # as long as phones.txt was compatible. 
utils/mkgraph.sh \ - --self-loop-scale 1.0 data/$lang_test \ + --self-loop-scale 1.0 $lang_decode \ $dir $dir/graph || exit 1; fi if [ $stage -le 7 ]; then frames_per_chunk=$(echo $chunk_width | cut -d, -f1) steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context $chunk_left_context \ - --extra-right-context $chunk_right_context \ - --extra-left-context-initial 0 \ - --extra-right-context-final 0 \ --frames-per-chunk $frames_per_chunk \ --nj $nj --cmd "$cmd" \ $dir/graph data/test $dir/decode_test || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/test $dir/decode_test{,_rescored} || exit 1 fi + +echo "Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/madcat_ar/v1/local/chain/tuning/run_e2e_cnn_1a.sh b/egs/madcat_ar/v1/local/chain/tuning/run_e2e_cnn_1a.sh new file mode 100755 index 00000000000..3caf8ae4494 --- /dev/null +++ b/egs/madcat_ar/v1/local/chain/tuning/run_e2e_cnn_1a.sh @@ -0,0 +1,133 @@ +#!/bin/bash +# Copyright 2017 Hossein Hadian + +# This script does end2end chain training (i.e. from scratch) +# ./local/chain/compare_wer.sh exp/chain/e2e_cnn_1a/ +# System e2e_cnn_1a e2e_cnn_1a (with extra corpus text) +# WER 9.47 5.73 +# WER (rescored) 8.05 5.67 +# CER 2.45 1.45 +# CER (rescored) 2.10 1.42 +# Final train prob -0.0934 -0.0934 +# Final valid prob -0.0746 -0.0746 +# Final train prob (xent) +# Final valid prob (xent) +# Parameters 2.94M 2.94M + +# steps/info/chain_dir_info.pl exp/chain/e2e_cnn_1a/ +# exp/chain/e2e_cnn_1a/: num-iters=98 nj=6..16 num-params=2.9M dim=40->330 combine=-0.071->-0.070 (over 5) logprob:train/valid[64,97,final]=(-0.089,-0.084,-0.093/-0.075,-0.073,-0.075) +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +affix=1a + +# training options +tdnn_dim=450 +minibatch_size=150=128,64/300=128,64/600=64,32/1200=32,16 +common_egs_dir= +cmvn_opts="--norm-means=false --norm-vars=false" +train_set=train +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 1 ]; then + steps/nnet3/chain/e2e/prepare_e2e.sh --nj 30 --cmd "$cmd" \ + --shared-phones true \ + --type mono \ + data/$train_set $lang $treedir + $cmd $treedir/log/make_phone_lm.log \ + cat data/$train_set/text \| \ + steps/nnet3/chain/e2e/text_to_phones.py data/lang \| \ + utils/sym2int.pl -f 2- data/lang/phones.txt \| \ + chain-est-phone-lm --num-extra-lm-states=500 \ + ark:- $treedir/phone_lm.fst +fi + +if [ $stage -le 2 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') + common1="required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + relu-batchnorm-layer name=tdnn1 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs +fi + +if [ $stage -le 3 ]; then + steps/nnet3/chain/e2e/train_e2e.py --stage $train_stage \ + --cmd "$cmd" \ + --feat.cmvn-opts "$cmvn_opts" \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--num_egs_diagnostic 100 --num_utts_subset 400" \ + --chain.frame-subsampling-factor 4 \ + --chain.alignment-subsampling-factor 4 \ + --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=1000" \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 2000000 \ + --trainer.num-epochs 2 \ + --trainer.optimization.momentum 0 \ + --trainer.optimization.num-jobs-initial 6 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.shrink-value 1.0 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir data/${train_set} \ + --tree-dir $treedir \ + --dir $dir || exit 1; +fi diff --git a/egs/madcat_ar/v1/local/create_line_image_from_page_image.py b/egs/madcat_ar/v1/local/create_line_image_from_page_image.py index 
ba35f8b9ace..650a0704d80 100755 --- a/egs/madcat_ar/v1/local/create_line_image_from_page_image.py +++ b/egs/madcat_ar/v1/local/create_line_image_from_page_image.py @@ -13,6 +13,7 @@ be vertically or horizontally aligned). Hence to extract line image from line bounding box, page image is rotated and line image is cropped and saved. """ +from __future__ import division import sys import argparse @@ -21,22 +22,10 @@ import numpy as np from math import atan2, cos, sin, pi, degrees, sqrt from collections import namedtuple - +import random from scipy.spatial import ConvexHull from PIL import Image from scipy.misc import toimage -import logging - -sys.path.insert(0, 'steps') -logger = logging.getLogger('libs') -logger.setLevel(logging.INFO) -handler = logging.StreamHandler() -handler.setLevel(logging.INFO) -formatter = logging.Formatter("%(asctime)s [%(pathname)s:%(lineno)s - " - "%(funcName)s - %(levelname)s ] %(message)s") -handler.setFormatter(formatter) -logger.addHandler(handler) - parser = argparse.ArgumentParser(description="Creates line images from page image", epilog="E.g. " + sys.argv[0] + " data/LDC2012T15" " data/LDC2013T09 data/LDC2013T15 data/madcat.train.raw.lineid " @@ -60,6 +49,12 @@ help='Path to the downloaded (and extracted) writing conditions file 3') parser.add_argument('--padding', type=int, default=400, help='padding across horizontal/verticle direction') +parser.add_argument('--pixel-scaling', type=int, default=30, + help='padding across horizontal/verticle direction') +parser.add_argument("--subset", type=lambda x: (str(x).lower()=='true'), default=False, + help="only processes subset of data based on writing condition") +parser.add_argument("--augment", type=lambda x: (str(x).lower()=='true'), default=False, + help="performs image augmentation") args = parser.parse_args() """ @@ -93,8 +88,8 @@ def unit_vector(pt0, pt1): (float, float): unit vector """ dis_0_to_1 = sqrt((pt0[0] - pt1[0])**2 + (pt0[1] - pt1[1])**2) - return (pt1[0] - pt0[0]) / dis_0_to_1, \ - (pt1[1] - pt0[1]) / dis_0_to_1 + return (pt1[0] - pt0[0])/ dis_0_to_1, \ + (pt1[1] - pt0[1])/ dis_0_to_1 def orthogonal_vector(vector): @@ -136,7 +131,7 @@ def bounding_area(index, hull): return {'area': len_p * len_o, 'length_parallel': len_p, 'length_orthogonal': len_o, - 'rectangle_center': (min_p + len_p / 2, min_o + len_o / 2), + 'rectangle_center': (min_p + float(len_p)/ 2, min_o + float(len_o)/ 2), 'unit_vector': unit_vector_p, } @@ -149,7 +144,7 @@ def to_xy_coordinates(unit_vector_angle, point): ------ (float, float): converted x,y coordinate of the unit vector. """ - angle_orthogonal = unit_vector_angle + pi / 2 + angle_orthogonal = unit_vector_angle + pi/ 2 return point[0] * cos(unit_vector_angle) + point[1] * cos(angle_orthogonal), \ point[0] * sin(unit_vector_angle) + point[1] * sin(angle_orthogonal) @@ -194,65 +189,6 @@ def rectangle_corners(rectangle): return rotate_points(rectangle['rectangle_center'], rectangle['unit_vector_angle'], corner_points) -def get_orientation(origin, p1, p2): - """ - Given origin and two points, return the orientation of the Point p1 with - regards to Point p2 using origin. - Returns - ------- - integer: Negative if p1 is clockwise of p2. - """ - difference = ( - ((p2[0] - origin[0]) * (p1[1] - origin[1])) - - ((p1[0] - origin[0]) * (p2[1] - origin[1])) - ) - return difference - - -def compute_hull(points): - """ - Given input list of points, return a list of points that - made up the convex hull. 
- Returns - ------- - [(float, float)]: convexhull points - """ - hull_points = [] - start = points[0] - min_x = start[0] - for p in points[1:]: - if p[0] < min_x: - min_x = p[0] - start = p - - point = start - hull_points.append(start) - - far_point = None - while far_point is not start: - p1 = None - for p in points: - if p is point: - continue - else: - p1 = p - break - - far_point = p1 - - for p2 in points: - if p2 is point or p2 is p1: - continue - else: - direction = get_orientation(point, far_point, p2) - if direction > 0: - far_point = p2 - - hull_points.append(far_point) - point = far_point - return hull_points - - def minimum_bounding_box(points): """ Given a list of 2D points, it returns the minimum area rectangle bounding all the points in the point cloud. @@ -272,7 +208,6 @@ def minimum_bounding_box(points): hull_ordered = [points[index] for index in ConvexHull(points).vertices] hull_ordered.append(hull_ordered[0]) - #hull_ordered = compute_hull(points) hull_ordered = tuple(hull_ordered) min_rectangle = bounding_area(0, hull_ordered) @@ -301,8 +236,8 @@ def get_center(im): ------- (int, int): center of the image """ - center_x = im.size[0] / 2 - center_y = im.size[1] / 2 + center_x = float(im.size[0])/ 2 + center_y = float(im.size[1])/ 2 return int(center_x), int(center_y) @@ -314,9 +249,9 @@ def get_horizontal_angle(unit_vector_angle): (float): updated angle of the unit vector to be in radians. It is only in first or fourth quadrant. """ - if unit_vector_angle > pi / 2 and unit_vector_angle <= pi: + if unit_vector_angle > pi/ 2 and unit_vector_angle <= pi: unit_vector_angle = unit_vector_angle - pi - elif unit_vector_angle > -pi and unit_vector_angle < -pi / 2: + elif unit_vector_angle > -pi and unit_vector_angle < -pi/ 2: unit_vector_angle = unit_vector_angle + pi return unit_vector_angle @@ -400,6 +335,36 @@ def update_minimum_bounding_box_input(bounding_box_input): return updated_minimum_bounding_box_input +def dilate_polygon(points, amount_increase): + """ Increases size of polygon given as a list of tuples. + Assumes points in polygon are given in CCW + """ + expanded_points = [] + for index, point in enumerate(points): + prev_point = points[(index - 1) % len(points)] + next_point = points[(index + 1) % len(points)] + prev_edge = np.subtract(point, prev_point) + next_edge = np.subtract(next_point, point) + + prev_normal = ((1 * prev_edge[1]), (-1 * prev_edge[0])) + prev_normal = np.divide(prev_normal, np.linalg.norm(prev_normal)) + next_normal = ((1 * next_edge[1]), (-1 * next_edge[0])) + next_normal = np.divide(next_normal, np.linalg.norm(next_normal)) + + bisect = np.add(prev_normal, next_normal) + bisect = np.divide(bisect, np.linalg.norm(bisect)) + + cos_theta = np.dot(next_normal, bisect) + hyp = float(amount_increase)/ cos_theta + + new_point = np.around(point + hyp * bisect) + new_point = new_point.astype(int) + new_point = new_point.tolist() + new_point = tuple(new_point) + expanded_points.append(new_point) + return expanded_points + + def set_line_image_data(image, line_id, image_file_name, image_fh): """ Given an image, saves a flipped line image. Line image file name is formed by appending the line id at the end page image name. 
@@ -438,50 +403,83 @@ def get_line_images_from_page_image(image_file_name, madcat_file_path, image_fh) word_coordinate = (int(word_node.getAttribute('x')), int(word_node.getAttribute('y'))) minimum_bounding_box_input.append(word_coordinate) updated_mbb_input = update_minimum_bounding_box_input(minimum_bounding_box_input) - bounding_box = minimum_bounding_box(updated_mbb_input) - - p1, p2, p3, p4 = bounding_box.corner_points - x1, y1 = p1 - x2, y2 = p2 - x3, y3 = p3 - x4, y4 = p4 - min_x = int(min(x1, x2, x3, x4)) - min_y = int(min(y1, y2, y3, y4)) - max_x = int(max(x1, x2, x3, x4)) - max_y = int(max(y1, y2, y3, y4)) - box = (min_x, min_y, max_x, max_y) - region_initial = im.crop(box) - rot_points = [] - p1_new = (x1 - min_x, y1 - min_y) - p2_new = (x2 - min_x, y2 - min_y) - p3_new = (x3 - min_x, y3 - min_y) - p4_new = (x4 - min_x, y4 - min_y) - rot_points.append(p1_new) - rot_points.append(p2_new) - rot_points.append(p3_new) - rot_points.append(p4_new) - - cropped_bounding_box = bounding_box_tuple(bounding_box.area, - bounding_box.length_parallel, - bounding_box.length_orthogonal, - bounding_box.length_orthogonal, - bounding_box.unit_vector, - bounding_box.unit_vector_angle, - set(rot_points) - ) - - rotation_angle_in_rad = get_smaller_angle(cropped_bounding_box) - img2 = region_initial.rotate(degrees(rotation_angle_in_rad), resample = Image.BICUBIC) - x_dash_1, y_dash_1, x_dash_2, y_dash_2, x_dash_3, y_dash_3, x_dash_4, y_dash_4 = rotated_points( + points_ordered = [updated_mbb_input[index] for index in ConvexHull(updated_mbb_input).vertices] + if args.augment: + for i in range(0, 3): + additional_pixel = random.randint(1, args.pixel_scaling) + mar = dilate_polygon(points_ordered, (i-1)*args.pixel_scaling + additional_pixel + 1) + bounding_box = minimum_bounding_box(mar) + (x1, y1), (x2, y2), (x3, y3), (x4, y4) = bounding_box.corner_points + min_x, min_y = int(min(x1, x2, x3, x4)), int(min(y1, y2, y3, y4)) + max_x, max_y = int(max(x1, x2, x3, x4)), int(max(y1, y2, y3, y4)) + box = (min_x, min_y, max_x, max_y) + region_initial = im.crop(box) + rot_points = [] + p1, p2 = (x1 - min_x, y1 - min_y), (x2 - min_x, y2 - min_y) + p3, p4 = (x3 - min_x, y3 - min_y), (x4 - min_x, y4 - min_y) + rot_points.append(p1) + rot_points.append(p2) + rot_points.append(p3) + rot_points.append(p4) + + cropped_bounding_box = bounding_box_tuple(bounding_box.area, + bounding_box.length_parallel, + bounding_box.length_orthogonal, + bounding_box.length_orthogonal, + bounding_box.unit_vector, + bounding_box.unit_vector_angle, + set(rot_points) + ) + + rotation_angle_in_rad = get_smaller_angle(cropped_bounding_box) + img2 = region_initial.rotate(degrees(rotation_angle_in_rad), resample = Image.BICUBIC) + x_dash_1, y_dash_1, x_dash_2, y_dash_2, x_dash_3, y_dash_3, x_dash_4, y_dash_4 = rotated_points( + cropped_bounding_box, get_center(region_initial)) + + min_x = int(min(x_dash_1, x_dash_2, x_dash_3, x_dash_4)) + min_y = int(min(y_dash_1, y_dash_2, y_dash_3, y_dash_4)) + max_x = int(max(x_dash_1, x_dash_2, x_dash_3, x_dash_4)) + max_y = int(max(y_dash_1, y_dash_2, y_dash_3, y_dash_4)) + box = (min_x, min_y, max_x, max_y) + region_final = img2.crop(box) + line_id = id + '_scale' + str(i) + set_line_image_data(region_final, line_id, image_file_name, image_fh) + else: + bounding_box = minimum_bounding_box(points_ordered) + (x1, y1), (x2, y2), (x3, y3), (x4, y4) = bounding_box.corner_points + min_x, min_y = int(min(x1, x2, x3, x4)), int(min(y1, y2, y3, y4)) + max_x, max_y = int(max(x1, x2, x3, x4)), int(max(y1, y2, y3, y4)) 
+ box = (min_x, min_y, max_x, max_y) + region_initial = im.crop(box) + rot_points = [] + p1, p2 = (x1 - min_x, y1 - min_y), (x2 - min_x, y2 - min_y) + p3, p4 = (x3 - min_x, y3 - min_y), (x4 - min_x, y4 - min_y) + rot_points.append(p1) + rot_points.append(p2) + rot_points.append(p3) + rot_points.append(p4) + + cropped_bounding_box = bounding_box_tuple(bounding_box.area, + bounding_box.length_parallel, + bounding_box.length_orthogonal, + bounding_box.length_orthogonal, + bounding_box.unit_vector, + bounding_box.unit_vector_angle, + set(rot_points) + ) + + rotation_angle_in_rad = get_smaller_angle(cropped_bounding_box) + img2 = region_initial.rotate(degrees(rotation_angle_in_rad), resample = Image.BICUBIC) + x_dash_1, y_dash_1, x_dash_2, y_dash_2, x_dash_3, y_dash_3, x_dash_4, y_dash_4 = rotated_points( cropped_bounding_box, get_center(region_initial)) - min_x = int(min(x_dash_1, x_dash_2, x_dash_3, x_dash_4)) - min_y = int(min(y_dash_1, y_dash_2, y_dash_3, y_dash_4)) - max_x = int(max(x_dash_1, x_dash_2, x_dash_3, x_dash_4)) - max_y = int(max(y_dash_1, y_dash_2, y_dash_3, y_dash_4)) - box = (min_x, min_y, max_x, max_y) - region_final = img2.crop(box) - set_line_image_data(region_final, id, image_file_name, image_fh) + min_x = int(min(x_dash_1, x_dash_2, x_dash_3, x_dash_4)) + min_y = int(min(y_dash_1, y_dash_2, y_dash_3, y_dash_4)) + max_x = int(max(x_dash_1, x_dash_2, x_dash_3, x_dash_4)) + max_y = int(max(y_dash_1, y_dash_2, y_dash_3, y_dash_4)) + box = (min_x, min_y, max_x, max_y) + region_final = img2.crop(box) + set_line_image_data(region_final, id, image_file_name, image_fh) def check_file_location(base_name, wc_dict1, wc_dict2, wc_dict3): @@ -535,16 +533,16 @@ def check_writing_condition(wc_dict, base_name): Returns (bool): True if writing condition matches. """ - return True - writing_condition = wc_dict[base_name].strip() - if writing_condition != 'IUC': - return False - - return True - + if args.subset: + writing_condition = wc_dict[base_name].strip() + if writing_condition != 'IUC': + return False + else: + return True + else: + return True ### main ### - def main(): wc_dict1 = parse_writing_conditions(args.writing_condition1) @@ -564,8 +562,7 @@ def main(): madcat_file_path, image_file_path, wc_dict = check_file_location(base_name, wc_dict1, wc_dict2, wc_dict3) if wc_dict is None or not check_writing_condition(wc_dict, base_name): continue - if madcat_file_path is not None: - get_line_images_from_page_image(image_file_path, madcat_file_path, image_fh) + get_line_images_from_page_image(image_file_path, madcat_file_path, image_fh) if __name__ == '__main__': diff --git a/egs/madcat_ar/v1/local/download_data.sh b/egs/madcat_ar/v1/local/download_data.sh deleted file mode 100755 index 7061be49c2a..00000000000 --- a/egs/madcat_ar/v1/local/download_data.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/bin/bash - -# Copyright 2018 Ashish Arora -# Apache 2.0 - -# This script downloads data splits for MADCAT Arabic dataset. -# It also check if madcat arabic data is present or not. - -download_dir1=/export/corpora/LDC/LDC2012T15/data -download_dir2=/export/corpora/LDC/LDC2013T09/data -download_dir3=/export/corpora/LDC/LDC2013T15/data -train_split_url=http://www.openslr.org/resources/48/madcat.train.raw.lineid -test_split_url=http://www.openslr.org/resources/48/madcat.test.raw.lineid -dev_split_url=http://www.openslr.org/resources/48/madcat.dev.raw.lineid -data_splits=data/download/data_splits - -. ./cmd.sh -. ./path.sh -. 
./utils/parse_options.sh || exit 1; - -if [ -d $data_splits ]; then - echo "$0: Not downloading the data splits as it is already there." -else - if [ ! -f $data_splits/madcat.train.raw.lineid ]; then - mkdir -p $data_splits - echo "$0: Downloading the data splits..." - wget -P $data_splits $train_split_url || exit 1; - wget -P $data_splits $test_split_url || exit 1; - wget -P $data_splits $dev_split_url || exit 1; - fi - echo "$0: Done downloading the data splits" -fi - -if [ -d $download_dir1 ]; then - echo "$0: madcat arabic data directory is present." -else - if [ ! -f $download_dir1/madcat/*.madcat.xml ]; then - echo "$0: please download madcat data..." - fi -fi diff --git a/egs/madcat_ar/v1/local/extract_features.sh b/egs/madcat_ar/v1/local/extract_features.sh index 70c5498626c..9fe588f31b8 100755 --- a/egs/madcat_ar/v1/local/extract_features.sh +++ b/egs/madcat_ar/v1/local/extract_features.sh @@ -1,10 +1,16 @@ #!/bin/bash + # Copyright 2017 Yiwen Shao # 2018 Ashish Arora +# Apache 2.0 +# This script runs the make features script in parallel. + nj=4 cmd=run.pl feat_dim=40 +augment='no_aug' +verticle_shift=0 echo "$0 $@" . ./cmd.sh @@ -30,9 +36,10 @@ done utils/split_scp.pl $scp $split_scps || exit 1; $cmd JOB=1:$nj $logdir/extract_features.JOB.log \ - local/make_features.py $logdir/images.JOB.scp \ + image/ocr/make_features.py $logdir/images.JOB.scp \ --allowed_len_file_path $data/allowed_lengths.txt \ - --feat-dim $feat_dim \| \ + --feat-dim $feat_dim --augment_type $augment \ + --vertical-shift $verticle_shift \| \ copy-feats --compress=true --compression-method=7 \ ark:- ark,scp:$featdir/images.JOB.ark,$featdir/images.JOB.scp diff --git a/egs/madcat_ar/v1/local/extract_lines.sh b/egs/madcat_ar/v1/local/extract_lines.sh index 50129ad38c9..ab87836ae3a 100755 --- a/egs/madcat_ar/v1/local/extract_lines.sh +++ b/egs/madcat_ar/v1/local/extract_lines.sh @@ -11,6 +11,8 @@ writing_condition2=/export/corpora/LDC/LDC2013T09/docs/writing_conditions.tab writing_condition3=/export/corpora/LDC/LDC2013T15/docs/writing_conditions.tab data_split_file=data/download/data_splits/madcat.dev.raw.lineid data=data/local/dev +subset=false +augment=false echo "$0 $@" . ./cmd.sh @@ -35,7 +37,7 @@ done $cmd JOB=1:$nj $log_dir/extract_lines.JOB.log \ local/create_line_image_from_page_image.py $download_dir1 $download_dir2 $download_dir3 \ $log_dir/lines.JOB.scp $data/JOB $writing_condition1 $writing_condition2 $writing_condition3 \ - || exit 1; + --subset $subset --augment $augment || exit 1; ## concatenate the .scp files together. for n in $(seq $nj); do diff --git a/egs/madcat_ar/v1/local/prepare_data.sh b/egs/madcat_ar/v1/local/prepare_data.sh index d808d736845..1049db9826d 100755 --- a/egs/madcat_ar/v1/local/prepare_data.sh +++ b/egs/madcat_ar/v1/local/prepare_data.sh @@ -5,49 +5,65 @@ # 2017 Hossein Hadian # Apache 2.0 -# This script prepares the training and test data for MADCAT Arabic dataset -# (i.e text, images.scp, utt2spk and spk2utt). It calls process_data.py. +# This script downloads the data splits for MADCAT Arabic dataset and prepares the training +# validation, and test data (i.e text, images.scp, utt2spk and spk2utt) by calling process_data.py. +# It also uses Arabic Gigaword text corpus for language modeling. # Eg. local/prepare_data.sh -# Eg. text file: LDC0001_000404_NHR_ARB_20070113.0052_11_LDC0001_00z2 ﻮﺠﻫ ﻮﻌﻘﻟ ﻍﺍﺮﻗ ﺢﺗّﻯ ﺎﻠﻨﺧﺎﻋ +# Eg. 
text file: LDC0001_000399_NHR_ARB_20070113.0052_11_LDC0001_0z11 +# وهناك تداخل بين الرأسمالية الإسرائيلية # utt2spk file: LDC0001_000397_NHR_ARB_20070113.0052_11_LDC0001_00z1 LDC0001 -# images.scp file: LDC0009_000000_arb-NG-2-76513-5612324_2_LDC0009_00z0 -# data/local/lines/1/arb-NG-2-76513-5612324_2_LDC0009_00z0.tif +# images.scp file: LDC0001_000397_NHR_ARB_20070113.0052_11_LDC0001_00z1 +# data/local/train/1/NHR_ARB_20070113.0052_11_LDC0001_00z1.png -stage=0 download_dir1=/export/corpora/LDC/LDC2012T15/data download_dir2=/export/corpora/LDC/LDC2013T09/data download_dir3=/export/corpora/LDC/LDC2013T15/data -writing_condition1=/export/corpora/LDC/LDC2012T15/docs/writing_conditions.tab -writing_condition2=/export/corpora/LDC/LDC2013T09/docs/writing_conditions.tab -writing_condition3=/export/corpora/LDC/LDC2013T15/docs/writing_conditions.tab -data_splits_dir=data/download/data_splits -images_scp_dir=data/local +train_split_url=http://www.openslr.org/resources/48/madcat.train.raw.lineid +test_split_url=http://www.openslr.org/resources/48/madcat.test.raw.lineid +dev_split_url=http://www.openslr.org/resources/48/madcat.dev.raw.lineid +data_splits=data/download/data_splits +stage=0 +download_dir=data/download +gigacorpus=data/local/gigawordcorpus +gigaword_loc=/export/corpora5/LDC/LDC2011T11 +use_extra_corpus_text=true . ./cmd.sh . ./path.sh . ./utils/parse_options.sh || exit 1; -mkdir -p data/{train,test,dev} - -if [ $stage -le 1 ]; then - echo "$0: Processing dev, train and test data..." - echo "Date: $(date)." - local/process_data.py $download_dir1 $download_dir2 $download_dir3 \ - $data_splits_dir/madcat.dev.raw.lineid data/dev $images_scp_dir/dev/images.scp \ - $writing_condition1 $writing_condition2 $writing_condition3 || exit 1 - - local/process_data.py $download_dir1 $download_dir2 $download_dir3 \ - $data_splits_dir/madcat.test.raw.lineid data/test $images_scp_dir/test/images.scp \ - $writing_condition1 $writing_condition2 $writing_condition3 || exit 1 +if [ -d $data_splits ]; then + echo "$0: Not downloading the data splits as it is already there." +else + if [ ! -f $data_splits/madcat.train.raw.lineid ]; then + mkdir -p $data_splits + echo "$0: Downloading the data splits..." + wget -P $data_splits $train_split_url || exit 1; + wget -P $data_splits $test_split_url || exit 1; + wget -P $data_splits $dev_split_url || exit 1; + fi + echo "$0: Done downloading the data splits" +fi - local/process_data.py $download_dir1 $download_dir2 $download_dir3 \ - $data_splits_dir/madcat.train.raw.lineid data/train $images_scp_dir/train/images.scp \ - $writing_condition1 $writing_condition2 $writing_condition3 || exit 1 +if [ -d $download_dir1 ]; then + echo "$0: madcat arabic data directory is present." +else + if [ ! -f $download_dir1/madcat/*.madcat.xml ]; then + echo "$0: please download madcat data..." + fi +fi - for dataset in dev test train; do - echo "$0: Fixing data directory for dataset: $dataset" - echo "Date: $(date)." - image/fix_data_dir.sh data/$dataset +mkdir -p $download_dir data/local +if $use_extra_corpus_text; then + mkdir -p $gigacorpus + cp -r $gigaword_loc/. 
$gigacorpus + for newswire in aaw_arb afp_arb ahr_arb asb_arb hyt_arb nhr_arb qds_arb umh_arb xin_arb; do + for file in $gigacorpus/arb_gw_5/data/$newswire/*.gz; do + gzip -d $file + done + for file in $gigacorpus/arb_gw_5/data/$newswire/*; do + sed -e '/^<[^>]*>$/d; s/``/"/g; s/\x27\x27/"/g' $file >> $gigacorpus/arb_gw_5/data/${newswire}_combined.txt + done done fi diff --git a/egs/madcat_ar/v1/local/prepend_words.py b/egs/madcat_ar/v1/local/prepend_words.py deleted file mode 100755 index d53eb8974bf..00000000000 --- a/egs/madcat_ar/v1/local/prepend_words.py +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -# This script, prepend '|' to every words in the transcript to mark -# the beginning of the words for finding the initial-space of every word -# after decoding. - -import sys, io - -infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') -output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') -for line in infile: - output.write(' '.join(["|" + word for word in line.split()]) + '\n') diff --git a/egs/madcat_ar/v1/local/process_data.py b/egs/madcat_ar/v1/local/process_data.py index b57500cf2fa..a39bcfa87d3 100755 --- a/egs/madcat_ar/v1/local/process_data.py +++ b/egs/madcat_ar/v1/local/process_data.py @@ -24,24 +24,28 @@ " data/LDC2013T09 data/LDC2013T15 data/madcat.train.raw.lineid " " data/train data/local/lines ", formatter_class=argparse.ArgumentDefaultsHelpFormatter) -parser.add_argument('database_path1', type=str, +parser.add_argument('database_path1', help='Path to the downloaded (and extracted) madcat data') -parser.add_argument('database_path2', type=str, +parser.add_argument('database_path2', help='Path to the downloaded (and extracted) madcat data') -parser.add_argument('database_path3', type=str, +parser.add_argument('database_path3', help='Path to the downloaded (and extracted) madcat data') -parser.add_argument('data_splits', type=str, +parser.add_argument('data_splits', help='Path to file that contains the train/test/dev split information') -parser.add_argument('out_dir', type=str, +parser.add_argument('out_dir', help='directory location to write output files.') -parser.add_argument('images_scp_path', type=str, +parser.add_argument('images_scp_path', help='Path of input images.scp file(maps line image and location)') -parser.add_argument('writing_condition1', type=str, +parser.add_argument('writing_condition1', help='Path to the downloaded (and extracted) writing conditions file 1') -parser.add_argument('writing_condition2', type=str, +parser.add_argument('writing_condition2', help='Path to the downloaded (and extracted) writing conditions file 2') -parser.add_argument('writing_condition3', type=str, +parser.add_argument('writing_condition3', help='Path to the downloaded (and extracted) writing conditions file 3') +parser.add_argument("--augment", type=lambda x: (str(x).lower()=='true'), default=False, + help="performs image augmentation") +parser.add_argument("--subset", type=lambda x: (str(x).lower()=='true'), default=False, + help="only processes subset of data based on writing condition") args = parser.parse_args() @@ -97,50 +101,42 @@ def check_writing_condition(wc_dict): Returns: (bool): True if writing condition matches. 
""" - return True - writing_condition = wc_dict[base_name].strip() - if writing_condition != 'IUC': - return False + if args.subset: + writing_condition = wc_dict[base_name].strip() + if writing_condition != 'IUC': + return False + else: + return True + else: + return True - return True - -def get_word_line_mapping(madcat_file_path): +def read_text(madcat_file_path): """ Maps every word in the page image to a corresponding line. Args: - madcat_file_path (string): complete path and name of the madcat xml file + madcat_file_path (string): complete path and name of the madcat xml file corresponding to the page image. Returns: + dict: Mapping every word in the page image to a corresponding line. """ + + word_line_dict = dict() doc = minidom.parse(madcat_file_path) zone = doc.getElementsByTagName('zone') for node in zone: line_id = node.getAttribute('id') - line_word_dict[line_id] = list() word_image = node.getElementsByTagName('token-image') for tnode in word_image: word_id = tnode.getAttribute('id') - line_word_dict[line_id].append(word_id) word_line_dict[word_id] = line_id - -def read_text(madcat_file_path): - """ Maps every word in the page image to a corresponding line. - Args: - madcat_file_path (string): complete path and name of the madcat xml file - corresponding to the page image. - Returns: - dict: Mapping every word in the page image to a corresponding line. - """ text_line_word_dict = dict() - doc = minidom.parse(madcat_file_path) segment = doc.getElementsByTagName('segment') for node in segment: token = node.getElementsByTagName('token') for tnode in token: ref_word_id = tnode.getAttribute('ref_id') word = tnode.getElementsByTagName('source')[0].firstChild.nodeValue - word = unicodedata.normalize('NFKC',word) ref_line_id = word_line_dict[ref_word_id] if ref_line_id not in text_line_word_dict: text_line_word_dict[ref_line_id] = list() @@ -160,7 +156,6 @@ def get_line_image_location(): ### main ### - print("Processing '{}' data...".format(args.out_dir)) text_file = os.path.join(args.out_dir, 'text') @@ -188,23 +183,34 @@ def get_line_image_location(): madcat_xml_path, image_file_path, wc_dict = check_file_location() if wc_dict is None or not check_writing_condition(wc_dict): continue - if madcat_xml_path is not None: - madcat_doc = minidom.parse(madcat_xml_path) - writer = madcat_doc.getElementsByTagName('writer') - writer_id = writer[0].getAttribute('id') - line_word_dict = dict() - word_line_dict = dict() - get_word_line_mapping(madcat_xml_path) - text_line_word_dict = read_text(madcat_xml_path) - base_name = os.path.basename(image_file_path) - base_name, b = base_name.split('.tif') - for lineID in sorted(text_line_word_dict): - updated_base_name = base_name + '_' + str(lineID).zfill(4) +'.png' + madcat_doc = minidom.parse(madcat_xml_path) + writer = madcat_doc.getElementsByTagName('writer') + writer_id = writer[0].getAttribute('id') + text_line_word_dict = read_text(madcat_xml_path) + base_name = os.path.basename(image_file_path).split('.tif')[0] + for line_id in sorted(text_line_word_dict): + if args.augment: + key = (line_id + '.')[:-1] + for i in range(0, 3): + location_id = "_{}_scale{}".format(line_id, i) + line_image_file_name = base_name + location_id + '.png' + location = image_loc_dict[line_image_file_name] + image_file_path = os.path.join(location, line_image_file_name) + line = text_line_word_dict[key] + text = ' '.join(line) + base_line_image_file_name = line_image_file_name.split('.png')[0] + utt_id = "{}_{}_{}".format(writer_id, str(image_num).zfill(6), 
base_line_image_file_name) + text_fh.write(utt_id + ' ' + text + '\n') + utt2spk_fh.write(utt_id + ' ' + writer_id + '\n') + image_fh.write(utt_id + ' ' + image_file_path + '\n') + image_num += 1 + else: + updated_base_name = "{}_{}.png".format(base_name, str(line_id).zfill(4)) location = image_loc_dict[updated_base_name] image_file_path = os.path.join(location, updated_base_name) - line = text_line_word_dict[lineID] + line = text_line_word_dict[line_id] text = ' '.join(line) - utt_id = writer_id + '_' + str(image_num).zfill(6) + '_' + base_name + '_' + str(lineID).zfill(4) + utt_id = "{}_{}_{}_{}".format(writer_id, str(image_num).zfill(6), base_name, str(line_id).zfill(4)) text_fh.write(utt_id + ' ' + text + '\n') utt2spk_fh.write(utt_id + ' ' + writer_id + '\n') image_fh.write(utt_id + ' ' + image_file_path + '\n') diff --git a/egs/madcat_ar/v1/local/score.sh b/egs/madcat_ar/v1/local/score.sh index 2c11aba3e13..31564d25326 100755 --- a/egs/madcat_ar/v1/local/score.sh +++ b/egs/madcat_ar/v1/local/score.sh @@ -1,5 +1,5 @@ #!/bin/bash -steps/scoring/score_kaldi_wer.sh --word_ins_penalty 0.0,0.5,1.0,1.5,2.0,2.5,3.0,3.5,4.0,4.5,5.0,5.5,6.0,6.5,7.0 "$@" -steps/scoring/score_kaldi_cer.sh --stage 2 --word_ins_penalty 0.0,0.5,1.0,1.5,2.0,2.5,3.0,3.5,4.0,4.5,5.0,5.5,6.0,6.5,7.0 "$@" +steps/scoring/score_kaldi_wer.sh "$@" +steps/scoring/score_kaldi_cer.sh --stage 2 "$@" diff --git a/egs/madcat_ar/v1/local/tl/augment_data.sh b/egs/madcat_ar/v1/local/tl/augment_data.sh new file mode 100755 index 00000000000..cc44aa58a62 --- /dev/null +++ b/egs/madcat_ar/v1/local/tl/augment_data.sh @@ -0,0 +1,36 @@ +#!/bin/bash +# Copyright 2018 Hossein Hadian +# 2018 Ashish Arora + +# Apache 2.0 +# This script performs data augmentation. + +nj=4 +cmd=run.pl +feat_dim=40 +verticle_shift=0 +echo "$0 $@" + +. ./cmd.sh +. ./path.sh +. 
./utils/parse_options.sh || exit 1; + +srcdir=$1 +outdir=$2 +datadir=$3 +aug_set=aug1 +mkdir -p $datadir/augmentations +echo "copying $srcdir to $datadir/augmentations/$aug_set, allowed length, creating feats.scp" + +for set in $aug_set; do + image/copy_data_dir.sh --spk-prefix $set- --utt-prefix $set- \ + $srcdir $datadir/augmentations/$set + cat $srcdir/allowed_lengths.txt > $datadir/augmentations/$set/allowed_lengths.txt + local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim $feat_dim \ + --vertical-shift $verticle_shift \ + --augment 'random_shift' $datadir/augmentations/$set +done + +echo " combine original data and data from different augmentations" +utils/combine_data.sh --extra-files images.scp $outdir $srcdir $datadir/augmentations/$aug_set +cat $srcdir/allowed_lengths.txt > $outdir/allowed_lengths.txt diff --git a/egs/madcat_ar/v1/local/tl/chain/run_cnn_e2eali.sh b/egs/madcat_ar/v1/local/tl/chain/run_cnn_e2eali.sh new file mode 100755 index 00000000000..ccbb7119674 --- /dev/null +++ b/egs/madcat_ar/v1/local/tl/chain/run_cnn_e2eali.sh @@ -0,0 +1,229 @@ +#!/bin/bash + +# ./local/chain/compare_wer.sh exp/chain/cnn_e2eali_1a/ +# System cnn_e2eali_1a +# WER 16.78 +# CER 5.22 +# Final train prob -0.1189 +# Final valid prob -0.1319 +# Final train prob (xent) -0.6395 +# Final valid prob (xent) -0.6732 +# Parameters 3.73M + +# steps/info/chain_dir_info.pl exp/chain/cnn_e2eali_1a/ +# exp/chain/cnn_e2eali_1a/: num-iters=24 nj=3..15 num-params=3.7M dim=56->392 combine=-0.125->-0.125 (over 1) xent:train/valid[15,23,final]=(-0.850,-1.24,-0.640/-0.901,-1.31,-0.673) logprob:train/valid[15,23,final]=(-0.149,-0.209,-0.119/-0.166,-0.229,-0.132) +set -e -o pipefail + +stage=0 + +nj=30 +train_set=train +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +# training chunk-options +chunk_width=340,300,200,100 +num_leaves=500 +tdnn_dim=450 +srand=0 +remove_egs=true +lang_decode=data/lang +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --acoustic-scale 1.0 \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + ${train_data_dir} data/lang $e2echain_model_dir $lat_dir + echo "" >$lat_dir/splice_opts + +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
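+  # (Illustrative only; names follow the variables above and nothing here is required by the recipe.)
+  # A minimal sketch of how the alignment and tree directories produced in stages 2-3 are usually inspected:
+  #   tree-info $tree_dir/tree | grep num-pdfs      # pdf count that the xconfig in stage 4 must match
+  #   steps/info/chain_dir_info.pl $dir             # summary line like the one quoted at the top of this script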
+ exit 1; + fi + + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 4 \ + --alignment-subsampling-factor 1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves ${train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + cnn_opts="l2-regularize=0.075" + tdnn_opts="l2-regularize=0.075" + output_opts="l2-regularize=0.1" + common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=56 name=input + conv-relu-batchnorm-layer name=cnn1 height-in=56 height-out=56 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=56 height-out=28 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=28 height-out=28 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=28 height-out=28 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=28 height-out=14 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 height-in=14 height-out=14 time-offsets=-4,0,4 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=14 height-out=14 time-offsets=-4,0,4 $common3 + relu-batchnorm-layer name=tdnn1 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=1000" \ + --chain.frame-subsampling-factor=4 \ + --chain.alignment-subsampling-factor=1 \ + --chain.left-tolerance 3 \ + --chain.right-tolerance 3 \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=2 \ + --trainer.frames-per-iter=2000000 \ + --trainer.optimization.num-jobs-initial=3 \ + --trainer.optimization.num-jobs-final=16 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=64,32 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0 --constrained false" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 $lang_decode \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/test $dir/decode_test || exit 1; +fi + +echo "Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/madcat_ar/v1/local/chain/run_flatstart_cnn1a.sh b/egs/madcat_ar/v1/local/tl/chain/run_e2e_cnn.sh similarity index 78% rename from egs/madcat_ar/v1/local/chain/run_flatstart_cnn1a.sh rename to egs/madcat_ar/v1/local/tl/chain/run_e2e_cnn.sh index 2c85e982ce6..3fca8cf5fdc 100755 --- a/egs/madcat_ar/v1/local/chain/run_flatstart_cnn1a.sh +++ b/egs/madcat_ar/v1/local/tl/chain/run_e2e_cnn.sh @@ -3,40 +3,37 @@ # This script does end2end chain training (i.e. 
from scratch) -# local/chain/compare_wer.sh exp/chain/e2e_cnn_1a +# ./local/chain/compare_wer.sh exp/chain/e2e_cnn_1a/ # System e2e_cnn_1a -# WER 10.71 -# CER 2.85 -# Final train prob -0.0859 -# Final valid prob -0.1266 +# WER 19.30 +# CER 5.72 +# Final train prob -0.0734 +# Final valid prob -0.0607 # Final train prob (xent) # Final valid prob (xent) -# Parameters 2.94M +# Parameters 3.30M # steps/info/chain_dir_info.pl exp/chain/e2e_cnn_1a/ -# exp/chain/e2e_cnn_1a/: num-iters=195 nj=6..16 num-params=2.9M dim=40->324 combine=-0.065->-0.064 (over 5) logprob:train/valid[129,194,final]=(-0.078,-0.077,-0.086/-0.129,-0.126,-0.127) +# exp/chain/e2e_cnn_1a/: num-iters=24 nj=3..15 num-params=3.3M dim=56->292 combine=-0.060->-0.060 (over 1) logprob:train/valid[15,23,final]=(-0.122,-0.143,-0.073/-0.105,-0.132,-0.061) set -e + # configs for 'chain' stage=0 -nj=70 +nj=30 train_stage=-10 get_egs_stage=-10 affix=1a # training options tdnn_dim=450 -num_epochs=2 -num_jobs_initial=6 -num_jobs_final=16 -minibatch_size=150=128,64/300=128,64/600=64,32/1200=32,16 +minibatch_size=150=64,32/300=32,16/600=16,8/1200=8,4 common_egs_dir= -l2_regularize=0.00005 frames_per_iter=1000000 -cmvn_opts="--norm-means=true --norm-vars=true" +cmvn_opts="--norm-means=false --norm-vars=false" train_set=train -lang_test=lang_test +lang_decode=data/lang # End configuration section. echo "$0 $@" # Print the command line for logging @@ -89,16 +86,17 @@ if [ $stage -le 2 ]; then common1="required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" common2="required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" common3="required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs cat < $dir/configs/network.xconfig - input dim=40 name=input - conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 - conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 - conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 - conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 - conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 - conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 - conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + input dim=56 name=input + conv-relu-batchnorm-layer name=cnn1 height-in=56 height-out=56 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=56 height-out=28 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=28 height-out=28 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=28 height-out=28 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=28 height-out=14 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 height-in=14 height-out=14 time-offsets=-4,0,4 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=14 height-out=14 time-offsets=-4,0,4 $common3 relu-batchnorm-layer name=tdnn1 input=Append(-4,0,4) dim=$tdnn_dim relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim @@ -118,20 +116,21 @@ if [ $stage -le 3 ]; then --cmd "$cmd" \ --feat.cmvn-opts 
"$cmvn_opts" \ --chain.leaky-hmm-coefficient 0.1 \ - --chain.l2-regularize $l2_regularize \ + --chain.l2-regularize 0.00005 \ --chain.apply-deriv-weights false \ --egs.dir "$common_egs_dir" \ --egs.stage $get_egs_stage \ --egs.opts "--num_egs_diagnostic 100 --num_utts_subset 400" \ --chain.frame-subsampling-factor 4 \ --chain.alignment-subsampling-factor 4 \ + --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=1000" \ --trainer.add-option="--optimization.memory-compression-level=2" \ --trainer.num-chunk-per-minibatch $minibatch_size \ - --trainer.frames-per-iter $frames_per_iter \ - --trainer.num-epochs $num_epochs \ + --trainer.frames-per-iter 2000000 \ + --trainer.num-epochs 2 \ --trainer.optimization.momentum 0 \ - --trainer.optimization.num-jobs-initial $num_jobs_initial \ - --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ --trainer.optimization.initial-effective-lrate 0.001 \ --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.optimization.shrink-value 1.0 \ @@ -151,7 +150,7 @@ if [ $stage -le 4 ]; then # as long as phones.txt was compatible. utils/mkgraph.sh \ - --self-loop-scale 1.0 data/$lang_test \ + --self-loop-scale 1.0 $lang_decode \ $dir $dir/graph || exit 1; fi diff --git a/egs/madcat_ar/v1/local/tl/process_waldo_data.py b/egs/madcat_ar/v1/local/tl/process_waldo_data.py new file mode 100755 index 00000000000..0d278e64122 --- /dev/null +++ b/egs/madcat_ar/v1/local/tl/process_waldo_data.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python3 + +""" This script reads image and transcription mapping and creates the following files :text, utt2spk, images.scp. + Eg. local/process_waldo_data.py lines/hyp_line_image_transcription_mapping_kaldi.txt data/test + Eg. text file: LDC0001_000404_NHR_ARB_20070113.0052_11_LDC0001_00z2 ﻮﺠﻫ ﻮﻌﻘﻟ ﻍﺍﺮﻗ ﺢﺗّﻯ ﺎﻠﻨﺧﺎﻋ + utt2spk file: LDC0001_000397_NHR_ARB_20070113.0052_11_LDC0001_00z1 LDC0001 + images.scp file: LDC0009_000000_arb-NG-2-76513-5612324_2_LDC0009_00z0 + data/local/lines/1/arb-NG-2-76513-5612324_2_LDC0009_00z0.tif +""" + +import argparse +import os +import sys + +parser = argparse.ArgumentParser(description="Creates text, utt2spk and images.scp files", + epilog="E.g. " + sys.argv[0] + " data/train data/local/lines ", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) +parser.add_argument('image_transcription_file', type=str, + help='Path to the file containing line image path and transcription information') +parser.add_argument('out_dir', type=str, + help='directory location to write output files.') +args = parser.parse_args() + + +def read_image_text(image_text_path): + """ Given the file path containing, mapping information of line image + and transcription, it returns a dict. The dict contains this mapping + info. It can be accessed via line_id and will provide transcription. 
+ Returns: + -------- + dict: line_id and transcription mapping + """ + image_transcription_dict = dict() + with open(image_text_path, encoding='utf-8') as f: + for line in f: + line_vect = line.strip().split(' ') + image_path = line_vect[0] + line_id = os.path.basename(image_path).split('.png')[0] + transcription = line_vect[1:] + joined_transcription = list() + for word in transcription: + joined_transcription.append(word) + joined_transcription = " ".join(joined_transcription) + image_transcription_dict[line_id] = joined_transcription + return image_transcription_dict + + +### main ### +print("Processing '{}' data...".format(args.out_dir)) +text_file = os.path.join(args.out_dir, 'text') +text_fh = open(text_file, 'w', encoding='utf-8') +utt2spk_file = os.path.join(args.out_dir, 'utt2spk') +utt2spk_fh = open(utt2spk_file, 'w', encoding='utf-8') +image_file = os.path.join(args.out_dir, 'images.scp') +image_fh = open(image_file, 'w', encoding='utf-8') + +image_transcription_dict = read_image_text(args.image_transcription_file) +for line_id in sorted(image_transcription_dict.keys()): + writer_id = line_id.strip().split('_')[-3] + updated_line_id = line_id + '.png' + image_file_path = os.path.join('lines', updated_line_id) + text = image_transcription_dict[line_id] + utt_id = line_id + text_fh.write(utt_id + ' ' + text + '\n') + utt2spk_fh.write(utt_id + ' ' + writer_id + '\n') + image_fh.write(utt_id + ' ' + image_file_path + '\n') + diff --git a/egs/madcat_ar/v1/local/tl/run_text_localization.sh b/egs/madcat_ar/v1/local/tl/run_text_localization.sh new file mode 100755 index 00000000000..8d12f7d802f --- /dev/null +++ b/egs/madcat_ar/v1/local/tl/run_text_localization.sh @@ -0,0 +1,143 @@ +#!/bin/bash +# Copyright 2017 Hossein Hadian +# 2018 Ashish Arora + +# This script performs full page text recognition on automatically extracted line images +# from madcat arabic data. It is created as a separate script because it performs +# data augmentation, uses a smaller language model and calls process_waldo_data for +# test images (automatically extracted line images). Data augmentation increases the image +# height and hence requires a different DNN architecture and different chain scripts. + +set -e +stage=0 +nj=70 +# download_dir{1,2,3} points to the database path on the JHU grid. If you have not +# already downloaded the database you can set it to a local directory +# This corpus can be purchased here: +# https://catalog.ldc.upenn.edu/{LDC2012T15,LDC2013T09/,LDC2013T15/} +download_dir1=/export/corpora/LDC/LDC2012T15/data +download_dir2=/export/corpora/LDC/LDC2013T09/data +download_dir3=/export/corpora/LDC/LDC2013T15/data +writing_condition1=/export/corpora/LDC/LDC2012T15/docs/writing_conditions.tab +writing_condition2=/export/corpora/LDC/LDC2013T09/docs/writing_conditions.tab +writing_condition3=/export/corpora/LDC/LDC2013T15/docs/writing_conditions.tab +data_splits_dir=data/download/data_splits +images_scp_dir=data/local +overwrite=false +subset=true +augment=true +verticle_shift=16 +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. +. ./path.sh +. ./utils/parse_options.sh # e.g. this parses the above options + # if supplied. +./local/check_tools.sh + +mkdir -p data/{train,test,dev}/data +mkdir -p data/local/{train,test,dev} +if [ $stage -le 0 ]; then + + if [ -f data/train/text ] && !
$overwrite; then + echo "$0: Not processing, probably script have run from wrong stage" + echo "Exiting with status 1 to avoid data corruption" + exit 1; + fi + echo "$0: Downloading data splits...$(date)" + local/download_data.sh --data_splits $data_splits_dir --download_dir1 $download_dir1 \ + --download_dir2 $download_dir2 --download_dir3 $download_dir3 + + for set in train dev; do + data_split_file=$data_splits_dir/madcat.$set.raw.lineid + local/extract_lines.sh --nj $nj --cmd $cmd --data_split_file $data_split_file \ + --download_dir1 $download_dir1 --download_dir2 $download_dir2 \ + --download_dir3 $download_dir3 --writing_condition1 $writing_condition1 \ + --writing_condition2 $writing_condition2 --writing_condition3 $writing_condition3 \ + --data data/local/$set --subset $subset --augment $augment || exit 1 + done + + echo "$0: Preparing data..." + for set in dev train; do + local/process_data.py $download_dir1 $download_dir2 $download_dir3 \ + $data_splits_dir/madcat.$set.raw.lineid data/$set $images_scp_dir/$set/images.scp \ + $writing_condition1 $writing_condition2 $writing_condition3 --augment $augment --subset $subset + image/fix_data_dir.sh data/${set} + done + + local/tl/process_waldo_data.py lines/hyp_line_image_transcription_mapping_kaldi.txt data/test + utils/utt2spk_to_spk2utt.pl data/test/utt2spk > data/test/spk2utt +fi + +if [ $stage -le 1 ]; then + echo "$0: Obtaining image groups. calling get_image2num_frames $(date)." + image/get_image2num_frames.py data/train + image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train + for set in dev train test; do + echo "$0: Extracting features and calling compute_cmvn_stats for dataset: $set. $(date)" + local/extract_features.sh --nj $nj --cmd $cmd --feat-dim 40 \ + --verticle_shift $verticle_shift data/$set + steps/compute_cmvn_stats.sh data/$set || exit 1; + done + echo "$0: Fixing data directory for train dataset $(date)." + image/fix_data_dir.sh data/train +fi + +if [ $stage -le 2 ]; then + for set in train; do + echo "$(date) stage 2: Performing augmentation, it will double training data" + local/tl/augment_data.sh --nj $nj --cmd "$cmd" --feat-dim 40 \ + --verticle_shift $verticle_shift data/${set} data/${set}_aug data + steps/compute_cmvn_stats.sh data/${set}_aug || exit 1; + done +fi + +if [ $stage -le 3 ]; then + echo "$0: Preparing BPE..." + cut -d' ' -f2- data/train/text | utils/lang/bpe/reverse.py | \ + utils/lang/bpe/prepend_words.py | \ + utils/lang/bpe/learn_bpe.py -s 700 > data/local/bpe.txt + + for set in test train dev train_aug; do + cut -d' ' -f1 data/$set/text > data/$set/ids + cut -d' ' -f2- data/$set/text | utils/lang/bpe/reverse.py | \ + utils/lang/bpe/prepend_words.py | \ + utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ + | sed 's/@@//g' > data/$set/bpe_text + + mv data/$set/text data/$set/text.old + paste -d' ' data/$set/ids data/$set/bpe_text > data/$set/text + rm -f data/$set/bpe_text data/$set/ids + done + + echo "$0:Preparing dictionary and lang..." + local/prepare_dict.sh + utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.0 --position-dependent-phones false \ + data/local/dict "" data/lang/temp data/lang + utils/lang/bpe/add_final_optional_silence.sh --final-sil-prob 0.5 data/lang +fi + +if [ $stage -le 4 ]; then + echo "$0: Estimating a language model for decoding..." 
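+  # (Illustrative sketch, following the two commands below.) local/tl/train_lm.sh leaves its ARPA
+  # under data/local/local_lm/data/arpa/, and utils/format_lm.sh compiles it into G.fst in the lang
+  # directory given as its last argument; a quick sanity check of the ARPA is e.g.
+  #   gunzip -c data/local/local_lm/data/arpa/3gram_unpruned.arpa.gz | head   # shows the \data\ header and n-gram counts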
+ local/tl/train_lm.sh --order 3 + utils/format_lm.sh data/lang data/local/local_lm/data/arpa/3gram_unpruned.arpa.gz \ + data/local/dict/lexicon.txt data/lang +fi + +nj=30 +if [ $stage -le 5 ]; then + echo "$0: Calling the flat-start chain recipe... $(date)." + local/tl/chain/run_e2e_cnn.sh --nj $nj --train_set train_aug +fi + +if [ $stage -le 6 ]; then + echo "$0: Aligning the training data using the e2e chain model...$(date)." + steps/nnet3/align.sh --nj $nj --cmd "$cmd" \ + --use-gpu false \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0 --acoustic-scale=1.0' \ + data/train_aug data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train +fi + +if [ $stage -le 7 ]; then + echo "$0: Building a tree and training a regular chain model using the e2e alignments...$(date)" + local/tl/chain/run_cnn_e2eali.sh --nj $nj --train_set train_aug +fi diff --git a/egs/madcat_ar/v1/local/tl/train_lm.sh b/egs/madcat_ar/v1/local/tl/train_lm.sh new file mode 100755 index 00000000000..524bb2e9f40 --- /dev/null +++ b/egs/madcat_ar/v1/local/tl/train_lm.sh @@ -0,0 +1,102 @@ +#!/bin/bash + +# Copyright 2016 Vincent Nguyen +# 2016 Johns Hopkins University (author: Daniel Povey) +# 2017 Ashish Arora +# 2017 Hossein Hadian +# Apache 2.0 +# +# This script trains a LM on the training transcriptions. +# It is based on the example scripts distributed with PocoLM + +# It will check if pocolm is installed and if not will proceed with installation + +set -e +stage=0 +dir=data/local/local_lm +order=3 +echo "$0 $@" # Print the command line for logging +. ./utils/parse_options.sh || exit 1; + +lm_dir=${dir}/data + + +mkdir -p $dir +. ./path.sh || exit 1; # for KALDI_ROOT +export PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH +( # First make sure the pocolm toolkit is installed. + cd $KALDI_ROOT/tools || exit 1; + if [ -d pocolm ]; then + echo Not installing the pocolm toolkit since it is already there. + else + echo "$0: Please install the PocoLM toolkit with: " + echo " cd ../../../tools; extras/install_pocolm.sh; cd -" + exit 1; + fi +) || exit 1; + +bypass_metaparam_optim_opt= +# If you want to bypass the metaparameter optimization steps with specific metaparameters +# un-comment the following line, and change the numbers to some appropriate values. +# You can find the values from output log of train_lm.py. +# These example numbers of metaparameters is for 4-gram model (with min-counts) +# running with train_lm.py. +# The dev perplexity should be close to the non-bypassed model. +# Note: to use these example parameters, you may need to remove the .done files +# to make sure the make_lm_dir.py be called and tain only 3-gram model +#for order in 3; do +#rm -f ${lm_dir}/${num_word}_${order}.pocolm/.done +if [ $stage -le 0 ]; then + mkdir -p ${dir}/data + mkdir -p ${dir}/data/text + + echo "$0: Getting the Data sources" + + rm ${dir}/data/text/* 2>/dev/null || true + + # use the validation data as the dev set. + # Note: the name 'dev' is treated specially by pocolm, it automatically + # becomes the dev set. + + cat data/dev/text | cut -d " " -f 2- > ${dir}/data/text/dev.txt + + # use the training data as an additional data source. + # we can later fold the dev data into this. + cat data/train/text | cut -d " " -f 2- > ${dir}/data/text/train.txt + + # for reporting perplexities, we'll use the "real" dev set. + # (the validation data is used as ${dir}/data/text/dev.txt to work + # out interpolation weights.) + # note, we can't put it in ${dir}/data/text/, because then pocolm would use + # it as one of the data sources. 
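+  # (Illustrative layout, following the comments above.) pocolm treats dev.txt as the held-out set
+  # and every other file under ${dir}/data/text/ as a training source, so this stage ends up with:
+  #   ${dir}/data/text/dev.txt         <- held-out set, taken from data/dev/text
+  #   ${dir}/data/text/train.txt       <- training source, taken from data/train/text
+  #   ${dir}/data/real_dev_set.txt     <- kept outside data/text/ and used only for perplexity reporting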
+ cut -d " " -f 2- < data/test/text > ${dir}/data/real_dev_set.txt + + # get the wordlist from MADCAT text + cat ${dir}/data/text/train.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count + cat ${dir}/data/word_count | awk '{print $2}' > ${dir}/data/wordlist +fi + +if [ $stage -le 1 ]; then + # decide on the vocabulary. + # Note: you'd use --wordlist if you had a previously determined word-list + # that you wanted to use. + # Note: if you have more than one order, use a certain amount of words as the + # vocab and want to restrict max memory for 'sort', + echo "$0: training the unpruned LM" + min_counts='train=1' + wordlist=${dir}/data/wordlist + + lm_name="`basename ${wordlist}`_${order}" + if [ -n "${min_counts}" ]; then + lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`" + fi + unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm + train_lm.py --wordlist=${wordlist} --num-splits=20 --warm-start-ratio=20 \ + --limit-unk-history=true \ + ${bypass_metaparam_optim_opt} \ + ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir} + + get_data_prob.py ${dir}/data/real_dev_set.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity' + mkdir -p ${dir}/data/arpa + format_arpa_lm.py ${unpruned_lm_dir} | gzip -c > ${dir}/data/arpa/${order}gram_unpruned.arpa.gz +fi diff --git a/egs/madcat_ar/v1/local/train_lm.sh b/egs/madcat_ar/v1/local/train_lm.sh index 3b8a382cb00..903b288a834 100755 --- a/egs/madcat_ar/v1/local/train_lm.sh +++ b/egs/madcat_ar/v1/local/train_lm.sh @@ -6,20 +6,19 @@ # 2017 Hossein Hadian # Apache 2.0 # -# This script trains a LM on the MADCAT training transcriptions. +# This script trains a LM on the training transcriptions. # It is based on the example scripts distributed with PocoLM # It will check if pocolm is installed and if not will proceed with installation set -e stage=0 - +dir=data/local/local_lm +order=6 echo "$0 $@" # Print the command line for logging . ./utils/parse_options.sh || exit 1; -dir=data/local/local_lm lm_dir=${dir}/data -segments=data/train/segmented_words mkdir -p $dir @@ -43,12 +42,10 @@ bypass_metaparam_optim_opt= # These example numbers of metaparameters is for 4-gram model (with min-counts) # running with train_lm.py. # The dev perplexity should be close to the non-bypassed model. -#bypass_metaparam_optim_opt= # Note: to use these example parameters, you may need to remove the .done files # to make sure the make_lm_dir.py be called and tain only 3-gram model #for order in 3; do #rm -f ${lm_dir}/${num_word}_${order}.pocolm/.done - if [ $stage -le 0 ]; then mkdir -p ${dir}/data mkdir -p ${dir}/data/text @@ -65,7 +62,13 @@ if [ $stage -le 0 ]; then # use the training data as an additional data source. # we can later fold the dev data into this. - cat data/train/text | cut -d " " -f 2- > ${dir}/data/text/madcat.txt + cat data/train/text | cut -d " " -f 2- > ${dir}/data/text/train.txt + + if [ -d "data/local/gigawordcorpus/arb_gw_5/data" ]; then + cat data/local/gigawordcorpus/arb_gw_5/data/nhr_arb_combined.txt | \ + utils/lang/bpe/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ + | sed 's/@@//g' > ${dir}/data/text/corpus_text.txt + fi # for reporting perplexities, we'll use the "real" dev set. 
# (the validation data is used as ${dir}/data/text/dev.txt to work @@ -75,12 +78,10 @@ if [ $stage -le 0 ]; then cut -d " " -f 2- < data/test/text > ${dir}/data/real_dev_set.txt # get the wordlist from MADCAT text - cat ${dir}/data/text/madcat.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count + cat ${dir}/data/text/{train,corpus_text}.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count cat ${dir}/data/word_count | awk '{print $2}' > ${dir}/data/wordlist fi -order=3 - if [ $stage -le 1 ]; then # decide on the vocabulary. # Note: you'd use --wordlist if you had a previously determined word-list @@ -88,7 +89,7 @@ if [ $stage -le 1 ]; then # Note: if you have more than one order, use a certain amount of words as the # vocab and want to restrict max memory for 'sort', echo "$0: training the unpruned LM" - min_counts='train=2 madcat=1' + min_counts='corpus_text=2 train=1' wordlist=${dir}/data/wordlist lm_name="`basename ${wordlist}`_${order}" @@ -96,13 +97,34 @@ if [ $stage -le 1 ]; then lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`" fi unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm - train_lm.py --wordlist=${wordlist} --num-splits=5 --warm-start-ratio=1 \ + train_lm.py --wordlist=${wordlist} --num-splits=20 --warm-start-ratio=20 \ --limit-unk-history=true \ ${bypass_metaparam_optim_opt} \ ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir} get_data_prob.py ${dir}/data/real_dev_set.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity' - mkdir -p ${dir}/data/arpa format_arpa_lm.py ${unpruned_lm_dir} | gzip -c > ${dir}/data/arpa/${order}gram_unpruned.arpa.gz fi + +if [ $stage -le 2 ]; then + echo "$0: pruning the LM (to larger size)" + # Using 20 million n-grams for a big LM for rescoring purposes. + size=20000000 + prune_lm_dir.py --target-num-ngrams=$size --initial-threshold=0.02 ${unpruned_lm_dir} ${dir}/data/lm_${order}_prune_big + + get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_big 2>&1 | grep -F '[perplexity' + mkdir -p ${dir}/data/arpa + format_arpa_lm.py ${dir}/data/lm_${order}_prune_big | gzip -c > ${dir}/data/arpa/${order}gram_big.arpa.gz +fi + +if [ $stage -le 3 ]; then + echo "$0: pruning the LM (to smaller size)" + # Using 10 million n-grams for a smaller LM for graph building. Prune from the + # bigger-pruned LM, it'll be faster. + size=10000000 + prune_lm_dir.py --target-num-ngrams=$size ${dir}/data/lm_${order}_prune_big ${dir}/data/lm_${order}_prune_small + + get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_small 2>&1 | grep -F '[perplexity' + format_arpa_lm.py ${dir}/data/lm_${order}_prune_small | gzip -c > ${dir}/data/arpa/${order}gram_small.arpa.gz +fi diff --git a/egs/madcat_ar/v1/local/wer_output_filter b/egs/madcat_ar/v1/local/wer_output_filter index c0f03e7178a..d6d46f3f565 100755 --- a/egs/madcat_ar/v1/local/wer_output_filter +++ b/egs/madcat_ar/v1/local/wer_output_filter @@ -2,6 +2,9 @@ # Copyright 2012-2014 Johns Hopkins University (Author: Yenda Trmal) # Apache 2.0 +# This script converts a BPE-encoded text to normal text and performs normalization. +# It is used in scoring. + use utf8; use open qw(:encoding(utf8)); diff --git a/egs/madcat_ar/v1/run.sh b/egs/madcat_ar/v1/run.sh index 14c8bf7a6ce..01bfdbed543 100755 --- a/egs/madcat_ar/v1/run.sh +++ b/egs/madcat_ar/v1/run.sh @@ -11,9 +11,7 @@ decode_gmm=false # download_dir{1,2,3} points to the database path on the JHU grid. 
If you have not # already downloaded the database you can set it to a local directory # This corpus can be purchased here: -# https://catalog.ldc.upenn.edu/LDC2012T15, -# https://catalog.ldc.upenn.edu/LDC2013T09/, -# https://catalog.ldc.upenn.edu/LDC2013T15/. +# https://catalog.ldc.upenn.edu/{LDC2012T15,LDC2013T09/,LDC2013T15/} download_dir1=/export/corpora/LDC/LDC2012T15/data download_dir2=/export/corpora/LDC/LDC2013T09/data download_dir3=/export/corpora/LDC/LDC2013T15/data @@ -21,47 +19,50 @@ writing_condition1=/export/corpora/LDC/LDC2012T15/docs/writing_conditions.tab writing_condition2=/export/corpora/LDC/LDC2013T09/docs/writing_conditions.tab writing_condition3=/export/corpora/LDC/LDC2013T15/docs/writing_conditions.tab data_splits_dir=data/download/data_splits - +images_scp_dir=data/local +overwrite=false +subset=false +augment=false +use_extra_corpus_text=true . ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. ## This relates to the queue. . ./path.sh . ./utils/parse_options.sh # e.g. this parses the above options # if supplied. - ./local/check_tools.sh - mkdir -p data/{train,test,dev}/data mkdir -p data/local/{train,test,dev} if [ $stage -le 0 ]; then - echo "$0: Downloading data splits..." - echo "Date: $(date)." - local/download_data.sh --data_splits $data_splits_dir --download_dir1 $download_dir1 \ - --download_dir2 $download_dir2 --download_dir3 $download_dir3 -fi - -if [ $stage -le 1 ]; then - for dataset in test train dev; do - data_split_file=$data_splits_dir/madcat.$dataset.raw.lineid + if [ -f data/train/text ] && ! $overwrite; then + echo "$0: Not processing, probably script have run from wrong stage" + echo "Exiting with status 1 to avoid data corruption" + exit 1; + fi + local/prepare_data.sh --data_splits $data_splits_dir --download_dir1 $download_dir1 \ + --download_dir2 $download_dir2 --download_dir3 $download_dir3 \ + --use_extra_corpus_text $use_extra_corpus_text + + for set in test train dev; do + data_split_file=$data_splits_dir/madcat.$set.raw.lineid local/extract_lines.sh --nj $nj --cmd $cmd --data_split_file $data_split_file \ --download_dir1 $download_dir1 --download_dir2 $download_dir2 \ --download_dir3 $download_dir3 --writing_condition1 $writing_condition1 \ --writing_condition2 $writing_condition2 --writing_condition3 $writing_condition3 \ - --data data/local/$dataset + --data data/local/$set --subset $subset --augment $augment || exit 1 done -fi -if [ $stage -le 2 ]; then - echo "$0: Preparing data..." - local/prepare_data.sh --download_dir1 $download_dir1 --download_dir2 $download_dir2 \ - --download_dir3 $download_dir3 --images_scp_dir data/local \ - --data_splits_dir $data_splits_dir --writing_condition1 $writing_condition1 \ - --writing_condition2 $writing_condition2 --writing_condition3 $writing_condition3 + echo "$0: Processing data..." 
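The guard at the start of this stage exits rather than silently overwriting an existing data/train/text. To rebuild the data directories anyway, or to resume from a later point, the relevant variables can be passed as options; a hedged example, assuming the usual stage variable near the top of the script (not shown in this hunk) is exposed through parse_options as in other Kaldi recipes:

./run.sh --stage 0 --overwrite true   # deliberately regenerate the data directories
./run.sh --stage 1                    # skip data preparation and start at feature extraction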
+ for set in dev train test; do + local/process_data.py $download_dir1 $download_dir2 $download_dir3 \ + $data_splits_dir/madcat.$set.raw.lineid data/$set $images_scp_dir/$set/images.scp \ + $writing_condition1 $writing_condition2 $writing_condition3 --augment $augment --subset $subset + image/fix_data_dir.sh data/${set} + done fi -mkdir -p data/{train,test,dev}/data -if [ $stage -le 3 ]; then +if [ $stage -le 1 ]; then for dataset in test train; do local/extract_features.sh --nj $nj --cmd $cmd --feat-dim 40 data/$dataset steps/compute_cmvn_stats.sh data/$dataset || exit 1; @@ -69,33 +70,53 @@ if [ $stage -le 3 ]; then utils/fix_data_dir.sh data/train fi -if [ $stage -le 4 ]; then - echo "$0: Preparing dictionary and lang..." +if [ $stage -le 2 ]; then + echo "$0: Preparing BPE..." + cut -d' ' -f2- data/train/text | utils/lang/bpe/reverse.py | \ + utils/lang/bpe/prepend_words.py | \ + utils/lang/bpe/learn_bpe.py -s 700 > data/local/bpe.txt + + for set in test train dev; do + cut -d' ' -f1 data/$set/text > data/$set/ids + cut -d' ' -f2- data/$set/text | utils/lang/bpe/reverse.py | \ + utils/lang/bpe/prepend_words.py | \ + utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ + | sed 's/@@//g' > data/$set/bpe_text + + mv data/$set/text data/$set/text.old + paste -d' ' data/$set/ids data/$set/bpe_text > data/$set/text + rm -f data/$set/bpe_text data/$set/ids + done + + echo "$0:Preparing dictionary and lang..." local/prepare_dict.sh - utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.95 \ - data/local/dict "" data/lang/temp data/lang + utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.0 --position-dependent-phones false \ + data/local/dict "" data/lang/temp data/lang + utils/lang/bpe/add_final_optional_silence.sh --final-sil-prob 0.5 data/lang fi -if [ $stage -le 5 ]; then +if [ $stage -le 3 ]; then echo "$0: Estimating a language model for decoding..." 
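The BPE stage above leaves the text in data/$set/text as space-separated sub-word tokens: reverse.py accounts for the right-to-left Arabic script, prepend_words.py marks word starts, apply_bpe.py splits words using the merges learned in data/local/bpe.txt, and the sed command strips the "@@" continuation markers; wer_output_filter later maps this representation back to words for scoring. A quick sanity check of the encoding on a single line (a sketch that assumes stage 2 has already produced data/local/bpe.txt):

echo "some transcript line" | utils/lang/bpe/reverse.py | \
  utils/lang/bpe/prepend_words.py | \
  utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt | sed 's/@@//g'
# the exact sub-word pieces depend entirely on the learned merges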
local/train_lm.sh - utils/format_lm.sh data/lang data/local/local_lm/data/arpa/3gram_unpruned.arpa.gz \ - data/local/dict/lexicon.txt data/lang_test + utils/format_lm.sh data/lang data/local/local_lm/data/arpa/6gram_small.arpa.gz \ + data/local/dict/lexicon.txt data/lang + utils/build_const_arpa_lm.sh data/local/local_lm/data/arpa/6gram_unpruned.arpa.gz \ + data/lang data/lang_rescore_6g fi -if [ $stage -le 6 ]; then +if [ $stage -le 4 ]; then steps/train_mono.sh --nj $nj --cmd $cmd --totgauss 10000 data/train \ data/lang exp/mono fi -if [ $stage -le 7 ] && $decode_gmm; then - utils/mkgraph.sh --mono data/lang_test exp/mono exp/mono/graph +if [ $stage -le 5 ] && $decode_gmm; then + utils/mkgraph.sh --mono data/lang exp/mono exp/mono/graph steps/decode.sh --nj $nj --cmd $cmd exp/mono/graph data/test \ exp/mono/decode_test fi -if [ $stage -le 8 ]; then +if [ $stage -le 6 ]; then steps/align_si.sh --nj $nj --cmd $cmd data/train data/lang \ exp/mono exp/mono_ali @@ -103,14 +124,14 @@ if [ $stage -le 8 ]; then exp/mono_ali exp/tri fi -if [ $stage -le 9 ] && $decode_gmm; then - utils/mkgraph.sh data/lang_test exp/tri exp/tri/graph +if [ $stage -le 7 ] && $decode_gmm; then + utils/mkgraph.sh data/lang exp/tri exp/tri/graph steps/decode.sh --nj $nj --cmd $cmd exp/tri/graph data/test \ exp/tri/decode_test fi -if [ $stage -le 10 ]; then +if [ $stage -le 8 ]; then steps/align_si.sh --nj $nj --cmd $cmd data/train data/lang \ exp/tri exp/tri_ali @@ -119,22 +140,22 @@ if [ $stage -le 10 ]; then data/train data/lang exp/tri_ali exp/tri3 fi -if [ $stage -le 11 ] && $decode_gmm; then - utils/mkgraph.sh data/lang_test exp/tri3 exp/tri3/graph +if [ $stage -le 9 ] && $decode_gmm; then + utils/mkgraph.sh data/lang exp/tri3 exp/tri3/graph steps/decode.sh --nj $nj --cmd $cmd exp/tri3/graph \ data/test exp/tri3/decode_test fi -if [ $stage -le 12 ]; then +if [ $stage -le 10 ]; then steps/align_fmllr.sh --nj $nj --cmd $cmd --use-graphs true \ data/train data/lang exp/tri3 exp/tri3_ali fi -if [ $stage -le 13 ]; then - local/chain/run_cnn_1a.sh +if [ $stage -le 11 ]; then + local/chain/run_cnn.sh fi -if [ $stage -le 14 ]; then - local/chain/run_cnn_chainali_1a.sh --stage 2 +if [ $stage -le 12 ]; then + local/chain/run_cnn_chainali.sh --stage 2 fi diff --git a/egs/madcat_ar/v1/run_end2end.sh b/egs/madcat_ar/v1/run_end2end.sh index 5d27476d3e1..62f4eeb7c71 100755 --- a/egs/madcat_ar/v1/run_end2end.sh +++ b/egs/madcat_ar/v1/run_end2end.sh @@ -7,9 +7,7 @@ nj=70 # download_dir{1,2,3} points to the database path on the JHU grid. If you have not # already downloaded the database you can set it to a local directory # This corpus can be purchased here: -# https://catalog.ldc.upenn.edu/LDC2012T15, -# https://catalog.ldc.upenn.edu/LDC2013T09/, -# https://catalog.ldc.upenn.edu/LDC2013T15/. +# https://catalog.ldc.upenn.edu/{LDC2012T15,LDC2013T09/,LDC2013T15/} download_dir1=/export/corpora/LDC/LDC2012T15/data download_dir2=/export/corpora/LDC/LDC2013T09/data download_dir3=/export/corpora/LDC/LDC2013T15/data @@ -17,7 +15,11 @@ writing_condition1=/export/corpora/LDC/LDC2012T15/docs/writing_conditions.tab writing_condition2=/export/corpora/LDC/LDC2013T09/docs/writing_conditions.tab writing_condition3=/export/corpora/LDC/LDC2013T15/docs/writing_conditions.tab data_splits_dir=data/download/data_splits - +images_scp_dir=data/local +overwrite=false +subset=false +augment=false +use_extra_corpus_text=true . ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. ## This relates to the queue. . 
./path.sh @@ -27,102 +29,105 @@ data_splits_dir=data/download/data_splits mkdir -p data/{train,test,dev}/data mkdir -p data/local/{train,test,dev} - if [ $stage -le 0 ]; then - echo "$0: Downloading data splits..." - echo "Date: $(date)." - local/download_data.sh --data_splits $data_splits_dir --download_dir1 $download_dir1 \ - --download_dir2 $download_dir2 --download_dir3 $download_dir3 -fi -if [ $stage -le 1 ]; then - for dataset in test train dev; do - data_split_file=$data_splits_dir/madcat.$dataset.raw.lineid + if [ -f data/train/text ] && ! $overwrite; then + echo "$0: Not processing, probably script have run from wrong stage" + echo "Exiting with status 1 to avoid data corruption" + exit 1; + fi + + echo "$0: preparing data...$(date)" + local/prepare_data.sh --data_splits $data_splits_dir --download_dir1 $download_dir1 \ + --download_dir2 $download_dir2 --download_dir3 $download_dir3 \ + --use_extra_corpus_text $use_extra_corpus_text + + for set in test train dev; do + data_split_file=$data_splits_dir/madcat.$set.raw.lineid local/extract_lines.sh --nj $nj --cmd $cmd --data_split_file $data_split_file \ --download_dir1 $download_dir1 --download_dir2 $download_dir2 \ --download_dir3 $download_dir3 --writing_condition1 $writing_condition1 \ --writing_condition2 $writing_condition2 --writing_condition3 $writing_condition3 \ - --data data/local/$dataset + --data data/local/$set --subset $subset --augment $augment || exit 1 + done + + echo "$0: Processing data..." + for set in dev train test; do + local/process_data.py $download_dir1 $download_dir2 $download_dir3 \ + $data_splits_dir/madcat.$set.raw.lineid data/$set $images_scp_dir/$set/images.scp \ + $writing_condition1 $writing_condition2 $writing_condition3 --augment $augment --subset $subset + image/fix_data_dir.sh data/${set} done -fi -if [ $stage -le 2 ]; then - echo "$0: Preparing data..." - local/prepare_data.sh --download_dir1 $download_dir1 --download_dir2 $download_dir2 \ - --download_dir3 $download_dir3 --images_scp_dir data/local \ - --data_splits_dir $data_splits_dir --writing_condition1 $writing_condition1 \ - --writing_condition2 $writing_condition2 --writing_condition3 $writing_condition3 fi -if [ $stage -le 3 ]; then - echo "$0: Obtaining image groups. calling get_image2num_frames" - echo "Date: $(date)." - image/get_image2num_frames.py data/train # This will be needed for the next command - # The next command creates a "allowed_lengths.txt" file in data/train - # which will be used by local/make_features.py to enforce the images to - # have allowed lengths. The allowed lengths will be spaced by 10% difference in length. - echo "$0: Obtaining image groups. calling get_allowed_lengths" - echo "Date: $(date)." +if [ $stage -le 1 ]; then + echo "$0: Obtaining image groups. calling get_image2num_frames $(date)." + image/get_image2num_frames.py data/train image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train -fi -if [ $stage -le 4 ]; then - for dataset in test train; do - echo "$0: Extracting features and calling compute_cmvn_stats for dataset: $dataset. " - echo "Date: $(date)." - local/extract_features.sh --nj $nj --cmd $cmd --feat-dim 40 data/$dataset - steps/compute_cmvn_stats.sh data/$dataset || exit 1; + for set in test dev train; do + echo "$0: Extracting features and calling compute_cmvn_stats for dataset: $set. 
$(date)" + local/extract_features.sh --nj $nj --cmd $cmd --feat-dim 40 data/$set + steps/compute_cmvn_stats.sh data/$set || exit 1; done - echo "$0: Fixing data directory for train dataset" - echo "Date: $(date)." + echo "$0: Fixing data directory for train dataset $(date)." utils/fix_data_dir.sh data/train fi -if [ $stage -le 5 ]; then - echo "$0: Preparing dictionary and lang..." - cut -d' ' -f2- data/train/text | local/reverse.py | \ - local/prepend_words.py | \ - utils/lang/bpe/learn_bpe.py -s 700 > data/train/bpe.out +if [ $stage -le 2 ]; then + echo "$0: Preparing BPE..." + cut -d' ' -f2- data/train/text | utils/lang/bpe/reverse.py | \ + utils/lang/bpe/prepend_words.py | \ + utils/lang/bpe/learn_bpe.py -s 700 > data/local/bpe.txt + for set in test train dev; do cut -d' ' -f1 data/$set/text > data/$set/ids - cut -d' ' -f2- data/$set/text | local/reverse.py | \ - local/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/train/bpe.out \ + cut -d' ' -f2- data/$set/text | utils/lang/bpe/reverse.py | \ + utils/lang/bpe/prepend_words.py | \ + utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ | sed 's/@@//g' > data/$set/bpe_text + mv data/$set/text data/$set/text.old paste -d' ' data/$set/ids data/$set/bpe_text > data/$set/text + rm -f data/$set/bpe_text data/$set/ids done + + echo "$0:Preparing dictionary and lang..." local/prepare_dict.sh - # This recipe uses byte-pair encoding, the silences are part of the words' pronunciations. - # So we set --sil-prob to 0.0 utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.0 --position-dependent-phones false \ data/local/dict "" data/lang/temp data/lang utils/lang/bpe/add_final_optional_silence.sh --final-sil-prob 0.5 data/lang fi -if [ $stage -le 6 ]; then +if [ $stage -le 3 ]; then + echo "$0: Calling the flat-start chain recipe... $(date)." + local/chain/run_e2e_cnn.sh +fi + +lang_decode=data/lang +lang_rescore=data/lang_rescore_6g +decode_e2e=true +if [ $stage -le 4 ]; then echo "$0: Estimating a language model for decoding..." local/train_lm.sh - utils/format_lm.sh data/lang data/local/local_lm/data/arpa/3gram_unpruned.arpa.gz \ - data/local/dict/lexicon.txt data/lang_test + utils/format_lm.sh data/lang data/local/local_lm/data/arpa/6gram_big.arpa.gz \ + data/local/dict/lexicon.txt $lang_decode + utils/build_const_arpa_lm.sh data/local/local_lm/data/arpa/6gram_unpruned.arpa.gz \ + data/lang $lang_rescore fi -if [ $stage -le 7 ]; then - echo "$0: Calling the flat-start chain recipe..." - echo "Date: $(date)." - local/chain/run_flatstart_cnn1a.sh --nj $nj -fi +if [ $stage -le 5 ] && $decode_e2e; then + echo "$0: $(date) stage 5: decoding end2end setup..." + utils/mkgraph.sh --self-loop-scale 1.0 $lang_decode \ + exp/chain/e2e_cnn_1a/ exp/chain/e2e_cnn_1a/graph || exit 1; -if [ $stage -le 8 ]; then - echo "$0: Aligning the training data using the e2e chain model..." - echo "Date: $(date)." 
- steps/nnet3/align.sh --nj $nj --cmd "$cmd" \ - --use-gpu false \ - --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0 --acoustic-scale=1.0' \ - data/train data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train -fi + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 --nj $nj --cmd "$cmd" \ + exp/chain/e2e_cnn_1a/graph data/test exp/chain/e2e_cnn_1a/decode_test || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/test exp/chain/e2e_cnn_1a/decode_test{,_rescored} || exit 1 -if [ $stage -le 9 ]; then - echo "$0: Building a tree and training a regular chain model using the e2e alignments..." - echo "Date: $(date)." - local/chain/run_cnn_e2eali_1b.sh --nj $nj + echo "$0: Done. Date: $(date). Results:" + local/chain/compare_wer.sh exp/chain/e2e_cnn_1a/ fi diff --git a/egs/madcat_zh/README.txt b/egs/madcat_zh/README.txt new file mode 100644 index 00000000000..4ea8df8bb3c --- /dev/null +++ b/egs/madcat_zh/README.txt @@ -0,0 +1,5 @@ +This directory contains example scripts for handwriting recognition on +the MADCAT Chinese HWR dataset (LDC2014T13). +This dataset consists of handwritten Chinese documents, scanned +at high resolution and annotated for each line and token. +More info: https://catalog.ldc.upenn.edu/LDC2014T13 diff --git a/egs/madcat_zh/v1/cmd.sh b/egs/madcat_zh/v1/cmd.sh new file mode 100644 index 00000000000..3c8eb9f93a5 --- /dev/null +++ b/egs/madcat_zh/v1/cmd.sh @@ -0,0 +1,13 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export cmd="queue.pl" diff --git a/egs/madcat_zh/v1/image b/egs/madcat_zh/v1/image new file mode 120000 index 00000000000..1668ee99922 --- /dev/null +++ b/egs/madcat_zh/v1/image @@ -0,0 +1 @@ +../../cifar/v1/image/ \ No newline at end of file diff --git a/egs/madcat_zh/v1/local/chain/compare_wer.sh b/egs/madcat_zh/v1/local/chain/compare_wer.sh new file mode 100755 index 00000000000..4eb665fc702 --- /dev/null +++ b/egs/madcat_zh/v1/local/chain/compare_wer.sh @@ -0,0 +1,59 @@ +#!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. local/chain/compare_wer.sh exp/chain/cnn{1a,1b} + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora + +if [ $# == 0 ]; then + echo "Usage: $0: [ ... ]" + echo "e.g.: $0 exp/chain/cnn{1a,1b}" + exit 1 +fi + +echo "# $0 $*" +used_epochs=false + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +echo -n "# WER " +for x in $*; do + wer=$(cat $x/decode_test/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. 
+fi + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train prob (xent) " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob (xent) " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo diff --git a/egs/madcat_zh/v1/local/chain/run_cnn.sh b/egs/madcat_zh/v1/local/chain/run_cnn.sh new file mode 120000 index 00000000000..df6f0a468c1 --- /dev/null +++ b/egs/madcat_zh/v1/local/chain/run_cnn.sh @@ -0,0 +1 @@ +tuning/run_cnn_1a.sh \ No newline at end of file diff --git a/egs/madcat_zh/v1/local/chain/run_cnn_chainali.sh b/egs/madcat_zh/v1/local/chain/run_cnn_chainali.sh new file mode 120000 index 00000000000..86568421fe1 --- /dev/null +++ b/egs/madcat_zh/v1/local/chain/run_cnn_chainali.sh @@ -0,0 +1 @@ +tuning/run_cnn_chainali_1b.sh \ No newline at end of file diff --git a/egs/madcat_zh/v1/local/chain/run_e2e_cnn.sh b/egs/madcat_zh/v1/local/chain/run_e2e_cnn.sh new file mode 120000 index 00000000000..d26ba0182ce --- /dev/null +++ b/egs/madcat_zh/v1/local/chain/run_e2e_cnn.sh @@ -0,0 +1 @@ +tuning/run_e2e_cnn_1a.sh \ No newline at end of file diff --git a/egs/madcat_zh/v1/local/chain/tuning/run_cnn_1a.sh b/egs/madcat_zh/v1/local/chain/tuning/run_cnn_1a.sh new file mode 100755 index 00000000000..164d62a7ad9 --- /dev/null +++ b/egs/madcat_zh/v1/local/chain/tuning/run_cnn_1a.sh @@ -0,0 +1,223 @@ +#!/bin/bash + +# Copyright 2017 Hossein Hadian +# 2017 Chun Chieh Chang +# 2017 Ashish Arora + +# steps/info/chain_dir_info.pl exp/chain/cnn_1a/ +# exp/chain/cnn_1a/: num-iters=21 nj=2..4 num-params=4.4M dim=40->364 combine=-0.021->-0.015 xent:train/valid[13,20,final]=(-1.05,-0.701,-0.591/-1.30,-1.08,-1.00) logprob:train/valid[13,20,final]=(-0.061,-0.034,-0.030/-0.107,-0.101,-0.098) + +# local/chain/compare_wer.sh exp/chain/cnn_1a/ exp/chain/cnn_chainali_1b/ exp/chain/e2e_cnn_1a/ +# System cnn_1a cnn_chainali_1b e2e_cnn_1a +# WER 13.51 6.76 10.55 +# Final train prob -0.0291 -0.0138 -0.0702 +# Final valid prob -0.0712 -0.0171 -0.0578 +# Final train prob (xent) -0.3847 -0.4169 +# Final valid prob (xent) -0.4962 -0.5040 + +set -e -o pipefail + +stage=0 + +nj=50 +train_set=train +gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +ali=tri3_ali +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +chunk_width=340,300,200,100 +num_leaves=500 +tdnn_dim=450 +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). 
+ # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj $nj --cmd "$cmd" ${train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 4 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves ${train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + common1="height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="height-offsets=-2,-1,0,1,2 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=60 name=input + + conv-relu-batchnorm-layer name=cnn1 height-in=60 height-out=60 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=60 height-out=30 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=30 height-out=30 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=30 height-out=15 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn4 input=Append(-4,0,4) dim=$tdnn_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn4 dim=$tdnn_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=1000" \ + --chain.frame-subsampling-factor=4 \ + --chain.alignment-subsampling-factor=4 \ + --trainer.srand=0 \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=2 \ + --trainer.frames-per-iter=2000000 \ + --trainer.optimization.num-jobs-initial=3 \ + --trainer.optimization.num-jobs-final=12 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=64,32 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=false \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/test $dir/decode_test || exit 1; +fi + +echo "$0: Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir + diff --git a/egs/iam/v1/local/chain/run_cnn_chainali_1a.sh b/egs/madcat_zh/v1/local/chain/tuning/run_cnn_chainali_1a.sh similarity index 87% rename from egs/iam/v1/local/chain/run_cnn_chainali_1a.sh rename to egs/madcat_zh/v1/local/chain/tuning/run_cnn_chainali_1a.sh index ee3a1a3d92c..be51bdcc3d1 100755 --- a/egs/iam/v1/local/chain/run_cnn_chainali_1a.sh +++ b/egs/madcat_zh/v1/local/chain/tuning/run_cnn_chainali_1a.sh @@ -2,10 +2,16 @@ # chainali_1a is as 1a except it uses chain alignments (using 1a system) instead of gmm alignments +# ./local/chain/compare_wer.sh exp/chain/cnn_chainali_1a/ exp/chain/cnn_1a/ + +# steps/info/chain_dir_info.pl exp/chain/cnn_chainali_1a/ +# exp/chain/cnn_chainali_1a/: num-iters=21 nj=2..4 num-params=4.4M dim=40->364 combine=-0.002->0.000 xent:train/valid[13,20,final]=(-0.929,-0.711,-0.645/-1.16,-1.04,-0.992) logprob:train/valid[13,20,final]=(-0.029,-0.016,-0.013/-0.051,-0.047,-0.045) + +# cat exp/chain/cnn_chainali_1a/decode_test/scoring_kaldi/best_* + set -e -o pipefail stage=0 - nj=30 train_set=train gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it @@ -13,35 +19,25 @@ gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. 
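As the header comment notes, this system is trained on lattices produced by an existing chain model rather than by the GMM. Since those lattices are already at the chain model's reduced frame rate, the training stage further below keeps the frame subsampling but no longer subsamples the alignments, in contrast to run_cnn_1a.sh. A summary of just those flags, as they appear in the scripts in this diff:

# GMM-lattice system (run_cnn_1a.sh):
#   --chain.frame-subsampling-factor=4  --chain.alignment-subsampling-factor=4
# chain-lattice systems (run_cnn_chainali_1a.sh / run_cnn_chainali_1b.sh):
#   --chain.frame-subsampling-factor=4  --chain.alignment-subsampling-factor=1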
affix=_1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. ali=tri3_ali -chain_model_dir=exp/chain${nnet3_affix}/cnn_1a +chain_model_dir=exp/chain${nnet3_affix}/cnn${affix} common_egs_dir= reporting_email= # chain options train_stage=-10 xent_regularize=0.1 -frame_subsampling_factor=4 -alignment_subsampling_factor=1 # training chunk-options chunk_width=340,300,200,100 num_leaves=500 # we don't need extra left/right context for TDNN systems. -chunk_left_context=0 -chunk_right_context=0 tdnn_dim=450 -# training options -srand=0 -remove_egs=false -lang_test=lang_test # End configuration section. echo "$0 $@" # Print the command line for logging - . ./cmd.sh . ./path.sh . ./utils/parse_options.sh - if ! cuda-compiled; then cat < $dir/configs/network.xconfig input dim=40 name=input - conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 @@ -175,27 +169,23 @@ if [ $stage -le 5 ]; then --chain.l2-regularize=0.00005 \ --chain.apply-deriv-weights=false \ --chain.lm-opts="--num-extra-lm-states=500" \ - --chain.frame-subsampling-factor=$frame_subsampling_factor \ - --chain.alignment-subsampling-factor=$alignment_subsampling_factor \ - --trainer.srand=$srand \ + --chain.frame-subsampling-factor=4 \ + --chain.alignment-subsampling-factor=1 \ + --trainer.srand=0 \ --trainer.max-param-change=2.0 \ - --trainer.num-epochs=4 \ - --trainer.frames-per-iter=1000000 \ - --trainer.optimization.num-jobs-initial=2 \ - --trainer.optimization.num-jobs-final=4 \ + --trainer.num-epochs=2 \ + --trainer.frames-per-iter=2000000 \ + --trainer.optimization.num-jobs-initial=8 \ + --trainer.optimization.num-jobs-final=16 \ --trainer.optimization.initial-effective-lrate=0.001 \ --trainer.optimization.final-effective-lrate=0.0001 \ --trainer.optimization.shrink-value=1.0 \ --trainer.num-chunk-per-minibatch=64,32 \ --trainer.optimization.momentum=0.0 \ --egs.chunk-width=$chunk_width \ - --egs.chunk-left-context=$chunk_left_context \ - --egs.chunk-right-context=$chunk_right_context \ - --egs.chunk-left-context-initial=0 \ - --egs.chunk-right-context-final=0 \ --egs.dir="$common_egs_dir" \ --egs.opts="--frames-overlap-per-eg 0" \ - --cleanup.remove-egs=$remove_egs \ + --cleanup.remove-egs=false \ --use-gpu=true \ --reporting.email="$reporting_email" \ --feat-dir=$train_data_dir \ @@ -211,19 +201,14 @@ if [ $stage -le 6 ]; then # topology file from the model). So you could give it a different # lang directory, one that contained a wordlist and LM of your choice, # as long as phones.txt was compatible. 
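To make the point in the comment above concrete, the same acoustic model could be paired with a graph built from some other lang directory, provided its phones.txt matches. A purely hypothetical example (data/lang_other and graph_other are not part of this recipe):

utils/mkgraph.sh --self-loop-scale 1.0 data/lang_other $dir $dir/graph_other || exit 1;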
- utils/mkgraph.sh \ - --self-loop-scale 1.0 data/$lang_test \ + --self-loop-scale 1.0 data/lang_test \ $dir $dir/graph || exit 1; fi if [ $stage -le 7 ]; then frames_per_chunk=$(echo $chunk_width | cut -d, -f1) steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context $chunk_left_context \ - --extra-right-context $chunk_right_context \ - --extra-left-context-initial 0 \ - --extra-right-context-final 0 \ --frames-per-chunk $frames_per_chunk \ --nj $nj --cmd "$cmd" \ $dir/graph data/test $dir/decode_test || exit 1; diff --git a/egs/madcat_zh/v1/local/chain/tuning/run_cnn_chainali_1b.sh b/egs/madcat_zh/v1/local/chain/tuning/run_cnn_chainali_1b.sh new file mode 100755 index 00000000000..aa61620a92f --- /dev/null +++ b/egs/madcat_zh/v1/local/chain/tuning/run_cnn_chainali_1b.sh @@ -0,0 +1,226 @@ +#!/bin/bash + +# chainali_1b is as chainali_1a except it has 3 more cnn layers and 1 less tdnn layer. +# ./local/chain/compare_wer.sh exp/chain/cnn_chainali_1a/ exp/chain/cnn_chainali_1b/ + +# steps/info/chain_dir_info.pl exp/chain/chainali_cnn_1b/ +# exp/chain/chainali_cnn_1b/: num-iters=21 nj=2..4 num-params=4.0M dim=40->364 combine=-0.009->-0.005 xent:train/valid[13,20,final]=(-1.47,-0.728,-0.623/-1.69,-1.02,-0.940) logprob:train/valid[13,20,final]=(-0.068,-0.030,-0.011/-0.086,-0.056,-0.038) + +# local/chain/compare_wer.sh exp/chain/cnn_1a/ exp/chain/cnn_chainali_1b/ exp/chain/e2e_cnn_1a/ +# System cnn_1a cnn_chainali_1b e2e_cnn_1a +# WER 13.51 6.76 10.55 +# Final train prob -0.0291 -0.0138 -0.0702 +# Final valid prob -0.0712 -0.0171 -0.0578 +# Final train prob (xent) -0.3847 -0.4169 +# Final valid prob (xent) -0.4962 -0.5040 + +set -e -o pipefail + +stage=0 +nj=30 +train_set=train +gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1b #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +ali=tri3_ali +chain_model_dir=exp/chain${nnet3_affix}/cnn${affix} +common_egs_dir= +reporting_email= +# chain options +train_stage=-10 +xent_regularize=0.1 +# training chunk-options +chunk_width=340,300,200,100 +num_leaves=500 +tdnn_dim=450 +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --acoustic-scale 1.0 \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + ${train_data_dir} data/lang $chain_model_dir $lat_dir + cp $gmm_lat_dir/splice_opts $lat_dir/splice_opts +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 4 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves ${train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + common1="required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=60 name=input + + conv-relu-batchnorm-layer name=cnn1 height-in=60 height-out=60 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=60 height-out=30 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=30 height-out=30 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=30 height-out=30 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=30 height-out=15 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 height-in=15 height-out=15 time-offsets=-1,0,1 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=15 height-out=15 time-offsets=-1,0,1 $common3 + relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=1000" \ + --chain.frame-subsampling-factor=4 \ + --chain.alignment-subsampling-factor=1 \ + --trainer.srand=0 \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=2 \ + --trainer.frames-per-iter=2000000 \ + --trainer.optimization.num-jobs-initial=6 \ + --trainer.optimization.num-jobs-final=16 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=64,32 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=false \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/test $dir/decode_test || exit 1; +fi + +echo "$0: Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/madcat_zh/v1/local/chain/tuning/run_e2e_cnn_1a.sh b/egs/madcat_zh/v1/local/chain/tuning/run_e2e_cnn_1a.sh new file mode 100755 index 00000000000..ffc9a4c8a14 --- /dev/null +++ b/egs/madcat_zh/v1/local/chain/tuning/run_e2e_cnn_1a.sh @@ -0,0 +1,130 @@ +#!/bin/bash +# Copyright 2017 Hossein Hadian + +# local/chain/compare_wer.sh exp/chain/e2e_cnn_1a +# System e2e_cnn_1a +# WER 10.41 +# Final train prob -0.0536 +# Final valid prob -0.0489 +# Final train prob (xent) +# Final valid prob (xent) + +# steps/info/chain_dir_info.pl exp/chain/e2e_cnn_1a/ +# exp/chain/e2e_cnn_1a/: num-iters=63 nj=6..12 num-params=6.1M dim=80->5760 combine=-0.048->-0.048 (over 5) logprob:train/valid[41,62,final]=(-0.062,-0.065,-0.054/-0.058,-0.062,-0.049) + +set -e +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +affix=1a + +# training options +tdnn_dim=450 +minibatch_size=150=48,24/300=24,12/600=12,6/1200=4,4 +common_egs_dir= +train_set=train + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 1 ]; then + steps/nnet3/chain/e2e/prepare_e2e.sh --nj 70 --cmd "$cmd" \ + --shared-phones true \ + --type mono \ + data/$train_set $lang $treedir + $cmd $treedir/log/make_phone_lm.log \ + cat data/$train_set/text \| \ + steps/nnet3/chain/e2e/text_to_phones.py data/lang \| \ + utils/sym2int.pl -f 2- data/lang/phones.txt \| \ + chain-est-phone-lm --num-extra-lm-states=500 \ + ark:- $treedir/phone_lm.fst +fi + +if [ $stage -le 2 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') + common1="required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=80 name=input + conv-relu-batchnorm-layer name=cnn1 height-in=80 height-out=80 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=80 height-out=40 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=40 height-out=40 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=40 height-out=40 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=40 height-out=20 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 height-in=20 height-out=20 time-offsets=-1,0,1 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=20 height-out=20 time-offsets=-1,0,1 $common3 + conv-relu-batchnorm-layer name=cnn8 height-in=20 height-out=10 time-offsets=-1,0,1 $common3 height-subsample-out=2 + relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs +fi + +if [ $stage -le 3 ]; then + # no need to store the egs in a shared storage because we always + # remove them. Anyway, it takes only 5 minutes to generate them. 
+ + steps/nnet3/chain/e2e/train_e2e.py --stage $train_stage \ + --cmd "$cmd" \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--num_egs_diagnostic 100 --num_utts_subset 400" \ + --chain.frame-subsampling-factor 4 \ + --chain.alignment-subsampling-factor 4 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 2000000 \ + --trainer.num-epochs 2 \ + --trainer.optimization.momentum 0 \ + --trainer.optimization.num-jobs-initial 6 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.shrink-value 1.0 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir data/${train_set} \ + --tree-dir $treedir \ + --dir $dir || exit 1; +fi diff --git a/egs/madcat_zh/v1/local/check_tools.sh b/egs/madcat_zh/v1/local/check_tools.sh new file mode 100755 index 00000000000..00de9778808 --- /dev/null +++ b/egs/madcat_zh/v1/local/check_tools.sh @@ -0,0 +1,49 @@ +#!/bin/bash -u + +# Copyright 2015 (c) Johns Hopkins University (Jan Trmal ) + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + +[ -f ./path.sh ] && . ./path.sh +set +e + +command -v python3 >&/dev/null \ + || { echo >&2 "python3 not found on PATH. You will have to install Python3, preferably >= 3.6"; exit 1; } + +python3 -c "import numpy" +if [ $? -ne 0 ] ; then + echo >&2 "This recipe needs numpy installed." + exit 1 +fi + +python3 -c "import scipy" +if [ $? -ne 0 ] ; then + echo >&2 "This recipe needs scipy installed." + exit 1 +fi + +python3 -c "from scipy.spatial import ConvexHull" +if [ $? -ne 0 ] ; then + echo >&2 "This recipe needs scipy installed." + exit 1 +fi + +python3 -c "import scipy.misc; scipy.misc.__dict__['imread'];" +if [ $? -ne 0 ] ; then + echo >&2 "This recipe needs scipy-image, scikit-image and Pillow installed." + exit 1 +fi + + +exit 0 diff --git a/egs/madcat_zh/v1/local/create_line_image_from_page_image.py b/egs/madcat_zh/v1/local/create_line_image_from_page_image.py new file mode 100755 index 00000000000..22af571fc04 --- /dev/null +++ b/egs/madcat_zh/v1/local/create_line_image_from_page_image.py @@ -0,0 +1,536 @@ +#!/usr/bin/env python3 + +# Copyright 2018 Ashish Arora +# Apache 2.0 +# minimum bounding box part in this script is originally from +#https://github.com/BebeSparkelSparkel/MinimumBoundingBox +#https://startupnextdoor.com/computing-convex-hull-in-python/ + +""" This module will be used for extracting line images from page image. + Given the word segmentation (bounding box around a word) for every word, it will + extract line segmentation. 
To extract line segmentation, it will take word bounding + boxes of a line as input, will create a minimum area bounding box that will contain + all corner points of word bounding boxes. The obtained bounding box (will not necessarily + be vertically or horizontally aligned). Hence to extract line image from line bounding box, + page image is rotated and line image is cropped and saved. +""" + +import sys +import argparse +import os +import xml.dom.minidom as minidom +import numpy as np +from math import atan2, cos, sin, pi, degrees, sqrt +from collections import namedtuple + +from scipy.spatial import ConvexHull +from PIL import Image +from scipy.misc import toimage + +parser = argparse.ArgumentParser(description="Creates line images from page image", + epilog="E.g. " + sys.argv[0] + " data/LDC2012T15" + " data/madcat.train.raw.lineid " + " data/local/lines ", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) +parser.add_argument('database_path1', type=str, + help='Path to the downloaded madcat data directory 1') +parser.add_argument('data_splits', type=str, + help='Path to file that contains the train/test/dev split information') +parser.add_argument('out_dir', type=str, + help='directory location to write output files') +parser.add_argument('--padding', type=int, default=400, + help='padding across horizontal/verticle direction') +args = parser.parse_args() + +""" +bounding_box is a named tuple which contains: + + area (float): area of the rectangle + length_parallel (float): length of the side that is parallel to unit_vector + length_orthogonal (float): length of the side that is orthogonal to unit_vector + rectangle_center(int, int): coordinates of the rectangle center + (use rectangle_corners to get the corner points of the rectangle) + unit_vector (float, float): direction of the length_parallel side. + (it's orthogonal vector can be found with the orthogonal_vector function + unit_vector_angle (float): angle of the unit vector to be in radians. + corner_points [(float, float)]: set that contains the corners of the rectangle +""" + +bounding_box_tuple = namedtuple('bounding_box_tuple', 'area ' + 'length_parallel ' + 'length_orthogonal ' + 'rectangle_center ' + 'unit_vector ' + 'unit_vector_angle ' + 'corner_points' + ) + +def unit_vector(pt0, pt1): + """ Returns an unit vector that points in the direction of pt0 to pt1. + Args: + pt0 (float, float): Point 0. Eg. (1.0, 2.0). + pt1 (float, float): Point 1. Eg. (3.0, 8.0). + + Returns: + (float, float): unit vector that points in the direction of pt0 to pt1. + Eg. 0.31622776601683794, 0.9486832980505138 + """ + dis_0_to_1 = sqrt((pt0[0] - pt1[0])**2 + (pt0[1] - pt1[1])**2) + return (pt1[0] - pt0[0])/ dis_0_to_1, \ + (pt1[1] - pt0[1])/ dis_0_to_1 + + +def orthogonal_vector(vector): + """ From vector returns a orthogonal/perpendicular vector of equal length. + Args: + vector (float, float): A vector. Eg. (0.31622776601683794, 0.9486832980505138). + + Returns: + (float, float): A vector that points in the direction orthogonal to vector. + Eg. - 0.9486832980505138,0.31622776601683794 + """ + return -1 * vector[1], vector[0] + + +def bounding_area(index, hull): + """ Returns a named tuple that mainly contains area of the box that bounds + the hull. This bounding box orintation is same as the orientation of the + lines formed by the point hull[index] and hull[index+1]. + Args: + index (int): Eg. 1. + hull [(float, float)]: list or tuple of point cloud + Eg. ((1.0, -1.0), (2.0, -3.0), (3.0, 4.0), (5.0, 6.0)). 
+ + Returns: a named tuple that contains: + area: area of the rectangle + length_parallel: length of the side that is parallel to unit_vector + length_orthogonal: length of the side that is orthogonal to unit_vector + rectangle_center: coordinates of the rectangle center + (use rectangle_corners to get the corner points of the rectangle) + unit_vector: direction of the length_parallel side. + (it's orthogonal vector can be found with the orthogonal_vector function + """ + unit_vector_p = unit_vector(hull[index], hull[index+1]) + unit_vector_o = orthogonal_vector(unit_vector_p) + + dis_p = tuple(np.dot(unit_vector_p, pt) for pt in hull) + dis_o = tuple(np.dot(unit_vector_o, pt) for pt in hull) + + min_p = min(dis_p) + min_o = min(dis_o) + len_p = max(dis_p) - min_p + len_o = max(dis_o) - min_o + + return {'area': len_p * len_o, + 'length_parallel': len_p, + 'length_orthogonal': len_o, + 'rectangle_center': (min_p + float(len_p)/ 2, min_o + float(len_o)/ 2), + 'unit_vector': unit_vector_p, + } + + +def to_xy_coordinates(unit_vector_angle, point): + """ Returns converted unit vector coordinates in x, y coordinates. + Args: + unit_vector_angle (float): angle of unit vector to be in radians. + Eg. 0.1543 . + point (float, float): Point from origin. Eg. (1.0, 2.0). + + Returns: + (float, float): converted x,y coordinate of the unit vector. + Eg. 0.680742447866183, 2.1299271629971663 + """ + angle_orthogonal = unit_vector_angle + pi/ 2 + return point[0] * cos(unit_vector_angle) + point[1] * cos(angle_orthogonal), \ + point[0] * sin(unit_vector_angle) + point[1] * sin(angle_orthogonal) + + +def rotate_points(center_of_rotation, angle, points): + """ Rotates a point cloud around the center_of_rotation point by angle + Args: + center_of_rotation (float, float): angle of unit vector to be in radians. + Eg. (1.56, -23.4). + angle (float): angle of rotation to be in radians. Eg. 0.1543 . + points [(float, float)]: Points to be a list or tuple of points. Points to be rotated. + Eg. ((1.56, -23.4), (1.56, -23.4)) + + Returns: + [(float, float)]: Rotated points around center of rotation by angle + Eg. ((1.16, -12.4), (2.34, -34.4)) + """ + + rot_points = [] + ang = [] + for pt in points: + diff = tuple([pt[d] - center_of_rotation[d] for d in range(2)]) + diff_angle = atan2(diff[1], diff[0]) + angle + ang.append(diff_angle) + diff_length = sqrt(sum([d**2 for d in diff])) + rot_points.append((center_of_rotation[0] + diff_length * cos(diff_angle), + center_of_rotation[1] + diff_length * sin(diff_angle))) + + return rot_points + + +def rectangle_corners(rectangle): + """ Given rectangle center and its inclination. It returns the corner + locations of the rectangle. + Args: + rectangle (bounding_box): the output of minimum bounding box rectangle + + Returns: + [(float, float)]: 4 corner points of rectangle. + Eg. ((1.0, -1.0), (2.0, -3.0), (3.0, 4.0), (5.0, 6.0)) + """ + corner_points = [] + for i1 in (.5, -.5): + for i2 in (i1, -1 * i1): + corner_points.append((rectangle['rectangle_center'][0] + i1 * rectangle['length_parallel'], + rectangle['rectangle_center'][1] + i2 * rectangle['length_orthogonal'])) + + return rotate_points(rectangle['rectangle_center'], rectangle['unit_vector_angle'], corner_points) + + +# use this function to find the listed properties of the minimum bounding box of a point cloud +def minimum_bounding_box(points): + """ Given a point cloud, it returns the minimum area rectangle bounding all + the points in the point cloud. 
+ Args: + points [(float, float)]: points to be a list or tuple of 2D points + needs to be more than 2 points + + Returns: returns a namedtuple that contains: + area: area of the rectangle + length_parallel: length of the side that is parallel to unit_vector + length_orthogonal: length of the side that is orthogonal to unit_vector + rectangle_center: coordinates of the rectangle center + (use rectangle_corners to get the corner points of the rectangle) + unit_vector: direction of the length_parallel side. RADIANS + (it's orthogonal vector can be found with the orthogonal_vector function + unit_vector_angle: angle of the unit vector + corner_points: set that contains the corners of the rectangle + """ + + if len(points) <= 2: raise ValueError('More than two points required.') + + hull_ordered = [points[index] for index in ConvexHull(points).vertices] + hull_ordered.append(hull_ordered[0]) + hull_ordered = tuple(hull_ordered) + + min_rectangle = bounding_area(0, hull_ordered) + for i in range(1, len(hull_ordered)-1): + rectangle = bounding_area(i, hull_ordered) + if rectangle['area'] < min_rectangle['area']: + min_rectangle = rectangle + + min_rectangle['unit_vector_angle'] = atan2(min_rectangle['unit_vector'][1], min_rectangle['unit_vector'][0]) + min_rectangle['rectangle_center'] = to_xy_coordinates(min_rectangle['unit_vector_angle'], min_rectangle['rectangle_center']) + + return bounding_box_tuple( + area = min_rectangle['area'], + length_parallel = min_rectangle['length_parallel'], + length_orthogonal = min_rectangle['length_orthogonal'], + rectangle_center = min_rectangle['rectangle_center'], + unit_vector = min_rectangle['unit_vector'], + unit_vector_angle = min_rectangle['unit_vector_angle'], + corner_points = set(rectangle_corners(min_rectangle)) + ) + + +def get_center(im): + """ Returns the center pixel location of an image + Args: + im: image + + Returns: + (int, int): center of the image + Eg. 2550, 3300 + """ + center_x = float(im.size[0])/ 2 + center_y = float(im.size[1])/ 2 + return int(center_x), int(center_y) + + +def get_horizontal_angle(unit_vector_angle): + """ Returns angle of the unit vector in first or fourth quadrant. + Args: + angle (float): angle of the unit vector to be in radians. Eg. 0.01543. + + Returns: + (float): updated angle of the unit vector to be in radians. + It is only in first or fourth quadrant. + Eg. 0.01543. + """ + + if unit_vector_angle > pi/ 2 and unit_vector_angle <= pi: + unit_vector_angle = unit_vector_angle - pi + elif unit_vector_angle > -pi and unit_vector_angle < -pi/ 2: + unit_vector_angle = unit_vector_angle + pi + + return unit_vector_angle + + +def get_smaller_angle(bounding_box): + """ Returns smallest absolute angle of a rectangle. + Args: + rectangle (bounding_box): bounding box rectangle + + Returns: + (float): smallest angle of the rectangle to be in radians. + Eg. 0.01543. + """ + + unit_vector = bounding_box.unit_vector + unit_vector_angle = bounding_box.unit_vector_angle + ortho_vector = orthogonal_vector(unit_vector) + ortho_vector_angle = atan2(ortho_vector[1], ortho_vector[0]) + + unit_vector_angle_updated = get_horizontal_angle(unit_vector_angle) + ortho_vector_angle_updated = get_horizontal_angle(ortho_vector_angle) + + if abs(unit_vector_angle_updated) < abs(ortho_vector_angle_updated): + return unit_vector_angle_updated + else: + return ortho_vector_angle_updated + + +def rotated_points(bounding_box, center): + """ Rotates the corners of a bounding box rectangle around the center by smallest angle + of the rectangle. 
It first finds the smallest angle of the rectangle + then rotates it around the given center point. + Args: + rectangle (bounding_box): bounding box rectangle + center (int, int): center point around which the corners of rectangle are rotated. + Eg. (2550, 3300). + + Returns: 4 corner points of rectangle. + Eg. ((1.0, -1.0), (2.0, -3.0), (3.0, 4.0), (5.0, 6.0)) + """ + + p1, p2, p3, p4 = bounding_box.corner_points + x1, y1 = p1 + x2, y2 = p2 + x3, y3 = p3 + x4, y4 = p4 + center_x, center_y = center + rotation_angle_in_rad = -get_smaller_angle(bounding_box) + x_dash_1 = (x1 - center_x) * cos(rotation_angle_in_rad) - (y1 - center_y) * sin(rotation_angle_in_rad) + center_x + x_dash_2 = (x2 - center_x) * cos(rotation_angle_in_rad) - (y2 - center_y) * sin(rotation_angle_in_rad) + center_x + x_dash_3 = (x3 - center_x) * cos(rotation_angle_in_rad) - (y3 - center_y) * sin(rotation_angle_in_rad) + center_x + x_dash_4 = (x4 - center_x) * cos(rotation_angle_in_rad) - (y4 - center_y) * sin(rotation_angle_in_rad) + center_x + + y_dash_1 = (y1 - center_y) * cos(rotation_angle_in_rad) + (x1 - center_x) * sin(rotation_angle_in_rad) + center_y + y_dash_2 = (y2 - center_y) * cos(rotation_angle_in_rad) + (x2 - center_x) * sin(rotation_angle_in_rad) + center_y + y_dash_3 = (y3 - center_y) * cos(rotation_angle_in_rad) + (x3 - center_x) * sin(rotation_angle_in_rad) + center_y + y_dash_4 = (y4 - center_y) * cos(rotation_angle_in_rad) + (x4 - center_x) * sin(rotation_angle_in_rad) + center_y + return x_dash_1, y_dash_1, x_dash_2, y_dash_2, x_dash_3, y_dash_3, x_dash_4, y_dash_4 + + +def pad_image(image): + """ Pads the image around the border. It help in getting + bounding boxes that are slightly outside the page boundary. + Args: + image: page image. + + Returns: + image: page image + """ + + padded_image = Image.new('RGB', (image.size[0] + padding, image.size[1] + padding), "white") + padded_image.paste(im=image, box=(offset, offset)) + return padded_image + + +def update_minimum_bounding_box_input(bounding_box_input): + """ Updates the word bounding box corner points. + Args: + points [(float, float)]: points, a list or tuple of 2D coordinates. + ideally should be more than 2 points + Returns: + points [(float, float)]: points, a list or tuple of 2D coordinates + """ + + updated_minimum_bounding_box_input = [] + for point in bounding_box_input: + x, y = point + new_x = x + offset + new_y = y + offset + word_coordinate = (new_x, new_y) + updated_minimum_bounding_box_input.append(word_coordinate) + + return updated_minimum_bounding_box_input + + +def set_line_image_data(image, line_id, image_file_name): + """ Flips a given line image and saves it. Line image file name + is formed by appending the line id at the end page image name. + Args: + image: line image, non flipped + line_id (string): id of the line image. + image_file_name(string): name of the page image. + + Returns: + """ + + base_name = os.path.splitext(os.path.basename(image_file_name))[0] + image_file_name_wo_tif, b = image_file_name.split('.tif') + line_id = '_' + line_id.zfill(4) + line_image_file_name = base_name + line_id + '.png' + image_path = os.path.join(output_directory, line_image_file_name) + imgray = toimage(image.convert('L')) + imgray.save(image_path) + image_fh.write(image_path + '\n') + +def get_line_images_from_page_image(image_file_name, madcat_file_path): + """ Extracts the line image from page image. + Args: + image_file_name (string): complete path and name of the page image. 
+ madcat_file_path (string): complete path and name of the madcat xml file + corresponding to the page image. + + Returns: + """ + im_wo_pad = Image.open(image_file_name) + im = pad_image(im_wo_pad) + doc = minidom.parse(madcat_file_path) + zone = doc.getElementsByTagName('zone') + for node in zone: + id = node.getAttribute('id') + token_image = node.getElementsByTagName('token-image') + minimum_bounding_box_input = [] + for token_node in token_image: + word_point = token_node.getElementsByTagName('point') + for word_node in word_point: + word_coordinate = (int(word_node.getAttribute('x')), int(word_node.getAttribute('y'))) + minimum_bounding_box_input.append(word_coordinate) + updated_mbb_input = update_minimum_bounding_box_input(minimum_bounding_box_input) + bounding_box = minimum_bounding_box(updated_mbb_input) + + p1, p2, p3, p4 = bounding_box.corner_points + x1, y1 = p1 + x2, y2 = p2 + x3, y3 = p3 + x4, y4 = p4 + min_x = int(min(x1, x2, x3, x4)) + min_y = int(min(y1, y2, y3, y4)) + max_x = int(max(x1, x2, x3, x4)) + max_y = int(max(y1, y2, y3, y4)) + box = (min_x, min_y, max_x, max_y) + region_initial = im.crop(box) + rot_points = [] + p1_new = (x1 - min_x, y1 - min_y) + p2_new = (x2 - min_x, y2 - min_y) + p3_new = (x3 - min_x, y3 - min_y) + p4_new = (x4 - min_x, y4 - min_y) + rot_points.append(p1_new) + rot_points.append(p2_new) + rot_points.append(p3_new) + rot_points.append(p4_new) + + cropped_bounding_box = bounding_box_tuple(bounding_box.area, + bounding_box.length_parallel, + bounding_box.length_orthogonal, + bounding_box.length_orthogonal, + bounding_box.unit_vector, + bounding_box.unit_vector_angle, + set(rot_points) + ) + + rotation_angle_in_rad = get_smaller_angle(cropped_bounding_box) + img2 = region_initial.rotate(degrees(rotation_angle_in_rad), resample=Image.BICUBIC) + x_dash_1, y_dash_1, x_dash_2, y_dash_2, x_dash_3, y_dash_3, x_dash_4, y_dash_4 = rotated_points( + cropped_bounding_box, get_center(region_initial)) + + + min_x = int(min(x_dash_1, x_dash_2, x_dash_3, x_dash_4)) + min_y = int(min(y_dash_1, y_dash_2, y_dash_3, y_dash_4)) + max_x = int(max(x_dash_1, x_dash_2, x_dash_3, x_dash_4)) + max_y = int(max(y_dash_1, y_dash_2, y_dash_3, y_dash_4)) + box = (min_x, min_y, max_x, max_y) + region_final = img2.crop(box) + set_line_image_data(region_final, id, image_file_name) + + +def check_file_location(): + """ Returns the complete path of the page image and corresponding + xml file. + Args: + + Returns: + image_file_name (string): complete path and name of the page image. + madcat_file_path (string): complete path and name of the madcat xml file + corresponding to the page image. + """ + + madcat_file_path1 = os.path.join(data_path1, 'madcat', base_name + '.madcat.xml') + + image_file_path1 = os.path.join(data_path1, 'images', base_name + '.tif') + + if os.path.exists(madcat_file_path1): + return madcat_file_path1, image_file_path1, wc_dict1 + + print("ERROR: path does not exist") + return None, None, None + +def parse_writing_conditions(writing_conditions): + """ Returns a dictionary which have writing condition of each page image. + Args: + writing_conditions(string): complete path of writing condition file. + + Returns: + (dict): dictionary with key as page image name and value as writing condition. 
+ """ + + with open(writing_conditions) as f: + file_writing_cond = dict() + for line in f: + line_list = line.strip().split("\t") + file_writing_cond[line_list[0]] = line_list[3] + return file_writing_cond + +def check_writing_condition(wc_dict): + """ Checks if a given page image is writing in a given writing condition. + It is used to create subset of dataset based on writing condition. + Args: + wc_dict (dict): dictionary with key as page image name and value as writing condition. + + Returns: + (bool): True if writing condition matches. + """ + + return True + writing_condition = wc_dict[base_name].strip() + if writing_condition != 'IUC': + return False + + return True + + +### main ### + +data_path1 = os.path.join(args.database_path1, 'data') + +splits_handle = open(args.data_splits, 'r') +splits_data = splits_handle.read().strip().split('\n') + +padding = int(args.padding) +offset = int(padding // 2) + +output_directory = args.out_dir +image_file = os.path.join(output_directory, 'images.scp') +image_fh = open(image_file, 'w', encoding='utf-8') + +writing_conditions1 = os.path.join(args.database_path1, 'docs', 'writing_conditions.tab') + +wc_dict1 = parse_writing_conditions(writing_conditions1) + +prev_base_name = '' +for line in splits_data: + base_name = os.path.splitext(os.path.splitext(line.split(' ')[0])[0])[0] + if prev_base_name != base_name: + prev_base_name = base_name + madcat_file_path, image_file_path, wc_dict = check_file_location() + if wc_dict == None or not check_writing_condition(wc_dict): + continue + if madcat_file_path != None: + get_line_images_from_page_image(image_file_path, madcat_file_path) diff --git a/egs/madcat_zh/v1/local/extract_features.sh b/egs/madcat_zh/v1/local/extract_features.sh new file mode 100755 index 00000000000..9fe588f31b8 --- /dev/null +++ b/egs/madcat_zh/v1/local/extract_features.sh @@ -0,0 +1,49 @@ +#!/bin/bash + +# Copyright 2017 Yiwen Shao +# 2018 Ashish Arora + +# Apache 2.0 +# This script runs the make features script in parallel. + +nj=4 +cmd=run.pl +feat_dim=40 +augment='no_aug' +verticle_shift=0 +echo "$0 $@" + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh || exit 1; + +data=$1 +featdir=$data/data +scp=$data/images.scp +logdir=$data/log + +mkdir -p $logdir +mkdir -p $featdir + +# make $featdir an absolute pathname +featdir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $featdir ${PWD}` + +for n in $(seq $nj); do + split_scps="$split_scps $logdir/images.$n.scp" +done + +# split images.scp +utils/split_scp.pl $scp $split_scps || exit 1; + +$cmd JOB=1:$nj $logdir/extract_features.JOB.log \ + image/ocr/make_features.py $logdir/images.JOB.scp \ + --allowed_len_file_path $data/allowed_lengths.txt \ + --feat-dim $feat_dim --augment_type $augment \ + --vertical-shift $verticle_shift \| \ + copy-feats --compress=true --compression-method=7 \ + ark:- ark,scp:$featdir/images.JOB.ark,$featdir/images.JOB.scp + +## aggregates the output scp's to get feats.scp +for n in $(seq $nj); do + cat $featdir/images.$n.scp || exit 1; +done > $data/feats.scp || exit 1 diff --git a/egs/madcat_zh/v1/local/extract_lines.sh b/egs/madcat_zh/v1/local/extract_lines.sh new file mode 100755 index 00000000000..ed752e97e13 --- /dev/null +++ b/egs/madcat_zh/v1/local/extract_lines.sh @@ -0,0 +1,36 @@ +#!/bin/bash +# Copyright 2018 Ashish Arora + +nj=4 +cmd=run.pl +download_dir=/export/corpora/LDC/LDC2014T13 +dataset_file=data/download/datasplits/madcat.dev.raw.lineid +echo "$0 $@" + +. ./cmd.sh +. ./path.sh +. 
./utils/parse_options.sh || exit 1; + +data=$1 +log_dir=$data/log +mkdir -p $log_dir +mkdir -p $data + +for n in $(seq $nj); do + split_scps="$split_scps $log_dir/lines.$n.scp" +done + +utils/split_scp.pl $dataset_file $split_scps || exit 1; + +for n in $(seq $nj); do + mkdir -p $data/$n +done + +$cmd JOB=1:$nj $log_dir/extract_lines.JOB.log \ + local/create_line_image_from_page_image.py $download_dir $log_dir/lines.JOB.scp $data/JOB \ + || exit 1; + +## concatenate the .scp files together. +for n in $(seq $nj); do + cat $data/$n/images.scp || exit 1; +done > $data/images.scp || exit 1 diff --git a/egs/madcat_zh/v1/local/prepare_data.sh b/egs/madcat_zh/v1/local/prepare_data.sh new file mode 100755 index 00000000000..ba35b90b173 --- /dev/null +++ b/egs/madcat_zh/v1/local/prepare_data.sh @@ -0,0 +1,48 @@ +#!/bin/bash + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora +# 2017 Hossein Hadian +# Apache 2.0 + +# This script downloads the Madcat Chinese handwriting database and prepares the training +# and test data (i.e text, images.scp, utt2spk and spk2utt) by calling process_data.py. +# It also downloads the LOB and Brown text corpora. It downloads the database files +# only if they do not already exist in download directory. + +# Eg. local/prepare_data.sh +# Eg. text file: 000_a01-000u-00 A MOVE to stop Mr. Gaitskell from +# utt2spk file: 000_a01-000u-00 000 +# images.scp file: 000_a01-000u-00 data/local/lines/a01/a01-000u/a01-000u-00.png +# spk2utt file: 000 000_a01-000u-00 000_a01-000u-01 000_a01-000u-02 000_a01-000u-03 + +download_dir1=/export/corpora/LDC/LDC2014T13/data +train_split_url=http://www.openslr.org/resources/50/madcat.train.raw.lineid +test_split_url=http://www.openslr.org/resources/50/madcat.test.raw.lineid +dev_split_url=http://www.openslr.org/resources/50/madcat.dev.raw.lineid +data_split_dir=data/download/datasplits + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh || exit 1; + +if [ -d $data_split_dir ]; then + echo "$0: Not downloading the data splits as it is already there." +else + if [ ! -f $data_split_dir/madcat.train.raw.lineid ]; then + mkdir -p $data_split_dir + echo "$0: Downloading the data splits..." + wget -P $data_split_dir $train_split_url || exit 1; + wget -P $data_split_dir $test_split_url || exit 1; + wget -P $data_split_dir $dev_split_url || exit 1; + fi + echo "$0: Done downloading the data splits" +fi + +if [ -d $download_dir1 ]; then + echo "$0: madcat chinese data directory is present." +else + if [ ! -f $download_dir1/madcat/*.madcat.xml ]; then + echo "$0: please download madcat data..." + fi +fi diff --git a/egs/madcat_zh/v1/local/prepare_dict.sh b/egs/madcat_zh/v1/local/prepare_dict.sh new file mode 100755 index 00000000000..f9cd8387fad --- /dev/null +++ b/egs/madcat_zh/v1/local/prepare_dict.sh @@ -0,0 +1,29 @@ +#!/usr/bin/env bash + +# Copyright 2017 Hossein Hadian +# 2017 Chun Chieh Chang +# 2017 Ashish Arora + +# This script prepares the dictionary. 
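+# It builds a character-level lexicon from the training transcripts: every word
+# is mapped to its sequence of characters, so a (hypothetical) two-character
+# word would get the entry "你好 你 好" in lexicon.txt. The non-silence phones
+# are the characters themselves, and SIL is the only silence phone.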
+ +set -e +dir=data/local/dict +mkdir -p $dir + +#local/prepare_lexicon.py data/train $dir +cat data/train/text | cut -d' ' -f2- | tr ' ' '\n' | sort -u | sed '/^$/d' | \ + python3 -c \ + 'import sys, io; \ + sys.stdin = io.TextIOWrapper(sys.stdin.buffer, encoding="utf8"); \ + sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf8"); \ + [sys.stdout.write(line.strip() + " " + " ".join(list(line.strip())) + "\n") for line in sys.stdin];' > $dir/lexicon.txt + +cut -d' ' -f2- $dir/lexicon.txt | tr ' ' '\n' | sort -u >$dir/nonsilence_phones.txt || exit 1; + +echo ' SIL' >> $dir/lexicon.txt + +echo SIL > $dir/silence_phones.txt + +echo SIL >$dir/optional_silence.txt + +echo -n "" >$dir/extra_questions.txt diff --git a/egs/madcat_zh/v1/local/process_data.py b/egs/madcat_zh/v1/local/process_data.py new file mode 100755 index 00000000000..994a4486420 --- /dev/null +++ b/egs/madcat_zh/v1/local/process_data.py @@ -0,0 +1,197 @@ +#!/usr/bin/env python3 + +# Copyright 2018 Ashish Arora + +""" This script reads the extracted IAM database files and creates + the following files (for the data subset selected via --dataset): + text, utt2spk, images.scp. + + Eg. local/process_data.py data/local data/train data --dataset train + Eg. text file: 000_a01-000u-00 A MOVE to stop Mr. Gaitskell from + utt2spk file: 000_a01-000u-00 000 + images.scp file: 000_a01-000u-00 data/local/lines/a01/a01-000u/a01-000u-00.png +""" + +import argparse +import os +import sys +import xml.dom.minidom as minidom +import unicodedata + +parser = argparse.ArgumentParser(description="Creates text, utt2spk and images.scp files", + epilog="E.g. " + sys.argv[0] + " data/LDC2012T15" + " data/LDC2013T09 data/LDC2013T15 data/madcat.train.raw.lineid " + " data/train data/local/lines ", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) +parser.add_argument('database_path1', + help='Path to the downloaded (and extracted) madcat data') +parser.add_argument('data_splits', + help='Path to file that contains the train/test/dev split information') +parser.add_argument('out_dir', + help='directory location to write output files.') +args = parser.parse_args() + + +def check_file_location(): + """ Returns the complete path of the page image and corresponding + xml file. + Args: + + Returns: + image_file_name (string): complete path and name of the page image. + madcat_file_path (string): complete path and name of the madcat xml file + corresponding to the page image. + """ + + madcat_file_path1 = os.path.join(args.database_path1, 'data', 'madcat', base_name + '.madcat.xml') + + image_file_path1 = os.path.join(args.database_path1, 'data', 'images', base_name + '.tif') + + if os.path.exists(madcat_file_path1): + return madcat_file_path1, image_file_path1, wc_dict1 + + return None, None, None + + +def parse_writing_conditions(writing_conditions): + """ Returns a dictionary which have writing condition of each page image. + Args: + writing_conditions(string): complete path of writing condition file. + + Returns: + (dict): dictionary with key as page image name and value as writing condition. + """ + + with open(writing_conditions) as f: + file_writing_cond = dict() + for line in f: + line_list = line.strip().split("\t") + file_writing_cond[line_list[0]] = line_list[3] + return file_writing_cond + + +def check_writing_condition(wc_dict): + """ Checks if a given page image is writing in a given writing condition. + It is used to create subset of dataset based on writing condition. 
+ Args: + wc_dict (dict): dictionary with key as page image name and value as writing condition. + + Returns: + (bool): True if writing condition matches. + """ + + return True + writing_condition = wc_dict[base_name].strip() + if writing_condition != 'IUC': + return False + + return True + + +def get_word_line_mapping(madcat_file_path): + """ Maps every word in the page image to a corresponding line. + Args: + madcat_file_path (string): complete path and name of the madcat xml file + corresponding to the page image. + + Returns: + """ + + doc = minidom.parse(madcat_file_path) + zone = doc.getElementsByTagName('zone') + for node in zone: + line_id = node.getAttribute('id') + line_word_dict[line_id] = list() + word_image = node.getElementsByTagName('token-image') + for tnode in word_image: + word_id = tnode.getAttribute('id') + line_word_dict[line_id].append(word_id) + word_line_dict[word_id] = line_id + + +def read_text(madcat_file_path): + """ Maps every word in the page image to a corresponding line. + Args: + madcat_file_path (string): complete path and name of the madcat xml file + corresponding to the page image. + + Returns: + dict: Mapping every word in the page image to a corresponding line. + """ + + text_line_word_dict = dict() + doc = minidom.parse(madcat_file_path) + segment = doc.getElementsByTagName('segment') + for node in segment: + token = node.getElementsByTagName('token') + for tnode in token: + segment_id = tnode.getAttribute('id') + ref_word_id = tnode.getAttribute('ref_id') + word = tnode.getElementsByTagName('source')[0].firstChild.nodeValue + word = unicodedata.normalize('NFKC',word) + ref_line_id = word_line_dict[ref_word_id] + if ref_line_id not in text_line_word_dict: + text_line_word_dict[ref_line_id] = list() + text_line_word_dict[ref_line_id].append(word) + return text_line_word_dict + + +def get_line_image_location(): + image_loc_dict = dict() # Stores image base name and location + image_loc_vect = input_image_fh.read().strip().split("\n") + for line in image_loc_vect: + base_name = os.path.basename(line) + location_vect = line.split('/') + location = "/".join(location_vect[:-1]) + image_loc_dict[base_name]=location + return image_loc_dict + + +text_file = os.path.join(args.out_dir, 'text') +text_fh = open(text_file, 'w', encoding='utf-8') +utt2spk_file = os.path.join(args.out_dir, 'utt2spk') +utt2spk_fh = open(utt2spk_file, 'w', encoding='utf-8') +image_file = os.path.join(args.out_dir, 'images.scp') +image_fh = open(image_file, 'w', encoding='utf-8') + +data_path1 = os.path.join(args.database_path1, 'data') + +input_image_file = os.path.join(args.out_dir, 'lines', 'images.scp') +input_image_fh = open(input_image_file, 'r', encoding='utf-8') + +writing_conditions1 = os.path.join(args.database_path1, 'docs', 'writing_conditions.tab') + +wc_dict1 = parse_writing_conditions(writing_conditions1) +image_loc_dict = get_line_image_location() + +image_num = 0 +with open(args.data_splits) as f: + prev_base_name = '' + for line in f: + base_name = os.path.splitext(os.path.splitext(line.split(' ')[0])[0])[0] + if prev_base_name != base_name: + prev_base_name = base_name + madcat_xml_path, image_file_path, wc_dict = check_file_location() + if wc_dict is None or not check_writing_condition(wc_dict): + continue + if madcat_xml_path is not None: + madcat_doc = minidom.parse(madcat_xml_path) + writer = madcat_doc.getElementsByTagName('writer') + writer_id = writer[0].getAttribute('id') + line_word_dict = dict() + word_line_dict = dict() + 
get_word_line_mapping(madcat_xml_path) + text_line_word_dict = read_text(madcat_xml_path) + base_name = os.path.basename(image_file_path) + base_name, b = base_name.split('.tif') + for lineID in sorted(text_line_word_dict): + updated_base_name = "{}_{}.png".format(base_name, str(lineID).zfill(4)) + location = image_loc_dict[updated_base_name] + image_file_path = os.path.join(location, updated_base_name) + line = text_line_word_dict[lineID] + text = ' '.join(''.join(line)) + utt_id = "{}_{}_{}_{}".format(writer_id, str(image_num).zfill(6), base_name, str(lineID).zfill(4)) + text_fh.write(utt_id + ' ' + text + '\n') + utt2spk_fh.write(utt_id + ' ' + writer_id + '\n') + image_fh.write(utt_id + ' ' + image_file_path + '\n') + image_num = image_num + 1 diff --git a/egs/madcat_zh/v1/local/process_segments.py b/egs/madcat_zh/v1/local/process_segments.py new file mode 100755 index 00000000000..3d09c0df190 --- /dev/null +++ b/egs/madcat_zh/v1/local/process_segments.py @@ -0,0 +1,27 @@ +#!/usr/bin/env python3 + +# Copyright 2017 Chun Chieh Chang + +""" This script reads the provided word segmentations of chinese + and ensures that all of them are normalized to the same + unicode form. +""" + +import argparse +import os +import unicodedata + +parser = argparse.ArgumentParser(description="""Takes in word segmentations and normalizes character form.""") +parser.add_argument('segmentation_path', type=str, + help='Path to chinese word segmentation') +parser.add_argument('out_dir', type=str, + help='Where to write output file') +args = parser.parse_args() + +segment_file = os.path.join(args.out_dir, 'segmented_words') +segment_fh = open(segment_file, 'w', encoding='utf-8') + +with open(args.segmentation_path, encoding='utf-8') as f: + for line in f: + line_normalize = unicodedata.normalize('NFKC', line) + segment_fh.write(line_normalize + '\n') diff --git a/egs/madcat_zh/v1/local/score.sh b/egs/madcat_zh/v1/local/score.sh new file mode 100755 index 00000000000..31564d25326 --- /dev/null +++ b/egs/madcat_zh/v1/local/score.sh @@ -0,0 +1,5 @@ +#!/bin/bash + + +steps/scoring/score_kaldi_wer.sh "$@" +steps/scoring/score_kaldi_cer.sh --stage 2 "$@" diff --git a/egs/madcat_zh/v1/local/train_lm.sh b/egs/madcat_zh/v1/local/train_lm.sh new file mode 100755 index 00000000000..a8e2dc71f28 --- /dev/null +++ b/egs/madcat_zh/v1/local/train_lm.sh @@ -0,0 +1,108 @@ +#!/bin/bash + +# Copyright 2016 Vincent Nguyen +# 2016 Johns Hopkins University (author: Daniel Povey) +# 2017 Ashish Arora +# 2017 Hossein Hadian +# Apache 2.0 +# +# This script trains an LM on the LOB+Brown text data and IAM training transcriptions. +# It is based on the example scripts distributed with PocoLM + +# It will check if pocolm is installed and if not will proceed with installation + +set -e +stage=0 + +echo "$0 $@" # Print the command line for logging +. ./utils/parse_options.sh || exit 1; + +dir=data/local/local_lm +lm_dir=${dir}/data + + +mkdir -p $dir +. ./path.sh || exit 1; # for KALDI_ROOT +export PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH +( # First make sure the pocolm toolkit is installed. + cd $KALDI_ROOT/tools || exit 1; + if [ -d pocolm ]; then + echo Not installing the pocolm toolkit since it is already there. 
+ else + echo "$0: Please install the PocoLM toolkit with: " + echo " cd ../../../tools; extras/install_pocolm.sh; cd -" + exit 1; + fi +) || exit 1; + +bypass_metaparam_optim_opt= +# If you want to bypass the metaparameter optimization steps with specific metaparameters +# un-comment the following line, and change the numbers to some appropriate values. +# You can find the values from output log of train_lm.py. +# These example numbers of metaparameters is for 4-gram model (with min-counts) +# running with train_lm.py. +# The dev perplexity should be close to the non-bypassed model. +#bypass_metaparam_optim_opt= +# Note: to use these example parameters, you may need to remove the .done files +# to make sure the make_lm_dir.py be called and tain only 3-gram model +#for order in 3; do +#rm -f ${lm_dir}/${num_word}_${order}.pocolm/.done + +if [ $stage -le 0 ]; then + mkdir -p ${dir}/data + mkdir -p ${dir}/data/text + + echo "$0: Getting the Data sources" + + rm ${dir}/data/text/* 2>/dev/null || true + + # use the validation data as the dev set. + # Note: the name 'dev' is treated specially by pocolm, it automatically + # becomes the dev set. + + cat data/dev/text | cut -d " " -f 2- > ${dir}/data/text/dev.txt + + # use the training data as an additional data source. + # we can later fold the dev data into this. + cat data/train/text | cut -d " " -f 2- > ${dir}/data/text/madcat.txt + + # for reporting perplexities, we'll use the "real" dev set. + # (the validation data is used as ${dir}/data/text/dev.txt to work + # out interpolation weights.) + # note, we can't put it in ${dir}/data/text/, because then pocolm would use + # it as one of the data sources. + cut -d " " -f 2- < data/test/text > ${dir}/data/real_dev_set.txt + + # get the wordlist from IAM text + cat ${dir}/data/text/madcat.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count + cat ${dir}/data/word_count | awk '{print $2}' > ${dir}/data/wordlist +fi + +order=3 + +if [ $stage -le 1 ]; then + # decide on the vocabulary. + # Note: you'd use --wordlist if you had a previously determined word-list + # that you wanted to use. + # Note: if you have more than one order, use a certain amount of words as the + # vocab and want to restrict max memory for 'sort', + echo "$0: training the unpruned LM" + min_counts='train=2 madcat=1' + wordlist=${dir}/data/wordlist + + lm_name="`basename ${wordlist}`_${order}" + if [ -n "${min_counts}" ]; then + lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`" + fi + unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm + train_lm.py --wordlist=${wordlist} --num-splits=5 --warm-start-ratio=1 \ + --limit-unk-history=true \ + ${bypass_metaparam_optim_opt} \ + ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir} + + get_data_prob.py ${dir}/data/real_dev_set.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity' + #log-prob: -5.05603614242 [perplexity = 156.967086371] over 19477.0 words + + mkdir -p ${dir}/data/arpa + format_arpa_lm.py ${unpruned_lm_dir} | gzip -c > ${dir}/data/arpa/${order}gram_unpruned.arpa.gz +fi diff --git a/egs/madcat_zh/v1/local/wer_output_filter b/egs/madcat_zh/v1/local/wer_output_filter new file mode 100755 index 00000000000..5d5290ad8c3 --- /dev/null +++ b/egs/madcat_zh/v1/local/wer_output_filter @@ -0,0 +1,31 @@ +#!/usr/bin/env python3 +# Copyright 2017 Hossein Hadian + +# This is a filter used in scoring. It separates all +# punctuations from words. For e.g. this sentence: + +# "They have come!" 
he said reverently, gripping his
+# hands. "Isn't it a glorious thing! Long awaited."
+
+# is converted to this:
+
+# " They have come ! " he said reverently , gripping his
+# hands . " Isn ' t it a glorious thing ! Long awaited . "
+
+import sys
+import io
+import re
+from collections import OrderedDict
+
+sys.stdin = io.TextIOWrapper(sys.stdin.buffer, encoding="utf8")
+sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf8")
+
+re_dict = OrderedDict([("“","\""), ("”","\"")])
+pattern = re.compile("|".join(re.escape(key) for key in re_dict.keys()))
+
+for line in sys.stdin:
+    words = line.strip().split()
+    uttid = words[0]
+    transcript = ' '.join(words[1:])
+    transcript_fixed = pattern.sub(lambda x: re_dict[x.group()], transcript)
+    sys.stdout.write(uttid + " " + transcript_fixed + "\n")
diff --git a/egs/madcat_zh/v1/path.sh b/egs/madcat_zh/v1/path.sh
new file mode 100755
index 00000000000..2d17b17a84a
--- /dev/null
+++ b/egs/madcat_zh/v1/path.sh
@@ -0,0 +1,6 @@
+export KALDI_ROOT=`pwd`/../../..
+[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
+export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH
+[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
+. $KALDI_ROOT/tools/config/common_path.sh
+export LC_ALL=C
diff --git a/egs/madcat_zh/v1/run.sh b/egs/madcat_zh/v1/run.sh
new file mode 100755
index 00000000000..b3ef370c830
--- /dev/null
+++ b/egs/madcat_zh/v1/run.sh
@@ -0,0 +1,159 @@
+#!/bin/bash
+
+# Copyright  2017  Chun Chieh Chang
+#            2017  Ashish Arora
+#            2017  Hossein Hadian
+
+set -e
+stage=0
+nj=50
+decode_gmm=true
+# madcat_database points to the database path on the JHU grid. If you have not
+# already downloaded the database you can set it to a local directory
+# like "data/download" and follow the instructions
+# in "local/download_data.sh" to download the database.
+# data_split_dir is an unofficial datasplit that is used.
+# The datasplits can be found on http://www.openslr.org/51/
+madcat_database=/export/corpora/LDC/LDC2014T13
+data_split_dir=data/download/datasplits
+overwrite=false
+corpus_dir=/export/corpora5/handwriting_ocr/corpus_data/zh/
+
+. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system.
+           ## This relates to the queue.
+. ./path.sh
+. ./utils/parse_options.sh  # e.g. this parses the above options
+                            # if supplied.
+./local/check_tools.sh
+
+# Start from stage=-1 for using extra corpus text
+if [ $stage -le -1 ]; then
+  echo "$(date): getting corpus text for language modelling..."
+  mkdir -p data/local/text/cleaned
+  cat $corpus_dir/* > data/local/text/zh.txt
+  head -20000 data/local/text/zh.txt > data/local/text/cleaned/val.txt
+  tail -n +20000 data/local/text/zh.txt > data/local/text/cleaned/corpus.txt
+fi
+
+mkdir -p data/{train,test,dev}/lines
+if [ $stage -le 0 ]; then
+
+  if [ -f data/train/text ] && ! $overwrite; then
+    echo "$0: Not processing, probably the script was run from the wrong stage"
+    echo "Exiting with status 1 to avoid data corruption"
+    exit 1;
+  fi
+
+  echo "$0: Preparing data..."
+  local/prepare_data.sh --download-dir1 $madcat_database/data --data-split-dir $data_split_dir
+
+  for dataset in train test dev; do
+    local/extract_lines.sh --nj $nj --cmd $cmd \
+      --download-dir $madcat_database \
+      --dataset-file $data_split_dir/madcat.${dataset}.raw.lineid \
+      data/${dataset}/lines
+  done
+
+  echo "$0: Processing data..."
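+  # local/process_data.py writes text, utt2spk and images.scp for each split;
+  # image/fix_data_dir.sh then validates and fixes the resulting data directory.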
+ for set in dev train test; do + local/process_data.py $madcat_database $data_split_dir/madcat.$set.raw.lineid data/$set + image/fix_data_dir.sh data/$set + done +fi + +mkdir -p data/{train,test,dev}/data +if [ $stage -le 1 ]; then + for dataset in train test dev; do + local/extract_features.sh --nj $nj --cmd $cmd --feat-dim 60 data/$dataset + steps/compute_cmvn_stats.sh data/$dataset + done +fi + +if [ $stage -le 2 ]; then +echo "$0: Preparing dictionary and lang..." + local/prepare_dict.sh + utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 16 --sil-prob 0.95 \ + --position-dependent-phones false \ + data/local/dict "" data/lang/temp data/lang +fi + +if [ $stage -le 3 ]; then + echo "$0: Estimating a language model for decoding..." + local/train_lm.sh + utils/format_lm.sh data/lang data/local/local_lm/data/arpa/3gram_unpruned.arpa.gz \ + data/local/dict/lexicon.txt data/lang_test +fi + +if [ $stage -le 4 ]; then + steps/train_mono.sh --nj $nj --cmd $cmd --totgauss 10000 data/train \ + data/lang exp/mono +fi + +if [ $stage -le 5 ] && $decode_gmm; then + utils/mkgraph.sh --mono data/lang_test exp/mono exp/mono/graph + + steps/decode.sh --nj $nj --cmd $cmd exp/mono/graph data/test \ + exp/mono/decode_test +fi + +if [ $stage -le 6 ]; then + steps/align_si.sh --nj $nj --cmd $cmd data/train data/lang \ + exp/mono exp/mono_ali + + steps/train_deltas.sh --cmd $cmd --context-opts "--context-width=2 --central-position=1" \ + 50000 20000 data/train data/lang \ + exp/mono_ali exp/tri +fi + +if [ $stage -le 7 ] && $decode_gmm; then + utils/mkgraph.sh data/lang_test exp/tri exp/tri/graph + + steps/decode.sh --nj $nj --cmd $cmd exp/tri/graph data/test \ + exp/tri/decode_test +fi + +if [ $stage -le 8 ]; then + steps/align_si.sh --nj $nj --cmd $cmd data/train data/lang \ + exp/tri exp/tri_ali + + steps/train_lda_mllt.sh --cmd $cmd \ + --splice-opts "--left-context=3 --right-context=3" \ + --context-opts "--context-width=2 --central-position=1" 50000 20000 \ + data/train data/lang exp/tri_ali exp/tri2 +fi + +if [ $stage -le 9 ] && $decode_gmm; then + utils/mkgraph.sh data/lang_test exp/tri2 exp/tri2/graph + + steps/decode.sh --nj $nj --cmd $cmd exp/tri2/graph \ + data/test exp/tri2/decode_test +fi + +if [ $stage -le 10 ]; then + steps/align_fmllr.sh --nj $nj --cmd $cmd --use-graphs true \ + data/train data/lang exp/tri2 exp/tri2_ali + + steps/train_sat.sh --cmd $cmd --context-opts "--context-width=2 --central-position=1" \ + 50000 20000 data/train data/lang \ + exp/tri2_ali exp/tri3 +fi + +if [ $stage -le 11 ] && $decode_gmm; then + utils/mkgraph.sh data/lang_test exp/tri3 exp/tri3/graph + + steps/decode_fmllr.sh --nj $nj --cmd $cmd exp/tri3/graph \ + data/test exp/tri3/decode_test +fi + +if [ $stage -le 12 ]; then + steps/align_fmllr.sh --nj $nj --cmd $cmd --use-graphs true \ + data/train data/lang exp/tri3 exp/tri3_ali +fi + +if [ $stage -le 13 ]; then + local/chain/run_cnn_1a.sh +fi + +if [ $stage -le 14 ]; then + local/chain/run_cnn_chainali_1b.sh --chain-model-dir exp/chain/cnn_1a --stage 2 +fi diff --git a/egs/madcat_zh/v1/run_end2end.sh b/egs/madcat_zh/v1/run_end2end.sh new file mode 100755 index 00000000000..7e0fc1e25d1 --- /dev/null +++ b/egs/madcat_zh/v1/run_end2end.sh @@ -0,0 +1,107 @@ +#!/bin/bash +# Copyright 2017 Hossein Hadian + +set -e +stage=0 +nj=50 +username= +password= +# iam_database points to the database path on the JHU grid. 
If you have not +# already downloaded the database you can set it to a local directory +# like "data/download" and follow the instructions +# in "local/prepare_data.sh" to download the database: +madcat_database=/export/corpora/LDC/LDC2014T13 +data_split_dir=data/download/datasplits +overwrite=false +corpus_dir=/export/corpora5/handwriting_ocr/corpus_data/zh/ + +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. +. ./path.sh +. ./utils/parse_options.sh # e.g. this parses the above options + # if supplied. +./local/check_tools.sh + + +# Start from stage=-1 for using extra corpus text +if [ $stage -le -1 ]; then + echo "$(date): getting corpus text for language modelling..." + mkdir -p data/local/text/cleaned + cat $corpus_dir/* > data/local/text/zh.txt + head -20000 data/local/text/zh.txt > data/local/text/cleaned/val.txt + tail -n +20000 data/local/text/zh.txt > data/local/text/cleaned/corpus.txt +fi + +if [ $stage -le 0 ]; then + if [ -f data/train/text ] && ! $overwrite; then + echo "$0: Not processing, probably script have run from wrong stage" + echo "Exiting with status 1 to avoid data corruption" + exit 1; + fi + + echo "$0: Preparing data..." + local/prepare_data.sh --download-dir1 $madcat_database/data --data-split-dir $data_split_dir + + for dataset in train test dev; do + local/extract_lines.sh --nj $nj --cmd $cmd \ + --download-dir $madcat_database \ + --dataset-file $data_split_dir/madcat.${dataset}.raw.lineid \ + data/${dataset}/lines + done + + echo "$0: Processing data..." + for set in dev train test; do + local/process_data.py $madcat_database $data_split_dir/madcat.$set.raw.lineid data/$set + image/fix_data_dir.sh data/$set + done + +fi + +mkdir -p data/{train,test}/data +if [ $stage -le 1 ]; then + image/get_image2num_frames.py --feat-dim 80 data/train # This will be needed for the next command + # The next command creates a "allowed_lengths.txt" file in data/train + # which will be used by local/make_features.py to enforce the images to + # have allowed lengths. The allowed lengths will be spaced by 10% difference in length. + image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train + echo "$0: Preparing the test and train feature files..." + for dataset in train test; do + local/extract_features.sh --nj $nj --cmd $cmd --feat-dim 80 data/$dataset + steps/compute_cmvn_stats.sh data/$dataset + done + utils/fix_data_dir.sh data/train +fi + +if [ $stage -le 2 ]; then + echo "$0: Preparing dictionary and lang..." + local/prepare_dict.sh + utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 16 --sil-prob 0.95 \ + --position-dependent-phones false \ + data/local/dict "" data/lang/temp data/lang +fi + +if [ $stage -le 3 ]; then + echo "$0: calling the flat-start chain recipe..." + local/chain/run_e2e_cnn.sh +fi + +lang_decode=data/lang_test +decode_e2e=true +if [ $stage -le 4 ]; then + echo "$0: Estimating a language model for decoding..." + local/train_lm.sh + utils/format_lm.sh data/lang data/local/local_lm/data/arpa/3gram_unpruned.arpa.gz \ + data/local/dict/lexicon.txt $lang_decode +fi + +if [ $stage -le 5 ] && $decode_e2e; then + echo "$0: $(date) stage 5: decoding end2end setup..." + utils/mkgraph.sh --self-loop-scale 1.0 $lang_decode \ + exp/chain/e2e_cnn_1a/ exp/chain/e2e_cnn_1a/graph || exit 1; + + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 --nj $nj --cmd "$cmd" \ + exp/chain/e2e_cnn_1a/graph data/test exp/chain/e2e_cnn_1a/decode_test || exit 1; + + echo "$0: Done. 
Date: $(date). Results:"
+  local/chain/compare_wer.sh exp/chain/e2e_cnn_1a/
+fi
diff --git a/egs/madcat_zh/v1/steps b/egs/madcat_zh/v1/steps
new file mode 120000
index 00000000000..1b186770dd1
--- /dev/null
+++ b/egs/madcat_zh/v1/steps
@@ -0,0 +1 @@
+../../wsj/s5/steps/
\ No newline at end of file
diff --git a/egs/madcat_zh/v1/utils b/egs/madcat_zh/v1/utils
new file mode 120000
index 00000000000..a3279dc8679
--- /dev/null
+++ b/egs/madcat_zh/v1/utils
@@ -0,0 +1 @@
+../../wsj/s5/utils/
\ No newline at end of file
diff --git a/egs/malach/s5/README.txt b/egs/malach/s5/README.txt
new file mode 100644
index 00000000000..9ea62aae53d
--- /dev/null
+++ b/egs/malach/s5/README.txt
@@ -0,0 +1,64 @@
+# Copyright 2019 IBM Corp. (Author: Michael Picheny) Adapted AMI recipe to MALACH corpus
+
+This s5 recipe for MALACH data is a modified version of the s5b
+recipe for AMI.
+
+You need to download the MALACH data to get started. For information about the MALACH database see:
+USC-SFI MALACH Interviews and Transcripts English - Speech Recognition Edition
+https://catalog.ldc.upenn.edu/LDC2019S11
+
+Once the data is downloaded and untarred, you need to run:
+
+run_prepare_shared.sh - prepares most of the data for the system
+run.sh - builds the system
+
+Beforehand, you need to edit BOTH scripts to point to
+where you downloaded and untarred the data. Find the lines in
+run_prepare_shared.sh and run.sh that say:
+
+malach_dir=dummy_directory
+
+Replace "dummy_directory" with the fully-qualified location of the actual
+data. For example, let's say you copied the data distribution tar file to
+/user/jdoe/malach and untarred it there. That would create a high-level directory called
+/user/jdoe/malach/malach_eng_speech_recognition. You would then change the above line to read:
+
+malach_dir=/user/jdoe/malach/malach_eng_speech_recognition/data
+
+Note that the scripts were "tweaked" to always use sclite scoring
+(vs. default Kaldi scoring).
+
+Other issues that we have run up against in setting up this recipe
+that may or may not impact you:
+
+On the system on which these scripts were developed, Python 2.7 and a
+relatively old version of CUDA are the defaults. We had to modify
+path.sh to point to the right libraries for both Python 3 (a number of
+the scripts use Python 3) and the version of CUDA we were using.
+Please modify path.sh accordingly.
+
+You may also have to modify "configure" (around line 405) in
+/speech7/picheny5_nb/forked_kaldi/kaldi/src to point to where your
+version of CUDA lives.
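+
+As an example (the path below is only illustrative, adjust it to your own
+installation), the CUDA toolkit location can usually also be passed to
+configure directly instead of editing the script:
+
+  cd src && ./configure --cudatk-dir=/usr/local/cuda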
+ +Basic pipeline results summary: + +tri2: +%WER 39.1 | 843 12345 | 66.5 25.1 8.3 5.7 39.1 74.0 | -0.230 | exp/tri2/decode_dev_malach.o4g.kn.pr1-9/ascore_13/dev.ctm.filt.sys + +tri3.si: +%WER 42.8 | 843 12345 | 63.4 28.0 8.5 6.3 42.8 76.9 | -1.079 | exp/tri3/decode_dev_malach.o4g.kn.pr1-9.si/ascore_12/dev.ctm.filt.sys + +tri3: +%WER 34.5 | 843 12345 | 70.7 22.1 7.1 5.2 34.5 69.2 | -0.398 | exp/tri3/decode_dev_malach.o4g.kn.pr1-9/ascore_15/dev.ctm.filt.sys + +tri3_cleaned.si: +%WER 43.1 | 843 12345 | 63.6 28.2 8.2 6.7 43.1 79.0 | -1.095 | exp/tri3_cleaned/decode_dev_malach.o4g.kn.pr1-9.si/ascore_12/dev.ctm.filt.sys + +tri3_cleaned: +%WER 35.1 | 843 12345 | 71.0 22.6 6.4 6.1 35.1 72.7 | -0.431 | exp/tri3_cleaned/decode_dev_malach.o4g.kn.pr1-9/ascore_13/dev.ctm.filt.sys + +Results using the chain model, and rescoring the chain model with various LSTMs, can be found in s5/local/chain/run_tdnn.sh + + diff --git a/egs/malach/s5/cmd.sh b/egs/malach/s5/cmd.sh new file mode 100644 index 00000000000..166bfdd450b --- /dev/null +++ b/egs/malach/s5/cmd.sh @@ -0,0 +1,18 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="run.pl --mem 1G" +export decode_cmd="run.pl --mem 2G" +# the use of cuda_cmd is deprecated, used only in 'nnet1', +export cuda_cmd="run.pl --gpu 1 --mem 20G" + + diff --git a/egs/malach/s5/conf/decode.conf b/egs/malach/s5/conf/decode.conf new file mode 100644 index 00000000000..c8a0ece58bf --- /dev/null +++ b/egs/malach/s5/conf/decode.conf @@ -0,0 +1,3 @@ +beam=11.0 # beam for decoding. Was 13.0 in the scripts. +first_beam=8.0 # beam for 1st-pass decoding in SAT. + diff --git a/egs/malach/s5/conf/mfcc.conf b/egs/malach/s5/conf/mfcc.conf new file mode 100644 index 00000000000..a1aa3d6c158 --- /dev/null +++ b/egs/malach/s5/conf/mfcc.conf @@ -0,0 +1,2 @@ +--use-energy=false # only non-default option. +--sample-frequency=16000 diff --git a/egs/malach/s5/conf/mfcc_hires.conf b/egs/malach/s5/conf/mfcc_hires.conf new file mode 100644 index 00000000000..434834a6725 --- /dev/null +++ b/egs/malach/s5/conf/mfcc_hires.conf @@ -0,0 +1,10 @@ +# config for high-resolution MFCC features, intended for neural network training +# Note: we keep all cepstra, so it has the same info as filterbank features, +# but MFCC is more easily compressible (because less correlated) which is why +# we prefer this method. +--use-energy=false # use average of log energy, not energy. +--num-mel-bins=40 # similar to Google's setup. +--num-ceps=40 # there is no dimensionality reduction. +--low-freq=20 # low cutoff frequency for mel bins... this is high-bandwidth data, so + # there might be some information at the low end. 
+--high-freq=-400 # high cutoff frequently, relative to Nyquist of 8000 (=7600) diff --git a/egs/malach/s5/conf/online_cmvn.conf b/egs/malach/s5/conf/online_cmvn.conf new file mode 100644 index 00000000000..7748a4a4dd3 --- /dev/null +++ b/egs/malach/s5/conf/online_cmvn.conf @@ -0,0 +1 @@ +# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh diff --git a/egs/malach/s5/local/chain/compare_wer_general.sh b/egs/malach/s5/local/chain/compare_wer_general.sh new file mode 100755 index 00000000000..9bd017414ab --- /dev/null +++ b/egs/malach/s5/local/chain/compare_wer_general.sh @@ -0,0 +1,72 @@ +#!/bin/bash + +echo -n "System " +for x in $*; do printf " % 10s" $x; done +echo + +#for d in exp/chain_cleaned/tdnn*/decode_*; do grep Sum $d/*sc*/*ys | utils/best_wer.sh; done|grep eval_hires + + +echo -n "WER on dev " +for x in $*; do + wer=$(grep Sum exp/chain_cleaned/${x}/decode_dev/*sc*/*ys | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "Rescore with lstm 1a " +for x in $*; do + wer=$(grep Sum exp/chain_cleaned/${x}/decode_dev*tdnn_1a/*sc*/*ys | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "Rescore with lstm 1b " +for x in $*; do + wer=$(grep Sum exp/chain_cleaned/${x}/decode_dev*tdnn_1b/*sc*/*ys | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "Rescore with lstm bs_1a " +for x in $*; do + wer=$(grep Sum exp/chain_cleaned/${x}/decode_dev*tdnn_bs_1a/*sc*/*ys | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "Final train prob " +for x in $*; do + if [[ "${x}" != *online* ]]; then + prob=$(grep Overall exp/chain_cleaned/${x}/log/compute_prob_train.final.log | grep -v xent | awk '{print $8}') + printf "% 10s" $prob + fi +done +echo + +echo -n "Final valid prob " +for x in $*; do + if [[ "${x}" != *online* ]]; then + prob=$(grep Overall exp/chain_cleaned/${x}/log/compute_prob_valid.final.log | grep -v xent | awk '{print $8}') + printf "% 10s" $prob + fi +done +echo + +echo -n "Final train prob (xent) " +for x in $*; do + if [[ "${x}" != *online* ]]; then + prob=$(grep Overall exp/chain_cleaned/${x}/log/compute_prob_train.final.log | grep -w xent | awk '{print $8}') + printf "% 10s" $prob + fi +done +echo + +echo -n "Final valid prob (xent) " +for x in $*; do + if [[ "${x}" != *online* ]]; then + prob=$(grep Overall exp/chain_cleaned/${x}/log/compute_prob_valid.final.log | grep -w xent | awk '{print $8}') + printf "% 10s" $prob + fi +done +echo diff --git a/egs/malach/s5/local/chain/run_tdnn.sh b/egs/malach/s5/local/chain/run_tdnn.sh new file mode 120000 index 00000000000..ede0f99dc57 --- /dev/null +++ b/egs/malach/s5/local/chain/run_tdnn.sh @@ -0,0 +1 @@ +egs/malach/s5/local/chain/tuning/run_tdnn_1a.sh \ No newline at end of file diff --git a/egs/malach/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/malach/s5/local/chain/tuning/run_tdnn_1a.sh new file mode 100644 index 00000000000..007e94ef1a3 --- /dev/null +++ b/egs/malach/s5/local/chain/tuning/run_tdnn_1a.sh @@ -0,0 +1,259 @@ +#!/bin/bash + +# Copyright 2019 IBM Corp. (Author: Michael Picheny) Adapted AMI recipe to MALACH corpus + +# same as 1h in corresponding AMI s5b recipe but replacing proportional-shrink with l2-regularize. 
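+# (the l2 values used further down in this script are opts="l2-regularize=0.02"
+# for the hidden layers and output_opts="l2-regularize=0.004" for the output layers)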
+ +# local/chain/compare_wer_general.sh tdnn1a_sp_bi + +# System tdnn1a_sp_bi +# WER on dev 23.7 +# Rescore with lstm 1a 21.1 +# Rescore with lstm 1b 20.8 +# Rescore with lstm bs_1a 20.7 +# Final train prob -0.118005 +# Final valid prob -0.167522 +# Final train prob (xent) -2.06349 +# Final valid prob (xent) -2.29166 + +# steps/info/chain_dir_info.pl exp/chain_cleaned/tdnn1a_sp_bi +# exp/chain_cleaned/tdnn1i_sp_bi: num-iters=918 nj=2..4 num-params=7.9M dim=40+100->3696 combine=-0.133->-0.130 (over 19) xent:train/valid[610,917,final]=(-2.37,-2.10,-2.06/-2.60,-2.35,-2.29) logprob:train/valid[610,917,final]=(-0.143,-0.124,-0.118/-0.191,-0.173,-0.168) + +3set -e -o pipefail +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +min_seg_len=1.55 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +ivector_transform_type=pca +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +num_epochs=9 +remove_egs=true + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_affix=1a #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 13 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4200 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + +xent_regularize=0.1 + +if [ $stage -le 15 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.02" + output_opts="l2-regularize=0.004" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=450 $opts + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=450 $opts + relu-batchnorm-layer name=tdnn3 dim=450 $opts + relu-batchnorm-layer name=tdnn4 input=Append(-1,0,1) dim=450 $opts + relu-batchnorm-layer name=tdnn5 dim=450 $opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=450 $opts + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=450 $opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=450 $opts + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=450 $opts + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn9 dim=450 target-rms=0.5 $opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
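+  # With xent_regularize=0.1 as set above, learning_rate_factor = 0.5 / 0.1 = 5,
+  # i.e. the xent output layer below learns 5 times faster than the chain output.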
+ relu-batchnorm-layer name=prefinal-xent input=tdnn9 dim=450 target-rms=0.5 $opts + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts + +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 16 ]; then + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 4 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + +graph_dir=$dir/graph_${LM} +if [ $stage -le 17 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + rm $dir/.error 2>/dev/null || true + for decode_set in dev; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set} || exit 1; + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/malach/s5/local/convert2stm.pl b/egs/malach/s5/local/convert2stm.pl new file mode 100755 index 00000000000..f0b85c65b42 --- /dev/null +++ b/egs/malach/s5/local/convert2stm.pl @@ -0,0 +1,101 @@ +#!/usr/bin/env perl + +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. 
+# 2013 University of Edinburgh (Author: Pawel Swietojanski) + +# This takes as standard input path to directory containing all the usual +# data files - segments, text, utt2spk and reco2file_and_channel and creates stm + +if (@ARGV < 1 || @ARGV > 2) { + print STDERR "Usage: convert2stm.pl [] > stm-file\n"; + exit(1); +} + +$dir=shift @ARGV; +$utt2spk_file=shift @ARGV || 'utt2spk'; + +$segments = "$dir/segments"; +$reco2file_and_channel = "$dir/reco2file_and_channel"; +$text = "$dir/text"; +$utt2spk_file = "$dir/$utt2spk_file"; + +open(S, "<$segments") || die "opening segments file $segments"; +while() { + @A = split(" ", $_); + @A > 3 || die "convert2stm: Bad line in segments file: $_"; + ($utt, $recording_id, $begin_time, $end_time) = @A[0..3]; + $utt2reco{$utt} = $recording_id; + $begin{$utt} = $begin_time; + $end{$utt} = $end_time; +} +close(S); + +open(R, "<$reco2file_and_channel") || die "open reco2file_and_channel file $reco2file_and_channel"; +while() { + @A = split(" ", $_); + @A == 3 || die "convert2stm: Bad line in reco2file_and_channel file: $_"; + ($recording_id, $file, $channel) = @A; + $reco2file{$recording_id} = $file; + $reco2channel{$recording_id} = $channel; +} +close(R); + +open(T, "<$text") || die "open text file $text"; +while() { + @A = split(" ", $_); + $utt = shift @A; + $utt2text{$utt} = "@A"; +} +close(T); + +open(U, "<$utt2spk_file") || die "open utt2spk file $utt2spk_file"; +while() { + @A = split(" ", $_); + @A == 2 || die "convert2stm: Bad line in utt2spk file: $_"; + ($utt, $spk) = @A; + $utt2spk{$utt} = $spk; +} +close(U); + +# Now generate the stm file +foreach $utt (sort keys(%utt2reco)) { + + # lines look like: + # [