From d9641482ec6b72e49733ed8fbce0bfafe2da1562 Mon Sep 17 00:00:00 2001 From: Leonard Lausen Date: Wed, 18 Mar 2020 20:57:05 -0700 Subject: [PATCH 01/42] Fix race condition in directory creation (#1187) --- scripts/language_model/transformer/model.py | 10 +--------- src/gluonnlp/data/baidu_ernie_data.py | 3 +-- src/gluonnlp/data/conll.py | 3 +-- src/gluonnlp/data/corpora/google_billion_word.py | 3 +-- .../data/corpora/large_text_compression_benchmark.py | 3 +-- src/gluonnlp/data/corpora/wikitext.py | 3 +-- src/gluonnlp/data/glue.py | 3 +-- src/gluonnlp/data/intent_slot.py | 3 +-- src/gluonnlp/data/question_answering.py | 3 +-- src/gluonnlp/data/sentiment.py | 6 ++---- src/gluonnlp/data/super_glue.py | 3 +-- src/gluonnlp/data/transforms.py | 10 +--------- src/gluonnlp/data/translation.py | 3 +-- src/gluonnlp/data/word_embedding_evaluation.py | 3 +-- src/gluonnlp/utils/files.py | 11 ++--------- 15 files changed, 17 insertions(+), 53 deletions(-) diff --git a/scripts/language_model/transformer/model.py b/scripts/language_model/transformer/model.py index de4d7dbbe6..b4238b6291 100644 --- a/scripts/language_model/transformer/model.py +++ b/scripts/language_model/transformer/model.py @@ -18,7 +18,6 @@ # pylint: disable=wildcard-import, arguments-differ """Module for pre-defined NLP models.""" -import errno import os import time import zipfile @@ -265,14 +264,7 @@ def _get_xlnet_tokenizer(dataset_name, root, do_lower_case=False): else: print('Tokenizer file is not found. Downloading.') - if not os.path.exists(root): - try: - os.makedirs(root) - except OSError as e: - if e.errno == errno.EEXIST and os.path.isdir(root): - pass - else: - raise e + os.makedirs(root, exist_ok=True) repo_url = _get_repo_url() prefix = str(time.time()) diff --git a/src/gluonnlp/data/baidu_ernie_data.py b/src/gluonnlp/data/baidu_ernie_data.py index 604dc5413f..cb9f517d86 100644 --- a/src/gluonnlp/data/baidu_ernie_data.py +++ b/src/gluonnlp/data/baidu_ernie_data.py @@ -34,8 +34,7 @@ def __init__(self, root=None, dataset_name=None, segment=None, filename=None, ** assert (filename or (root and dataset_name and segment)) if not filename: root = os.path.expanduser(root) - if not os.path.isdir(root): - os.makedirs(root) + os.makedirs(root, exist_ok=True) self._root = root download_data_path = os.path.join(self._root, 'task_data.tgz') if not os.path.exists(download_data_path): diff --git a/src/gluonnlp/data/conll.py b/src/gluonnlp/data/conll.py index b2602e944e..951b91e9e7 100644 --- a/src/gluonnlp/data/conll.py +++ b/src/gluonnlp/data/conll.py @@ -39,8 +39,7 @@ class _CoNLLSequenceTagging(SimpleDataset): def __init__(self, segment, root, has_comment=False): root = os.path.expanduser(root) - if not os.path.isdir(root): - os.makedirs(root) + os.makedirs(root, exist_ok=True) self._segment = segment self._root = root self._has_comment = has_comment diff --git a/src/gluonnlp/data/corpora/google_billion_word.py b/src/gluonnlp/data/corpora/google_billion_word.py index 36128a4dbc..4a5f00a2ad 100644 --- a/src/gluonnlp/data/corpora/google_billion_word.py +++ b/src/gluonnlp/data/corpora/google_billion_word.py @@ -84,8 +84,7 @@ class GBWStream(SimpleDatasetStream): def __init__(self, segment='train', skip_empty=True, bos=None, eos=EOS_TOKEN, root=os.path.join(get_home_dir(), 'datasets', 'gbw')): root = os.path.expanduser(root) - if not os.path.isdir(root): - os.makedirs(root) + os.makedirs(root, exist_ok=True) self._root = root self._dir = os.path.join(root, '1-billion-word-language-modeling-benchmark-r13output') self._namespace = 
'gluon/dataset/gbw' diff --git a/src/gluonnlp/data/corpora/large_text_compression_benchmark.py b/src/gluonnlp/data/corpora/large_text_compression_benchmark.py index 3d9a8e20f7..1135cb1a94 100644 --- a/src/gluonnlp/data/corpora/large_text_compression_benchmark.py +++ b/src/gluonnlp/data/corpora/large_text_compression_benchmark.py @@ -49,8 +49,7 @@ class _LargeTextCompressionBenchmark(CorpusDataset): def __init__(self, root, segment, **kwargs): root = os.path.expanduser(root) - if not os.path.isdir(root): - os.makedirs(root) + os.makedirs(root, exist_ok=True) self._root = root self._segment = segment self._namespace = 'gluon/dataset/large_text_compression_benchmark' diff --git a/src/gluonnlp/data/corpora/wikitext.py b/src/gluonnlp/data/corpora/wikitext.py index 13bdd866b3..dc8312d5e8 100644 --- a/src/gluonnlp/data/corpora/wikitext.py +++ b/src/gluonnlp/data/corpora/wikitext.py @@ -36,8 +36,7 @@ class _WikiText(CorpusDataset): def __init__(self, namespace, segment, bos, eos, flatten, skip_empty, root, **kwargs): root = os.path.expanduser(root) - if not os.path.isdir(root): - os.makedirs(root) + os.makedirs(root, exist_ok=True) self._root = root self._namespace = 'gluon/dataset/{}'.format(namespace) self._segment = segment diff --git a/src/gluonnlp/data/glue.py b/src/gluonnlp/data/glue.py index 853c9ad833..f1518cafee 100644 --- a/src/gluonnlp/data/glue.py +++ b/src/gluonnlp/data/glue.py @@ -35,8 +35,7 @@ class _GlueDataset(TSVDataset): def __init__(self, root, data_file, **kwargs): root = os.path.expanduser(root) - if not os.path.isdir(root): - os.makedirs(root) + os.makedirs(root, exist_ok=True) segment, zip_hash, data_hash = data_file self._root = root filename = os.path.join(self._root, '%s.tsv' % segment) diff --git a/src/gluonnlp/data/intent_slot.py b/src/gluonnlp/data/intent_slot.py index 9fc7311d79..6f487f4349 100644 --- a/src/gluonnlp/data/intent_slot.py +++ b/src/gluonnlp/data/intent_slot.py @@ -44,8 +44,7 @@ class _BaseICSLDataset(SimpleDataset): """ def __init__(self, segment, root): root = os.path.expanduser(root) - if not os.path.isdir(root): - os.makedirs(root) + os.makedirs(root, exist_ok=True) self._segment = segment self._root = root self._intent_vocab = None diff --git a/src/gluonnlp/data/question_answering.py b/src/gluonnlp/data/question_answering.py index 4a065a62d5..0fb980a158 100644 --- a/src/gluonnlp/data/question_answering.py +++ b/src/gluonnlp/data/question_answering.py @@ -125,8 +125,7 @@ def __init__(self, segment='train', version='1.1', root = os.path.expanduser(root) - if not os.path.isdir(root): - os.makedirs(root) + os.makedirs(root, exist_ok=True) self._root = root self._segment = segment diff --git a/src/gluonnlp/data/sentiment.py b/src/gluonnlp/data/sentiment.py index 0f4c722b90..c1e4e8b2ca 100644 --- a/src/gluonnlp/data/sentiment.py +++ b/src/gluonnlp/data/sentiment.py @@ -43,8 +43,7 @@ class SentimentDataset(SimpleDataset): """ def __init__(self, segment, root): root = os.path.expanduser(root) - if not os.path.isdir(root): - os.makedirs(root) + os.makedirs(root, exist_ok=True) self._root = root self._segment = segment self._get_data() @@ -136,8 +135,7 @@ def __init__(self, segment='train', root=os.path.join(get_home_dir(), 'datasets' 'unsup': ('unsup.json', 'f908a632b7e7d7ecf113f74c968ef03fadfc3c6c')} root = os.path.expanduser(root) - if not os.path.isdir(root): - os.makedirs(root) + os.makedirs(root, exist_ok=True) self._root = root self._segment = segment self._get_data() diff --git a/src/gluonnlp/data/super_glue.py b/src/gluonnlp/data/super_glue.py index 
67385f1296..e3d7ddf5d5 100644 --- a/src/gluonnlp/data/super_glue.py +++ b/src/gluonnlp/data/super_glue.py @@ -38,8 +38,7 @@ class _SuperGlueDataset(_JsonlDataset): def __init__(self, root, data_file): root = os.path.expanduser(root) - if not os.path.isdir(root): - os.makedirs(root) + os.makedirs(root, exist_ok=True) segment, zip_hash, data_hash = data_file self._root = root filename = os.path.join(self._root, '%s.jsonl' % segment) diff --git a/src/gluonnlp/data/transforms.py b/src/gluonnlp/data/transforms.py index fc7aff91a5..6bf88ca67e 100644 --- a/src/gluonnlp/data/transforms.py +++ b/src/gluonnlp/data/transforms.py @@ -29,7 +29,6 @@ 'GPT2BPETokenizer', 'GPT2BPEDetokenizer' ] -import errno import functools import io import os @@ -1128,14 +1127,7 @@ def __init__(self, root=os.path.join(get_home_dir(), 'models')): print('Detected mismatch in the content of BPE rank file. Downloading again.') else: print('BPE rank file is not found. Downloading.') - if not os.path.exists(root): - try: - os.makedirs(root) - except OSError as e: - if e.errno == errno.EEXIST and os.path.isdir(root): - pass - else: - raise e + os.makedirs(root, exist_ok=True) prefix = str(time.time()) zip_file_path = os.path.join(root, prefix + file_name) diff --git a/src/gluonnlp/data/translation.py b/src/gluonnlp/data/translation.py index 6ab02e0ce8..c25886c718 100644 --- a/src/gluonnlp/data/translation.py +++ b/src/gluonnlp/data/translation.py @@ -62,8 +62,7 @@ def __init__(self, namespace, segment, src_lang, tgt_lang, root): self._tgt_vocab = None self._pair_key = _get_pair_key(src_lang, tgt_lang) root = os.path.expanduser(root) - if not os.path.isdir(root): - os.makedirs(root) + os.makedirs(root, exist_ok=True) self._root = root if isinstance(segment, str): segment = [segment] diff --git a/src/gluonnlp/data/word_embedding_evaluation.py b/src/gluonnlp/data/word_embedding_evaluation.py index a2e6becfb8..49417463bf 100644 --- a/src/gluonnlp/data/word_embedding_evaluation.py +++ b/src/gluonnlp/data/word_embedding_evaluation.py @@ -50,8 +50,7 @@ class _Dataset(SimpleDataset): def __init__(self, root): self.root = os.path.expanduser(root) - if not os.path.isdir(self.root): - os.makedirs(self.root) + os.makedirs(self.root, exist_ok=True) self._download_data() super(_Dataset, self).__init__(self._get_data()) diff --git a/src/gluonnlp/utils/files.py b/src/gluonnlp/utils/files.py index 0a2e8c292f..7403bcfc22 100644 --- a/src/gluonnlp/utils/files.py +++ b/src/gluonnlp/utils/files.py @@ -78,13 +78,7 @@ def mkdir(dirname): %(dirname, C.S3_PREFIX)) return dirname = os.path.expanduser(dirname) - if not os.path.exists(dirname): - try: - os.makedirs(dirname) - except OSError as e: - # errno 17 means the file already exists - if e.errno != 17: - raise e + os.makedirs(dirname, exist_ok=True) class _TempFilePath: """A TempFilePath that provides a path to a temporarily file, and automatically @@ -92,8 +86,7 @@ class _TempFilePath: """ def __init__(self): self.temp_dir = os.path.join(tempfile.gettempdir(), str(hash(os.times()))) - if not os.path.exists(self.temp_dir): - os.makedirs(self.temp_dir) + os.makedirs(self.temp_dir, exist_ok=True) def __enter__(self): self.temp_path = os.path.join(self.temp_dir, str(hash(os.times()))) From 7533bba394b6813525d315c309dbd50da17781b9 Mon Sep 17 00:00:00 2001 From: "WANG, Chen" Date: Sat, 28 Mar 2020 04:20:53 +0800 Subject: [PATCH 02/42] Add option in md2ipynb for disable computing (#1191) Useful to build the website without executing all jupyter notebooks --- docs/md2ipynb.py | 20 ++++++++++++-------- 1 
file changed, 12 insertions(+), 8 deletions(-) diff --git a/docs/md2ipynb.py b/docs/md2ipynb.py index 3dfa91959b..08eefd7d5b 100644 --- a/docs/md2ipynb.py +++ b/docs/md2ipynb.py @@ -1,10 +1,15 @@ +import argparse import os -import sys import time -import notedown + import nbformat +import notedown -assert len(sys.argv) == 2, 'usage: input.md' +parser = argparse.ArgumentParser(description='Convert md file to ipynb files.') +parser.add_argument('input', help='input.md', type=str) +parser.add_argument('-d', '--disable_compute', + help='Disable computing python scripts', action="store_true") +args = parser.parse_args() # timeout for each notebook, in sec timeout = 40 * 60 @@ -12,10 +17,8 @@ # the files will be ignored for execution ignore_execution = [] -input_path = sys.argv[1] - # Change working directory to directory of input file -input_dir, input_fn = os.path.split(input_path) +input_dir, input_fn = os.path.split(args.input) os.chdir(input_dir) output_fn = '.'.join(input_fn.split('.')[:-1] + ['ipynb']) @@ -28,8 +31,9 @@ if not any([i in input_fn for i in ignore_execution]): tic = time.time() - notedown.run(notebook, timeout) - print('=== Finished evaluation in %f sec'%(time.time()-tic)) + if not args.disable_compute: + notedown.run(notebook, timeout) + print('=== Finished evaluation in %f sec' % (time.time() - tic)) # write # need to add language info to for syntax highlight From f9e9dcd7a9da29841ff78b37df0a1aa2e8be1f18 Mon Sep 17 00:00:00 2001 From: Sheng Zha Date: Fri, 27 Mar 2020 22:04:21 -0700 Subject: [PATCH 03/42] [DOC] update slack invitation link (#1186) --- docs/community/index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/community/index.rst b/docs/community/index.rst index f9b1627e02..b43c7dbab4 100644 --- a/docs/community/index.rst +++ b/docs/community/index.rst @@ -30,7 +30,7 @@ Community :title: GluonNLP Slack Channel :link: https://apache-mxnet.slack.com/messages/CCCDM10V9 - #gluon-nlp Slack channel. Click the `sign-up link `_ to register. + #gluon-nlp Slack channel. Click the `sign-up link `_ to register. .. card:: From 0d829f722d98daaebbcaf6aa597d51fe0b4a8ab1 Mon Sep 17 00:00:00 2001 From: "WANG, Chen" Date: Wed, 1 Apr 2020 14:42:46 +0800 Subject: [PATCH 04/42] [DOC]Add a new argument to Makefile for making docs without computing (#1192) * Add a new argument to Makefile * Change argument name --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 90b1b01e19..776058cb64 100644 --- a/Makefile +++ b/Makefile @@ -93,7 +93,7 @@ compile_notebooks: if [ -f $$TARGETNAME ]; then \ echo $$TARGETNAME exists. Skipping compilation of $$BASENAME in Makefile. 
; \ else \ - python $(MD2IPYNB) $$BASENAME ; \ + python $(MD2IPYNB) $(MD2IPYNB_OPTION) $$BASENAME ; \ fi ; \ cd - ; \ done; From f092e60c1141ab4c5ea01a2b25d90ff3aedc857b Mon Sep 17 00:00:00 2001 From: "WANG, Chen" Date: Fri, 3 Apr 2020 01:04:40 +0800 Subject: [PATCH 05/42] [Bugfix] Fix invoking docs/md2ipynb.py from Makefile (#1193) --- docs/md2ipynb.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/md2ipynb.py b/docs/md2ipynb.py index 08eefd7d5b..21e466b79c 100644 --- a/docs/md2ipynb.py +++ b/docs/md2ipynb.py @@ -19,7 +19,8 @@ # Change working directory to directory of input file input_dir, input_fn = os.path.split(args.input) -os.chdir(input_dir) +if input_dir: + os.chdir(input_dir) output_fn = '.'.join(input_fn.split('.')[:-1] + ['ipynb']) From 9ccbb55afd55cdc39a1d262a293d62e18dc066a8 Mon Sep 17 00:00:00 2001 From: "WANG, Chen" Date: Sat, 4 Apr 2020 05:28:09 +0800 Subject: [PATCH 06/42] Use python3 to invoke MD2IPYNB in Makefile (#1195) --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 776058cb64..55e0bb3014 100644 --- a/Makefile +++ b/Makefile @@ -93,7 +93,7 @@ compile_notebooks: if [ -f $$TARGETNAME ]; then \ echo $$TARGETNAME exists. Skipping compilation of $$BASENAME in Makefile. ; \ else \ - python $(MD2IPYNB) $(MD2IPYNB_OPTION) $$BASENAME ; \ + python3 $(MD2IPYNB) $(MD2IPYNB_OPTION) $$BASENAME ; \ fi ; \ cd - ; \ done; From ef59a379e0888a27cbaa9676b26f749a27dff406 Mon Sep 17 00:00:00 2001 From: "WANG, Chen" Date: Tue, 7 Apr 2020 07:34:39 +0800 Subject: [PATCH 07/42] Disable nbsphinx plugin execute code chunks (#1197) all the computation should be done in md2ipynb.py file and nbsphinx should not compute code chunks in the normal process. --- docs/conf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/conf.py b/docs/conf.py index c48d913674..65ebbc8db4 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -63,6 +63,7 @@ nbsphinx_kernel_name = 'python3' nbsphinx_allow_errors = True nbsphinx_timeout = 1200 +nbsphinx_execute = 'never' html_sourcelink_suffix = '' html_context = { From b3043b71bbf98d1f904e2b23b7f393618ca12859 Mon Sep 17 00:00:00 2001 From: "WANG, Chen" Date: Tue, 7 Apr 2020 07:35:23 +0800 Subject: [PATCH 08/42] Change Makefile:distribute interpreter to python3 (#1196) --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 55e0bb3014..e982019464 100644 --- a/Makefile +++ b/Makefile @@ -110,4 +110,4 @@ test: py.test -v --capture=no --durations=0 tests/unittest scripts distribute: dist_scripts dist_notebooks - python setup.py sdist + python3 setup.py sdist From 4c591db71f283b2de57c2b5f956a8d9a925eff34 Mon Sep 17 00:00:00 2001 From: Haibin Lin Date: Tue, 7 Apr 2020 10:45:44 -0700 Subject: [PATCH 09/42] [usability] Add proper error msg to dataset_name=None (#1149) * add err msg * add err msg * Update utils.py * Update utils.py * Update baidu_ernie_data.py * Update baidu_ernie_data.py Co-authored-by: Lin --- src/gluonnlp/model/utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/gluonnlp/model/utils.py b/src/gluonnlp/model/utils.py index 4eca81abbc..16d2ae54bb 100644 --- a/src/gluonnlp/model/utils.py +++ b/src/gluonnlp/model/utils.py @@ -276,6 +276,8 @@ def _load_vocab(dataset_name, vocab, root, cls=None): def _load_pretrained_params(net, model_name, dataset_name, root, ctx, ignore_extra=False, allow_missing=False): + assert isinstance(dataset_name, str), \ + 'dataset_name(str) is required when loading pretrained models. 
Got {}'.format(dataset_name) path = '_'.join([model_name, dataset_name]) model_file = model_store.get_model_file(path, root=root) net.load_parameters(model_file, ctx=ctx, ignore_extra=ignore_extra, allow_missing=allow_missing) From c0d2f283a2bc190e9a64d18443a6bf5c0be882d6 Mon Sep 17 00:00:00 2001 From: "WANG, Chen" Date: Wed, 8 Apr 2020 02:51:53 +0800 Subject: [PATCH 10/42] [DEV] Exclude .idea from make clean (#1194) --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index e982019464..5592e7a8bf 100644 --- a/Makefile +++ b/Makefile @@ -81,7 +81,7 @@ docs: compile_notebooks distribute sed -i.bak 's/2196f3/178dc9/g' docs/_build/html/_static/sphinx_materialdesign_theme.css; clean: - git clean -ff -d -x --exclude="$(ROOTDIR)/tests/data/*" --exclude="$(ROOTDIR)/conda/" + git clean -ff -d -x --exclude="$(ROOTDIR)/tests/data/*" --exclude="$(ROOTDIR)/conda/" --exclude="$(ROOTDIR)/.idea/" compile_notebooks: for f in $(shell find docs/examples -type f -name '*.md' -print) ; do \ From f2947690aa652df71b4e3969e57e31979acf87eb Mon Sep 17 00:00:00 2001 From: "WANG, Chen" Date: Thu, 9 Apr 2020 01:53:35 +0800 Subject: [PATCH 11/42] [DEV] Add new command for building website locally (#1198) --- Makefile | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 5592e7a8bf..b2f4ab5653 100644 --- a/Makefile +++ b/Makefile @@ -57,8 +57,7 @@ ratcheck: ci/rat/apache-rat.jar echo "SUCCESS: There are no files with an Unknown License."; \ fi -docs: compile_notebooks distribute - make -C docs html SPHINXOPTS=-W +docs: docs_local for f in $(shell find docs/examples -type f -name '*.md' -print) ; do \ FILE=`echo $$f | sed 's/docs\///g'` ; \ DIR=`dirname $$FILE` ; \ @@ -80,6 +79,9 @@ docs: compile_notebooks distribute sed -i.bak 's/33\,150\,243/23\,141\,201/g' docs/_build/html/_static/material-design-lite-1.3.0/material.blue-deep_orange.min.css; sed -i.bak 's/2196f3/178dc9/g' docs/_build/html/_static/sphinx_materialdesign_theme.css; +docs_local: compile_notebooks distribute + make -C docs html SPHINXOPTS=-W + clean: git clean -ff -d -x --exclude="$(ROOTDIR)/tests/data/*" --exclude="$(ROOTDIR)/conda/" --exclude="$(ROOTDIR)/.idea/" From 1d54ad8058ec4f8d54b8585ef0819d39140fa74c Mon Sep 17 00:00:00 2001 From: "WANG, Chen" Date: Wed, 15 Apr 2020 03:25:12 +0800 Subject: [PATCH 12/42] Add missing development requirements to setup.py (#1200) And remove duplicate imports --- setup.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/setup.py b/setup.py index 91b1b3e893..3afd338d18 100644 --- a/setup.py +++ b/setup.py @@ -2,15 +2,14 @@ import io import os import re -import shutil -import sys + from setuptools import setup, find_packages, Extension def read(*names, **kwargs): with io.open( - os.path.join(os.path.dirname(__file__), *names), - encoding=kwargs.get("encoding", "utf8") + os.path.join(os.path.dirname(__file__), *names), + encoding=kwargs.get("encoding", "utf8") ) as fp: return fp.read() @@ -64,7 +63,7 @@ def find_version(*file_paths): extras_require={ 'extras': [ 'spacy', - 'nltk', + 'nltk>=3.2.5', 'sacremoses', 'scipy', 'numba>=0.45', @@ -89,6 +88,9 @@ def find_version(*file_paths): 'sphinx-autodoc-typehints', 'nbsphinx', 'flaky', + 'notedown', + 'matplotlib', + 'Image', ], }, ext_modules=[ From 7794395665136f65f8d692d998fcabea47cffc61 Mon Sep 17 00:00:00 2001 From: "WANG, Chen" Date: Fri, 17 Apr 2020 09:59:41 +0800 Subject: [PATCH 13/42] [Doc] Add documentation for building website (#1179) Add a new 
website page about the configuration of compiling the website. --- docs/conf.py | 4 +- docs/index.rst | 2 +- docs/website/configuration.rst | 74 ++++++++++++++++++++++ docs/{community => website}/contribute.rst | 0 docs/{community => website}/git.rst | 0 docs/{community => website}/index.rst | 1 + docs/{community => website}/release.rst | 0 7 files changed, 78 insertions(+), 3 deletions(-) create mode 100644 docs/website/configuration.rst rename docs/{community => website}/contribute.rst (100%) rename docs/{community => website}/git.rst (100%) rename docs/{community => website}/index.rst (98%) rename docs/{community => website}/release.rst (100%) diff --git a/docs/conf.py b/docs/conf.py index 65ebbc8db4..1543c0ed56 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -173,8 +173,8 @@ 'header_links' : [ ('Install', 'install/install-more', False, ''), ('API', 'api/index', False, ''), - ('Community', 'community/index', False, ''), - ('Contribute', 'community/contribute', False, ''), + ('Community', 'website/index', False, ''), + ('Contribute', 'website/contribute', False, ''), ('GitHub', 'https://github.com/dmlc/gluon-nlp/', True, 'fab fa-github'), ], diff --git a/docs/index.rst b/docs/index.rst index cb7f30af41..c3d225a957 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -79,5 +79,5 @@ You may find the 60-min Gluon crash course linked from there especially helpful. model_zoo/index examples/index api/index - community/index + website/index genindex diff --git a/docs/website/configuration.rst b/docs/website/configuration.rst new file mode 100644 index 0000000000..3e63dae430 --- /dev/null +++ b/docs/website/configuration.rst @@ -0,0 +1,74 @@ +Preview GluonNLP Website Locally +----------------------------------------------------------------- + +The GluonNLP docs website is at `release branch `__, or `master branch `__. Its source code is at `gluon-nlp `__. + +Currently the GluonNLP website is constructed from the source code via CI automatically. Here I will share: + +- the structure of files used for the website, and +- how to make changes to the website and preview the website + +Website Structure +~~~~~~~~~~~~~~~~~ + +Currently the docs part contain four sections: Model Zoo, Examples, API and Community. It should be noted that the model zoo is a link redirecting to the ``scripts`` folder in the parent folder. The other three folders are used exclusively by the docs website. Also, three different sections use ``rst``, ``py``, ``md`` files and their composition for compiling - they are inconsistent. So when you work on different sections of the docs website, you should pay attention to handle the different sections with care. + +The main structure, the index file of the entire website, is written in ``rst`` format. It calls the index file of each different section separately. Before compiling the website, you should be aware that: + +- ``rst`` files are static files, they are directly displayed to the website with further styles; +- ``md`` files are script files, the python scripts in these files will be executed and then stored into ``ipynb`` files before converting ``ipynb`` files into website files. + +Or more specifically, the files in the examples folder will be further executed and converted into intermediate files before writing to the final HTML files, while those in other folders don’t need further conversion or computation. 
+ +Environment Configuration Instruction +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Next, I will give a step by step instruction on how to compile this website from scratch. + +1. Preview website without displaying python output +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Use the command from https://github.com/dmlc/gluon-nlp/blob/master/docs/README.txt to install the necessary packages. + +.. code:: bash + + pip install sphinx>=1.5.5 sphinx-gallery sphinx_rtd_theme matplotlib Image recommonmark + +Then use the command below to build the website locally, all the ``python`` scripts are skipped and there is no output for ``python`` code blocks: + +.. code:: bash + + make docs_local MD2IPYNB_OPTION=-d + +You will get full HTML result for the website after successful execution. + +2. Preview website with python output +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +To accomplish this task, we recommend you to use the instance ``g4dn.xlarge`` on Amazon EC2. For convenience, you can search *deep learning* in the filter bar to select the deep learning-enabled machines, where you will have no need of installing addition drivers. + +After you have got the machine and logged to the machine, you will need to configure the packages using the command below: + +.. code:: bash + + git clone https://github.com/dmlc/gluon-nlp + cd gluon-nlp + pip3 install --user -e '.[extras,dev]' + +If necessary, you might still need to configure the packages like below: + +Use ``python3`` command to get into the python execution screen, and then type the commands below to install the necessary packages inside python: + +.. code:: python + + import nltk + nltk.download('perluniprops') + nltk.download('nonbreaking_prefixes') + nltk.download('punkt') + +By now, you should have installed all the necessary packages for the website. You can use the command below for previewing the website locally with all the python output: + +.. code:: bash + + make docs_local + diff --git a/docs/community/contribute.rst b/docs/website/contribute.rst similarity index 100% rename from docs/community/contribute.rst rename to docs/website/contribute.rst diff --git a/docs/community/git.rst b/docs/website/git.rst similarity index 100% rename from docs/community/git.rst rename to docs/website/git.rst diff --git a/docs/community/index.rst b/docs/website/index.rst similarity index 98% rename from docs/community/index.rst rename to docs/website/index.rst index b43c7dbab4..d5313d8f06 100644 --- a/docs/community/index.rst +++ b/docs/website/index.rst @@ -55,3 +55,4 @@ Interested in contributing to GluonNLP? 
Check our contribution guide: contribute git release + configuration \ No newline at end of file diff --git a/docs/community/release.rst b/docs/website/release.rst similarity index 100% rename from docs/community/release.rst rename to docs/website/release.rst From 9612bd410b634e8d09d9ca919309a55947509639 Mon Sep 17 00:00:00 2001 From: "WANG, Chen" Date: Fri, 24 Apr 2020 01:33:13 +0800 Subject: [PATCH 14/42] [DEV] Specify llvmlite version in CI environment to avoid incompatibility with numba (#1208) Co-authored-by: Leonard Lausen --- ci/batch/docker/Dockerfile | 2 +- env/cpu/py3.yml | 5 +++-- env/docker/py3.yml | 3 ++- env/gpu/py3.yml | 5 +++-- 4 files changed, 9 insertions(+), 6 deletions(-) diff --git a/ci/batch/docker/Dockerfile b/ci/batch/docker/Dockerfile index 8cc64125b5..7122a7e013 100644 --- a/ci/batch/docker/Dockerfile +++ b/ci/batch/docker/Dockerfile @@ -16,7 +16,7 @@ FROM nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04 libxft-dev &&\ rm -rf /var/lib/apt/lists/* - RUN curl -o ~/miniconda.sh -O https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ + RUN curl -o ~/miniconda.sh -O https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ chmod +x ~/miniconda.sh && \ ~/miniconda.sh -b -p /opt/conda && \ rm ~/miniconda.sh && \ diff --git a/env/cpu/py3.yml b/env/cpu/py3.yml index 77a649b07c..2c23106ce7 100644 --- a/env/cpu/py3.yml +++ b/env/cpu/py3.yml @@ -16,7 +16,8 @@ dependencies: - nbsphinx>=0.3.4,<0.4 - ipython - ipykernel - - numba==0.47 + - numba==0.47.0 + - llvmlite==0.31.0 - https://github.com/szha/mx-theme/tarball/master - seaborn - jieba @@ -32,7 +33,7 @@ dependencies: - flaky==3.6.1 - flake8==3.7.9 - mock<3 - - https://lausen-public.s3.amazonaws.com/mxnet_cu100-1.6.0b20200125-py2.py3-none-manylinux1_x86_64.whl + - mxnet==1.6.0 - scipy==1.3.2 - regex==2019.11.1 - nltk==3.4.5 diff --git a/env/docker/py3.yml b/env/docker/py3.yml index 2c8b532186..33b0e57c47 100644 --- a/env/docker/py3.yml +++ b/env/docker/py3.yml @@ -17,6 +17,7 @@ dependencies: - ipython - ipykernel - numba==0.47 + - llvmlite==0.31.0 - https://github.com/szha/mx-theme/tarball/master - seaborn - jieba @@ -32,7 +33,7 @@ dependencies: - flaky==3.6.1 - flake8==3.7.9 - mock<3 - - https://lausen-public.s3.amazonaws.com/mxnet_cu100-1.6.0b20200125-py2.py3-none-manylinux1_x86_64.whl + - mxnet-cu101==1.6.0 - scipy==1.3.2 - regex==2019.11.1 - nltk==3.4.5 diff --git a/env/gpu/py3.yml b/env/gpu/py3.yml index 1ed92f3fa5..0ce9bc09f0 100644 --- a/env/gpu/py3.yml +++ b/env/gpu/py3.yml @@ -16,7 +16,8 @@ dependencies: - nbsphinx>=0.3.4,<0.4 - ipython - ipykernel - - numba==0.47 + - numba==0.47.0 + - llvmlite==0.31.0 - https://github.com/szha/mx-theme/tarball/master - seaborn - jieba @@ -32,7 +33,7 @@ dependencies: - flaky==3.6.1 - flake8==3.7.9 - mock<3 - - https://lausen-public.s3.amazonaws.com/mxnet_cu100-1.6.0b20200125-py2.py3-none-manylinux1_x86_64.whl + - mxnet-cu101==1.6.0 - scipy==1.3.2 - regex==2019.11.1 - nltk==3.4.5 From a7223802e69301fc33c7c4e0dac825dd1672c97f Mon Sep 17 00:00:00 2001 From: Haibin Lin Date: Tue, 28 Apr 2020 14:06:00 -0700 Subject: [PATCH 15/42] [BUGFIX] Fix wd in finetune_squad.py (#1210) * [FEATURE] Add tests for setting seed value * [FEATURE] Add function for setting seed value * [DOCS] Add docs for setting seed value #1110 * remove wd from squad Co-authored-by: Avinash Madasu Co-authored-by: Lin --- scripts/bert/finetune_squad.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/bert/finetune_squad.py b/scripts/bert/finetune_squad.py index 
b807123cd4..2f21356986 100644 --- a/scripts/bert/finetune_squad.py +++ b/scripts/bert/finetune_squad.py @@ -422,7 +422,7 @@ def train(): log.info('Start Training') - optimizer_params = {'learning_rate': lr, 'wd': 0.01} + optimizer_params = {'learning_rate': lr} param_dict = net.collect_params() if args.comm_backend == 'horovod': trainer = hvd.DistributedTrainer(param_dict, optimizer, optimizer_params) From b67b9a4073cd2bc7e25887482bc2b7fd945584ef Mon Sep 17 00:00:00 2001 From: "WANG, Chen" Date: Fri, 1 May 2020 03:20:19 +0800 Subject: [PATCH 16/42] [DEV] Update dependency on CI build (#1212) * Update dependency tornado and sphinx updated * Keep tornado at 5.1.1 for we use python 3.5 * Update pandoc to 2.9.2.1 * Update cpu/py3 conda forge dependencies * Update pandoc/sphinx on master branches * Revert sphinx to 2.2.1 on all tests * Update sphinx to 2.4.4 --- env/cpu/py3-master.yml | 4 ++-- env/cpu/py3.yml | 4 ++-- env/docker/py3.yml | 4 ++-- env/gpu/py3-master.yml | 4 ++-- env/gpu/py3.yml | 4 ++-- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/env/cpu/py3-master.yml b/env/cpu/py3-master.yml index 15d49c49fb..3b1f5ac7a9 100644 --- a/env/cpu/py3-master.yml +++ b/env/cpu/py3-master.yml @@ -4,9 +4,9 @@ dependencies: - python=3.5 - pip - perl - - pandoc=1.19.2 + - pandoc=2.9.2.1 - tornado=5.1.1 - - sphinx=2.2.1 + - sphinx=2.4.4 # In the -master pipeline, we test without numba. Numba is an optional # dependency and GluonNLP needs to work both with and without numba installed. - pip: diff --git a/env/cpu/py3.yml b/env/cpu/py3.yml index 2c23106ce7..f885b2fc0e 100644 --- a/env/cpu/py3.yml +++ b/env/cpu/py3.yml @@ -4,9 +4,9 @@ dependencies: - python=3.5 - pip - perl - - pandoc=1.19.2 + - pandoc=2.9.2.1 - tornado=5.1.1 - - sphinx=2.2.1 + - sphinx=2.4.4 - pip: - numpy==1.17.4 - notedown==1.5.1 diff --git a/env/docker/py3.yml b/env/docker/py3.yml index 33b0e57c47..0baf63ba70 100644 --- a/env/docker/py3.yml +++ b/env/docker/py3.yml @@ -4,9 +4,9 @@ dependencies: - python=3.5 - pip - perl - - pandoc=1.19.2 + - pandoc=2.9.2.1 - tornado=5.1.1 - - sphinx=2.2.1 + - sphinx=2.4.4 - pip: - numpy==1.17.4 - notedown==1.5.1 diff --git a/env/gpu/py3-master.yml b/env/gpu/py3-master.yml index 593614b587..5f67cb2905 100644 --- a/env/gpu/py3-master.yml +++ b/env/gpu/py3-master.yml @@ -4,9 +4,9 @@ dependencies: - python=3.5 - pip - perl - - pandoc=1.19.2 + - pandoc=2.9.2.1 - tornado=5.1.1 - - sphinx=2.2.1 + - sphinx=2.4.4 # In the -master pipeline, we test without numba. Numba is an optional # dependency and GluonNLP needs to work both with and without numba installed. 
- pip: diff --git a/env/gpu/py3.yml b/env/gpu/py3.yml index 0ce9bc09f0..c747bcea53 100644 --- a/env/gpu/py3.yml +++ b/env/gpu/py3.yml @@ -4,9 +4,9 @@ dependencies: - python=3.5 - pip - perl - - pandoc=1.19.2 + - pandoc=2.9.2.1 - tornado=5.1.1 - - sphinx=2.2.1 + - sphinx=2.4.4 - pip: - numpy==1.17.4 - notedown==1.5.1 From 425f799372e075d46d0161d5593816e3e411e4df Mon Sep 17 00:00:00 2001 From: Leonard Lausen Date: Thu, 30 Apr 2020 21:27:56 -0700 Subject: [PATCH 17/42] Fix layer_norm_eps in BERTEncoder (#1215) --- src/gluonnlp/model/bert.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/gluonnlp/model/bert.py b/src/gluonnlp/model/bert.py index 022d1103b2..876749f8d2 100644 --- a/src/gluonnlp/model/bert.py +++ b/src/gluonnlp/model/bert.py @@ -318,11 +318,12 @@ def __init__(self, *, num_layers=2, units=512, hidden_size=2048, self._output_attention = output_attention self._output_all_encodings = output_all_encodings self._dropout = dropout + self._layer_norm_eps = layer_norm_eps with self.name_scope(): if dropout: self.dropout_layer = nn.Dropout(rate=dropout) - self.layer_norm = nn.LayerNorm(in_channels=units, epsilon=1e-12) + self.layer_norm = nn.LayerNorm(in_channels=units, epsilon=self._layer_norm_eps) self.position_weight = self.params.get('position_weight', shape=(max_length, units), init=weight_initializer) self.transformer_cells = nn.HybridSequential() @@ -550,7 +551,7 @@ def _get_decoder(self, units, vocab_size, embed, prefix): decoder = nn.HybridSequential(prefix=prefix) decoder.add(nn.Dense(units, flatten=False)) decoder.add(GELU()) - decoder.add(nn.LayerNorm(in_channels=units, epsilon=1e-12)) + decoder.add(nn.LayerNorm(in_channels=units, epsilon=self.encoder._layer_norm_eps)) decoder.add(nn.Dense(vocab_size, flatten=False, params=embed.collect_params())) assert decoder[3].weight == list(embed.collect_params().values())[0], \ 'The weights of word embedding are not tied with those of decoder' From 0176f86e0b50d811597ae6db71c346f77747c1f5 Mon Sep 17 00:00:00 2001 From: "WANG, Chen" Date: Tue, 5 May 2020 02:57:08 +0800 Subject: [PATCH 18/42] Lift timeout on cpu unittest (#1216) --- ci/jenkins/Jenkinsfile_py3-master_cpu_unittest | 2 +- ci/jenkins/Jenkinsfile_py3_cpu_unittest | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/jenkins/Jenkinsfile_py3-master_cpu_unittest b/ci/jenkins/Jenkinsfile_py3-master_cpu_unittest index fb87760de6..cf2ddb402d 100644 --- a/ci/jenkins/Jenkinsfile_py3-master_cpu_unittest +++ b/ci/jenkins/Jenkinsfile_py3-master_cpu_unittest @@ -21,7 +21,7 @@ // See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/ // timeout in minutes -max_time = 120 +max_time = 180 node { // Loading the utilities requires a node context unfortunately diff --git a/ci/jenkins/Jenkinsfile_py3_cpu_unittest b/ci/jenkins/Jenkinsfile_py3_cpu_unittest index 6d518fdbfd..56478e1a0a 100644 --- a/ci/jenkins/Jenkinsfile_py3_cpu_unittest +++ b/ci/jenkins/Jenkinsfile_py3_cpu_unittest @@ -21,7 +21,7 @@ // See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/ // timeout in minutes -max_time = 120 +max_time = 180 node { // Loading the utilities requires a node context unfortunately From 16b8cbdbb3a54ab5bd46f242d1d6c06b2627b3dc Mon Sep 17 00:00:00 2001 From: Avinash Madasu Date: Wed, 6 May 2020 03:13:44 +0530 Subject: [PATCH 19/42] [Dev] Change sklearn accuracy metric to mxnet accuracy metric (#1209) --- .../self_attentive_sentence_embedding.md | 29 +++++++++++-------- 1 file changed, 17 insertions(+), 12 deletions(-) diff --git 
a/docs/examples/sentiment_analysis/self_attentive_sentence_embedding.md b/docs/examples/sentiment_analysis/self_attentive_sentence_embedding.md index ac60e0cea7..7c78a73dc3 100644 --- a/docs/examples/sentiment_analysis/self_attentive_sentence_embedding.md +++ b/docs/examples/sentiment_analysis/self_attentive_sentence_embedding.md @@ -38,8 +38,9 @@ from mxnet import gluon, nd, init from mxnet.gluon import nn, rnn from mxnet import autograd, gluon, nd -# iUse sklearn's metric function to evaluate the results of the experiment -from sklearn.metrics import accuracy_score, f1_score +# iUse Mxnet and sklearn's metric functions to evaluate the results of the experiment +from mxnet.metric import Accuracy +from sklearn.metrics import f1_score # fixed random number seed np.random.seed(2018) @@ -65,7 +66,6 @@ The [Yelp users' review dataset](https://www.kaggle.com/yelp-dataset/yelp-datase Each sample in the data consists of a user's comment, in English, with each comment marked one through five, each number representing one of five different emotions the user expressed. Here we download, unzip, and reformat the dataset for ease of use further on. - ```{.python .input} # Download the data from the server data_url = 'http://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/yelp_review_subset-167bb781.zip' @@ -398,10 +398,14 @@ def one_epoch(data_iter, model, loss, trainer, ctx, is_train, epoch, total_true = [] n_batch = 0 + batch_acc = Accuracy() # Batch Accuracy + epoch_acc = Accuracy() # Epoch Accuracy + for batch_x, batch_y in data_iter: batch_x = batch_x.as_in_context(ctx) batch_y = batch_y.as_in_context(ctx) + batch_acc.reset() # Reset Batch Accuracy if is_train: with autograd.record(): batch_pred, l = calculate_loss(batch_x, batch_y, model, loss, class_weight, penal_coeff) @@ -429,10 +433,10 @@ def one_epoch(data_iter, model, loss, trainer, ctx, is_train, epoch, batch_pred, l = calculate_loss(batch_x, batch_y, model, loss, class_weight, penal_coeff) # keep result for metric - batch_pred = nd.argmax(nd.softmax(batch_pred, axis=1), axis=1).asnumpy() - batch_true = np.reshape(batch_y.asnumpy(), (-1, )) - total_pred.extend(batch_pred.tolist()) - total_true.extend(batch_true.tolist()) + batch_pred = nd.argmax(nd.softmax(batch_pred, axis=1), axis=1) + batch_true = batch_y.reshape(-1, ) + total_pred.extend(batch_pred.asnumpy().tolist()) + total_true.extend(batch_true.asnumpy().tolist()) batch_loss = l.mean().asscalar() @@ -441,22 +445,24 @@ def one_epoch(data_iter, model, loss, trainer, ctx, is_train, epoch, # check the result of traing phase if is_train and n_batch % 400 == 0: + batch_acc.update(batch_true, batch_pred) print('epoch %d, batch %d, batch_train_loss %.4f, batch_train_acc %.3f' % - (epoch, n_batch, batch_loss, accuracy_score(batch_true, batch_pred))) + (epoch, n_batch, batch_loss, batch_acc.get()[1])) + # metric F1 = f1_score(np.array(total_true), np.array(total_pred), average='weighted') - acc = accuracy_score(np.array(total_true), np.array(total_pred)) + epoch_acc.update(nd.array(total_true), nd.array(total_pred)) loss_val /= n_batch if is_train: print('epoch %d, learning_rate %.5f \n\t train_loss %.4f, acc_train %.3f, F1_train %.3f, ' % - (epoch, trainer.learning_rate, loss_val, acc, F1)) + (epoch, trainer.learning_rate, loss_val, epoch_acc.get()[1], F1)) # declay lr if epoch % 2 == 0: trainer.set_learning_rate(trainer.learning_rate * 0.9) else: - print('\t valid_loss %.4f, acc_valid %.3f, F1_valid %.3f, ' % (loss_val, acc, F1)) + print('\t valid_loss %.4f, acc_valid %.3f, 
F1_valid %.3f, ' % (loss_val, epoch_acc.get()[1], F1)) ``` @@ -487,7 +493,6 @@ def train_valid(data_iter_train, data_iter_valid, model, loss, trainer, ctx, nep Now that we are actually training the model, we use `WeightedSoftmaxCE` to alleviate the problem of data categorical imbalance. We perform statistical analysis on the data in advance to retrieve a set of `class_weight`s. - ```{.python .input} class_weight = None loss_name = 'wsce' From 306c4a52e4df759c7faf1fb4c2d1c373895ffdb6 Mon Sep 17 00:00:00 2001 From: "WANG, Chen" Date: Fri, 8 May 2020 02:56:12 +0800 Subject: [PATCH 20/42] [DEV] Update pip pakcages versions (#1224) * Update numpy versions * Update sphinx related packages * Update regex --- env/cpu/py3-master.yml | 8 ++++---- env/cpu/py3.yml | 8 ++++---- env/docker/py3.yml | 8 ++++---- env/gpu/py3-master.yml | 8 ++++---- env/gpu/py3.yml | 8 ++++---- 5 files changed, 20 insertions(+), 20 deletions(-) diff --git a/env/cpu/py3-master.yml b/env/cpu/py3-master.yml index 3b1f5ac7a9..cbecd59241 100644 --- a/env/cpu/py3-master.yml +++ b/env/cpu/py3-master.yml @@ -10,12 +10,12 @@ dependencies: # In the -master pipeline, we test without numba. Numba is an optional # dependency and GluonNLP needs to work both with and without numba installed. - pip: - - numpy==1.17.4 + - numpy==1.18.4 - notedown==1.5.1 - - sphinx-gallery==0.4.0 + - sphinx-gallery==0.6.2 - recommonmark==0.6.0 - nbconvert==5.6.1 - - nbsphinx>=0.3.4,<0.4 + - nbsphinx==0.6.1 - ipython - ipykernel - https://github.com/szha/mx-theme/tarball/master @@ -35,7 +35,7 @@ dependencies: - mock<3 - https://repo.mxnet.io/dist/python/cpu/mxnet-1.6.0-py2.py3-none-manylinux1_x86_64.whl - scipy==1.3.2 - - regex==2019.11.1 + - regex==2020.4.4 - nltk==3.4.5 - sacremoses==0.0.35 - spacy==2.2.2 diff --git a/env/cpu/py3.yml b/env/cpu/py3.yml index f885b2fc0e..8dcd529eb8 100644 --- a/env/cpu/py3.yml +++ b/env/cpu/py3.yml @@ -8,12 +8,12 @@ dependencies: - tornado=5.1.1 - sphinx=2.4.4 - pip: - - numpy==1.17.4 + - numpy==1.18.4 - notedown==1.5.1 - - sphinx-gallery==0.4.0 + - sphinx-gallery==0.6.2 - recommonmark==0.6.0 - nbconvert==5.6.1 - - nbsphinx>=0.3.4,<0.4 + - nbsphinx==0.6.1 - ipython - ipykernel - numba==0.47.0 @@ -35,7 +35,7 @@ dependencies: - mock<3 - mxnet==1.6.0 - scipy==1.3.2 - - regex==2019.11.1 + - regex==2020.4.4 - nltk==3.4.5 - sacremoses==0.0.35 - spacy==2.2.2 diff --git a/env/docker/py3.yml b/env/docker/py3.yml index 0baf63ba70..556e439b94 100644 --- a/env/docker/py3.yml +++ b/env/docker/py3.yml @@ -8,12 +8,12 @@ dependencies: - tornado=5.1.1 - sphinx=2.4.4 - pip: - - numpy==1.17.4 + - numpy==1.18.4 - notedown==1.5.1 - - sphinx-gallery==0.4.0 + - sphinx-gallery==0.6.2 - recommonmark==0.6.0 - nbconvert==5.6.1 - - nbsphinx>=0.3.4,<0.4 + - nbsphinx==0.6.1 - ipython - ipykernel - numba==0.47 @@ -35,7 +35,7 @@ dependencies: - mock<3 - mxnet-cu101==1.6.0 - scipy==1.3.2 - - regex==2019.11.1 + - regex==2020.4.4 - nltk==3.4.5 - sacremoses==0.0.35 - spacy==2.2.2 diff --git a/env/gpu/py3-master.yml b/env/gpu/py3-master.yml index 5f67cb2905..a533770569 100644 --- a/env/gpu/py3-master.yml +++ b/env/gpu/py3-master.yml @@ -10,12 +10,12 @@ dependencies: # In the -master pipeline, we test without numba. Numba is an optional # dependency and GluonNLP needs to work both with and without numba installed. 
- pip: - - numpy==1.17.4 + - numpy==1.18.4 - notedown==1.5.1 - - sphinx-gallery==0.4.0 + - sphinx-gallery==0.6.2 - recommonmark==0.6.0 - nbconvert==5.6.1 - - nbsphinx>=0.3.4,<0.4 + - nbsphinx==0.6.1 - ipython - ipykernel - https://github.com/szha/mx-theme/tarball/master @@ -35,7 +35,7 @@ dependencies: - mock<3 - https://repo.mxnet.io/dist/python/cu100/mxnet_cu100-1.6.0-py2.py3-none-manylinux1_x86_64.whl - scipy==1.3.2 - - regex==2019.11.1 + - regex==2020.4.4 - nltk==3.4.5 - sacremoses==0.0.35 - spacy==2.2.2 diff --git a/env/gpu/py3.yml b/env/gpu/py3.yml index c747bcea53..321722f316 100644 --- a/env/gpu/py3.yml +++ b/env/gpu/py3.yml @@ -8,12 +8,12 @@ dependencies: - tornado=5.1.1 - sphinx=2.4.4 - pip: - - numpy==1.17.4 + - numpy==1.18.4 - notedown==1.5.1 - - sphinx-gallery==0.4.0 + - sphinx-gallery==0.6.2 - recommonmark==0.6.0 - nbconvert==5.6.1 - - nbsphinx>=0.3.4,<0.4 + - nbsphinx==0.6.1 - ipython - ipykernel - numba==0.47.0 @@ -35,7 +35,7 @@ dependencies: - mock<3 - mxnet-cu101==1.6.0 - scipy==1.3.2 - - regex==2019.11.1 + - regex==2020.4.4 - nltk==3.4.5 - sacremoses==0.0.35 - spacy==2.2.2 From f429d526a0ad9dca21754b53694cba055d3f0986 Mon Sep 17 00:00:00 2001 From: Avinash Madasu Date: Fri, 8 May 2020 00:29:15 +0530 Subject: [PATCH 21/42] [BUGFIX] TextCNN rand model downloads pretrained vectors even if not needed (#1222) --- scripts/sentiment_analysis/process_data.py | 19 +++++++++++++------ .../sentiment_analysis_cnn.py | 13 ++++++------- 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/scripts/sentiment_analysis/process_data.py b/scripts/sentiment_analysis/process_data.py index 70363f7dea..365cd8622b 100644 --- a/scripts/sentiment_analysis/process_data.py +++ b/scripts/sentiment_analysis/process_data.py @@ -80,7 +80,7 @@ def _clean_str(string, data_name): return string.strip() if data_name == 'TREC' else string.strip().lower() -def _build_vocab(data_name, train_dataset, test_dataset, dev_dataset): +def _build_vocab(data_name, train_dataset, test_dataset, dev_dataset, model_name): all_token = [] max_len = 0 for dataset in (train_dataset, dev_dataset, test_dataset): @@ -89,7 +89,13 @@ def _build_vocab(data_name, train_dataset, test_dataset, dev_dataset): max_len = max_len if max_len > len(line) else len(line) all_token.extend(line) vocab = nlp.Vocab(nlp.data.count_tokens(all_token)) - vocab.set_embedding(nlp.embedding.create('Word2Vec', source='GoogleNews-vectors-negative300')) + if(model_name == 'rand'): + emb = nlp.embedding.TokenEmbedding() + emb[emb.unknown_token] = nd.zeros(300) + vocab.set_embedding(emb) + else: + vocab.set_embedding(nlp.embedding.create('Word2Vec', + source='GoogleNews-vectors-negative300')) for word in vocab.embedding._idx_to_token: if (vocab.embedding[word] == nd.zeros(300)).sum() == 300: vocab.embedding[word] = nd.random.uniform(0, 0.05, 300) @@ -118,23 +124,24 @@ def _preprocess_dataset(dataset, vocab, max_len): return dataset, lengths -def load_dataset(data_name): +def load_dataset(data_name, model_name): """Load sentiment dataset.""" if data_name in ('MR', 'Subj', 'CR', 'MPQA'): train_dataset, output_size = _load_file(data_name) - vocab, max_len = _build_vocab(data_name, train_dataset, [], []) + vocab, max_len = _build_vocab(data_name, train_dataset, [], [], model_name) train_dataset, train_data_lengths = _preprocess_dataset(train_dataset, vocab, max_len) return vocab, max_len, output_size, train_dataset, train_data_lengths elif data_name == 'TREC': train_dataset, test_dataset, output_size = _load_file(data_name) - vocab, max_len = 
_build_vocab(data_name, train_dataset, test_dataset, []) + vocab, max_len = _build_vocab(data_name, train_dataset, test_dataset, [], model_name) train_dataset, train_data_lengths = _preprocess_dataset(train_dataset, vocab, max_len) test_dataset, test_data_lengths = _preprocess_dataset(test_dataset, vocab, max_len) return vocab, max_len, output_size, train_dataset, train_data_lengths, test_dataset, \ test_data_lengths else: train_dataset, test_dataset, dev_dataset, output_size = _load_file(data_name) - vocab, max_len = _build_vocab(data_name, train_dataset, test_dataset, dev_dataset) + vocab, max_len = _build_vocab(data_name, train_dataset, test_dataset, dev_dataset, + model_name) train_dataset, train_data_lengths = _preprocess_dataset(train_dataset, vocab, max_len) test_dataset, test_data_lengths = _preprocess_dataset(test_dataset, vocab, max_len) dev_dataset, dev_data_lengths = _preprocess_dataset(dev_dataset, vocab, max_len) diff --git a/scripts/sentiment_analysis/sentiment_analysis_cnn.py b/scripts/sentiment_analysis/sentiment_analysis_cnn.py index f540537ec6..b595d3c0c4 100644 --- a/scripts/sentiment_analysis/sentiment_analysis_cnn.py +++ b/scripts/sentiment_analysis/sentiment_analysis_cnn.py @@ -28,7 +28,6 @@ import argparse import time import random -import numpy as np import mxnet as mx from mxnet import nd, gluon, autograd @@ -39,9 +38,8 @@ gluonnlp.utils.check_version('0.7.0') -np.random.seed(3435) -random.seed(3435) -mx.random.seed(3435) +seed = 3435 +gluonnlp.utils.set_seed(seed) parser = argparse.ArgumentParser(description='Sentiment analysis with the textCNN model on\ various datasets.') @@ -72,13 +70,14 @@ if args.data_name in ('MR', 'Subj', 'CR', 'MPQA'): vocab, max_len, output_size, train_dataset, train_data_lengths \ - = process_data.load_dataset(args.data_name) + = process_data.load_dataset(args.data_name, args.model_mode) elif args.data_name == 'TREC': vocab, max_len, output_size, train_dataset, train_data_lengths, \ - test_dataset, test_data_lengths = process_data.load_dataset(args.data_name) + test_dataset, test_data_lengths = process_data.load_dataset(args.data_name, args.model_mode) else: vocab, max_len, output_size, train_dataset, train_data_lengths, test_dataset, \ - test_data_lengths, dev_dataset, dev_data_lengths = process_data.load_dataset(args.data_name) + test_data_lengths, dev_dataset, dev_data_lengths = process_data.load_dataset(args.data_name, + args.model_mode) model = text_cnn.model(args.dropout, vocab, args.model_mode, output_size) print(model) From 7b6e051cbfbd73187e8734da1782e48548f2f524 Mon Sep 17 00:00:00 2001 From: "WANG, Chen" <16307110064@fudan.edu.cn> Date: Wed, 13 May 2020 01:34:44 +0800 Subject: [PATCH 22/42] Lift pytype versions (#1227) --- env/cpu/py3-master.yml | 2 +- env/cpu/py3.yml | 2 +- env/docker/py3.yml | 2 +- env/gpu/py3-master.yml | 2 +- env/gpu/py3.yml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/env/cpu/py3-master.yml b/env/cpu/py3-master.yml index cbecd59241..afaeb9c99d 100644 --- a/env/cpu/py3-master.yml +++ b/env/cpu/py3-master.yml @@ -23,7 +23,7 @@ dependencies: - jieba - cython - boto3 - - pytype==2019.10.17 + - pytype==2020.5.7 - pytest==5.3.2 - pytest-env==0.6.2 - pytest-cov==2.8.1 diff --git a/env/cpu/py3.yml b/env/cpu/py3.yml index 8dcd529eb8..55d93d2e54 100644 --- a/env/cpu/py3.yml +++ b/env/cpu/py3.yml @@ -23,7 +23,7 @@ dependencies: - jieba - cython - boto3 - - pytype==2019.10.17 + - pytype==2020.5.7 - pytest==5.3.2 - pytest-env==0.6.2 - pytest-cov==2.8.1 diff --git a/env/docker/py3.yml 
b/env/docker/py3.yml index 556e439b94..11b34e4d25 100644 --- a/env/docker/py3.yml +++ b/env/docker/py3.yml @@ -23,7 +23,7 @@ dependencies: - jieba - scikit-learn==0.21.3 - cython - - pytype==2019.10.17 + - pytype==2020.5.7 - pytest==5.2.3 - pytest-env==0.6.2 - pytest-cov==2.8.1 diff --git a/env/gpu/py3-master.yml b/env/gpu/py3-master.yml index a533770569..a29bbbb149 100644 --- a/env/gpu/py3-master.yml +++ b/env/gpu/py3-master.yml @@ -23,7 +23,7 @@ dependencies: - jieba - cython - boto3 - - pytype==2019.10.17 + - pytype==2020.5.7 - pytest==5.3.2 - pytest-env==0.6.2 - pytest-cov==2.8.1 diff --git a/env/gpu/py3.yml b/env/gpu/py3.yml index 321722f316..6eed672474 100644 --- a/env/gpu/py3.yml +++ b/env/gpu/py3.yml @@ -23,7 +23,7 @@ dependencies: - jieba - cython - boto3 - - pytype==2019.10.17 + - pytype==2020.5.7 - pytest==5.3.2 - pytest-env==0.6.2 - pytest-cov==2.8.1 From cfdafd310ed428d65599f0d133c275a2613690ab Mon Sep 17 00:00:00 2001 From: ma-hei Date: Sun, 17 May 2020 11:24:02 -0700 Subject: [PATCH 23/42] [BUGFIX] Remove mutable_args restriction in get_model API (#1207) This commit removes the mutable_args restriction. After this commit, any parameter can be overriden, as long as the user sets the allow_override parameter to True. --- scripts/tests/test_models.py | 32 +++++ scripts/text_generation/model/gpt.py | 23 ++- src/gluonnlp/model/bert.py | 16 ++- src/gluonnlp/model/language_model.py | 203 ++++++++++++++++++--------- src/gluonnlp/model/transformer.py | 40 +++--- tests/unittest/test_models.py | 64 +++++++++ 6 files changed, 276 insertions(+), 102 deletions(-) diff --git a/scripts/tests/test_models.py b/scripts/tests/test_models.py index 16ef22d612..f3e174dbb8 100644 --- a/scripts/tests/test_models.py +++ b/scripts/tests/test_models.py @@ -28,6 +28,38 @@ from gluonnlp.data.transforms import GPT2BPEDetokenizer, GPT2BPETokenizer from ..text_generation.model import get_model +from ..text_generation.model.gpt import gpt2_hparams + + +def verify_get_model_with_hparam_allow_override(models, hparam_allow_override, predefined_args_dict, + mutable_args, dataset_name): + + for model in models: + predefined_args = predefined_args_dict[model].copy() + if hparam_allow_override: + params_that_should_throw_exception = set() + else: + params_that_should_throw_exception = set(predefined_args.keys()) - set(mutable_args) + params_that_threw_exception = set() + for key in predefined_args: + try: + get_model(model, dataset_name=dataset_name, + hparam_allow_override=hparam_allow_override, **{key: predefined_args[key]}) + except: + expected = not hparam_allow_override and not key in mutable_args + params_that_threw_exception.add(key) + assert expected + + assert params_that_threw_exception == params_that_should_throw_exception + + +@pytest.mark.parametrize('hparam_allow_override', [False, True]) +def test_hparam_allow_override_gpt2(hparam_allow_override): + models = ['gpt2_117m', 'gpt2_345m'] + mutable_args_of_models = ['dropout'] + predefined_args_dict = gpt2_hparams.copy() + verify_get_model_with_hparam_allow_override(models, hparam_allow_override, predefined_args_dict, + mutable_args_of_models, 'openai_webtext') @pytest.mark.remote_required diff --git a/scripts/text_generation/model/gpt.py b/scripts/text_generation/model/gpt.py index 37137c4a12..b1ef4061a2 100644 --- a/scripts/text_generation/model/gpt.py +++ b/scripts/text_generation/model/gpt.py @@ -393,7 +393,8 @@ def gpt2_345m(dataset_name=None, vocab=None, pretrained=True, ctx=mx.cpu(), def _get_gpt2_model(model_name=None, dataset_name=None, 
vocab=None, pretrained=True, ctx=mx.cpu(), - root=os.path.join(get_home_dir(), 'models'), **kwargs): + root=os.path.join(get_home_dir(), 'models'), + hparam_allow_override=False, **kwargs): """Any predefined GPT-2 model. Parameters @@ -416,26 +417,24 @@ def _get_gpt2_model(model_name=None, dataset_name=None, vocab=None, pretrained=T root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. MXNET_HOME defaults to '~/.mxnet'. + hparam_allow_override : bool, default False + If set to True, pre-defined hyper-parameters of the model + (e.g. the number of layers, hidden units) can be overriden. Returns ------- GPT2Model, gluonnlp.vocab.Vocab """ predefined_args = gpt2_hparams[model_name].copy() - mutable_args = ['dropout'] - mutable_args = frozenset(mutable_args) - assert all((k not in kwargs or k in mutable_args) for k in predefined_args), \ - 'Cannot override predefined model settings.' + if not hparam_allow_override: + mutable_args = ['dropout'] + mutable_args = frozenset(mutable_args) + assert all((k not in kwargs or k in mutable_args) for k in predefined_args), \ + 'Cannot override predefined model settings.' predefined_args.update(kwargs) vocab = _load_vocab(dataset_name, vocab, root) # GPT2 - net = GPT2Model(units=predefined_args['units'], - vocab_size=len(vocab), - max_length=predefined_args['max_length'], - num_layers=predefined_args['num_layers'], - num_heads=predefined_args['num_heads'], - dropout=predefined_args['dropout'], - **kwargs) + net = GPT2Model(vocab_size=len(vocab), **predefined_args) if pretrained: _load_pretrained_params(net, model_name, dataset_name, root, ctx) return net, vocab diff --git a/src/gluonnlp/model/bert.py b/src/gluonnlp/model/bert.py index 876749f8d2..ed29c8aaae 100644 --- a/src/gluonnlp/model/bert.py +++ b/src/gluonnlp/model/bert.py @@ -1254,7 +1254,7 @@ def distilbert_6_768_12(dataset_name='distil_book_corpus_wiki_en_uncased', vocab output_attention=False, output_all_encodings=False, root=os.path.join(get_home_dir(), 'models'), - **kwargs): + hparam_allow_override=False, **kwargs): """DistilBERT model: https://arxiv.org/abs/1910.01108 The number of layers (L) is 6, number of units (H) is 768, and the @@ -1277,17 +1277,21 @@ def distilbert_6_768_12(dataset_name='distil_book_corpus_wiki_en_uncased', vocab root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. MXNET_HOME defaults to '~/.mxnet'. + hparam_allow_override : bool, default False + If set to True, pre-defined hyper-parameters of the model + (e.g. the number of layers, hidden units) can be overriden. Returns ------- DistilBERTModel, gluonnlp.vocab.Vocab """ model_name = 'distilbert_6_768_12' - predefined_args = bert_hparams[model_name] - mutable_args = ['use_residual', 'dropout', 'word_embed'] - mutable_args = frozenset(mutable_args) - assert all((k not in kwargs or k in mutable_args) for k in predefined_args), \ - 'Cannot override predefined model settings.' + predefined_args = bert_hparams[model_name].copy() + if not hparam_allow_override: + mutable_args = ['use_residual', 'dropout', 'word_embed'] + mutable_args = frozenset(mutable_args) + assert all((k not in kwargs or k in mutable_args) for k in predefined_args), \ + 'Cannot override predefined model settings.' 
predefined_args.update(kwargs) # encoder encoder = BERTEncoder(num_layers=predefined_args['num_layers'], diff --git a/src/gluonnlp/model/language_model.py b/src/gluonnlp/model/language_model.py index 29f5eedb56..2df355729c 100644 --- a/src/gluonnlp/model/language_model.py +++ b/src/gluonnlp/model/language_model.py @@ -173,6 +173,69 @@ def hybrid_forward(self, F, inputs, begin_state=None): # pylint: disable=argumen out = self.decoder(encoded) return out, state +awd_lstm_lm_1150_hparams = { + 'embed_size': 400, + 'hidden_size': 1150, + 'mode': 'lstm', + 'num_layers': 3, + 'tie_weights': True, + 'dropout': 0.4, + 'weight_drop': 0.5, + 'drop_h': 0.2, + 'drop_i': 0.65, + 'drop_e': 0.1 +} + +awd_lstm_lm_600_hparams = { + 'embed_size': 200, + 'hidden_size': 600, + 'mode': 'lstm', + 'num_layers': 3, + 'tie_weights': True, + 'dropout': 0.2, + 'weight_drop': 0.2, + 'drop_h': 0.1, + 'drop_i': 0.3, + 'drop_e': 0.05 +} + +standard_lstm_lm_200_hparams = { + 'embed_size': 200, + 'hidden_size': 200, + 'mode': 'lstm', + 'num_layers': 2, + 'tie_weights': True, + 'dropout': 0.2 +} + +standard_lstm_lm_650_hparams = { + 'embed_size': 650, + 'hidden_size': 650, + 'mode': 'lstm', + 'num_layers': 2, + 'tie_weights': True, + 'dropout': 0.5 +} + +standard_lstm_lm_1500_hparams = { + 'embed_size': 1500, + 'hidden_size': 1500, + 'mode': 'lstm', + 'num_layers': 2, + 'tie_weights': True, + 'dropout': 0.65 +} + +awd_lstm_lm_hparams = { + 'awd_lstm_lm_1150': awd_lstm_lm_1150_hparams, + 'awd_lstm_lm_600': awd_lstm_lm_600_hparams +} + +standard_lstm_lm_hparams = { + 'standard_lstm_lm_200': standard_lstm_lm_200_hparams, + 'standard_lstm_lm_650': standard_lstm_lm_650_hparams, + 'standard_lstm_lm_1500': standard_lstm_lm_1500_hparams +} def _get_rnn_model(model_cls, model_name, dataset_name, vocab, pretrained, ctx, root, **kwargs): vocab = _load_vocab(dataset_name, vocab, root) @@ -184,7 +247,8 @@ def _get_rnn_model(model_cls, model_name, dataset_name, vocab, pretrained, ctx, def awd_lstm_lm_1150(dataset_name=None, vocab=None, pretrained=False, ctx=cpu(), - root=os.path.join(get_home_dir(), 'models'), **kwargs): + root=os.path.join(get_home_dir(), 'models'), + hparam_allow_override=False, **kwargs): r"""3-layer LSTM language model with weight-drop, variational dropout, and tied weights. Embedding size is 400, and hidden layer size is 1150. @@ -208,31 +272,27 @@ def awd_lstm_lm_1150(dataset_name=None, vocab=None, pretrained=False, ctx=cpu(), root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. MXNET_HOME defaults to '~/.mxnet'. + hparam_allow_override : bool, default False + If set to True, pre-defined hyper-parameters of the model + (e.g. the number of layers, hidden units) can be overriden. Returns ------- gluon.Block, gluonnlp.Vocab """ - predefined_args = {'embed_size': 400, - 'hidden_size': 1150, - 'mode': 'lstm', - 'num_layers': 3, - 'tie_weights': True, - 'dropout': 0.4, - 'weight_drop': 0.5, - 'drop_h': 0.2, - 'drop_i': 0.65, - 'drop_e': 0.1} - mutable_args = frozenset(['dropout', 'weight_drop', 'drop_h', 'drop_i', 'drop_e']) - assert all((k not in kwargs or k in mutable_args) for k in predefined_args), \ - 'Cannot override predefined model settings.' + predefined_args = awd_lstm_lm_hparams['awd_lstm_lm_1150'].copy() + if not hparam_allow_override: + mutable_args = frozenset(['dropout', 'weight_drop', 'drop_h', 'drop_i', 'drop_e']) + assert all((k not in kwargs or k in mutable_args) for k in predefined_args), \ + 'Cannot override predefined model settings.' 
predefined_args.update(kwargs) return _get_rnn_model(AWDRNN, 'awd_lstm_lm_1150', dataset_name, vocab, pretrained, ctx, root, **predefined_args) def awd_lstm_lm_600(dataset_name=None, vocab=None, pretrained=False, ctx=cpu(), - root=os.path.join(get_home_dir(), 'models'), **kwargs): + root=os.path.join(get_home_dir(), 'models'), + hparam_allow_override=False, **kwargs): r"""3-layer LSTM language model with weight-drop, variational dropout, and tied weights. Embedding size is 200, and hidden layer size is 600. @@ -256,30 +316,26 @@ def awd_lstm_lm_600(dataset_name=None, vocab=None, pretrained=False, ctx=cpu(), root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. MXNET_HOME defaults to '~/.mxnet'. + hparam_allow_override : bool, default False + If set to True, pre-defined hyper-parameters of the model + (e.g. the number of layers, hidden units) can be overriden. Returns ------- gluon.Block, gluonnlp.Vocab """ - predefined_args = {'embed_size': 200, - 'hidden_size': 600, - 'mode': 'lstm', - 'num_layers': 3, - 'tie_weights': True, - 'dropout': 0.2, - 'weight_drop': 0.2, - 'drop_h': 0.1, - 'drop_i': 0.3, - 'drop_e': 0.05} - mutable_args = frozenset(['dropout', 'weight_drop', 'drop_h', 'drop_i', 'drop_e']) - assert all((k not in kwargs or k in mutable_args) for k in predefined_args), \ - 'Cannot override predefined model settings.' + predefined_args = awd_lstm_lm_hparams['awd_lstm_lm_600'].copy() + if not hparam_allow_override: + mutable_args = frozenset(['dropout', 'weight_drop', 'drop_h', 'drop_i', 'drop_e']) + assert all((k not in kwargs or k in mutable_args) for k in predefined_args), \ + 'Cannot override predefined model settings.' predefined_args.update(kwargs) return _get_rnn_model(AWDRNN, 'awd_lstm_lm_600', dataset_name, vocab, pretrained, ctx, root, **predefined_args) def standard_lstm_lm_200(dataset_name=None, vocab=None, pretrained=False, ctx=cpu(), - root=os.path.join(get_home_dir(), 'models'), **kwargs): + root=os.path.join(get_home_dir(), 'models'), + hparam_allow_override=False, **kwargs): r"""Standard 2-layer LSTM language model with tied embedding and output weights. Both embedding and hidden dimensions are 200. @@ -303,27 +359,27 @@ def standard_lstm_lm_200(dataset_name=None, vocab=None, pretrained=False, ctx=cp root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. MXNET_HOME defaults to '~/.mxnet'. + hparam_allow_override : bool, default False + If set to True, pre-defined hyper-parameters of the model + (e.g. the number of layers, hidden units) can be overriden. Returns ------- gluon.Block, gluonnlp.Vocab """ - predefined_args = {'embed_size': 200, - 'hidden_size': 200, - 'mode': 'lstm', - 'num_layers': 2, - 'tie_weights': True, - 'dropout': 0.2} - mutable_args = ['dropout'] - assert all((k not in kwargs or k in mutable_args) for k in predefined_args), \ - 'Cannot override predefined model settings.' + predefined_args = standard_lstm_lm_hparams['standard_lstm_lm_200'].copy() + if not hparam_allow_override: + mutable_args = frozenset(['dropout']) + assert all((k not in kwargs or k in mutable_args) for k in predefined_args), \ + 'Cannot override predefined model settings.' 
predefined_args.update(kwargs) return _get_rnn_model(StandardRNN, 'standard_lstm_lm_200', dataset_name, vocab, pretrained, ctx, root, **predefined_args) def standard_lstm_lm_650(dataset_name=None, vocab=None, pretrained=False, ctx=cpu(), - root=os.path.join(get_home_dir(), 'models'), **kwargs): + root=os.path.join(get_home_dir(), 'models'), + hparam_allow_override=False, **kwargs): r"""Standard 2-layer LSTM language model with tied embedding and output weights. Both embedding and hidden dimensions are 650. @@ -347,27 +403,27 @@ def standard_lstm_lm_650(dataset_name=None, vocab=None, pretrained=False, ctx=cp root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. MXNET_HOME defaults to '~/.mxnet'. + hparam_allow_override : bool, default False + If set to True, pre-defined hyper-parameters of the model + (e.g. the number of layers, hidden units) can be overriden. Returns ------- gluon.Block, gluonnlp.Vocab """ - predefined_args = {'embed_size': 650, - 'hidden_size': 650, - 'mode': 'lstm', - 'num_layers': 2, - 'tie_weights': True, - 'dropout': 0.5} - mutable_args = ['dropout'] - assert all((k not in kwargs or k in mutable_args) for k in predefined_args), \ - 'Cannot override predefined model settings.' + predefined_args = standard_lstm_lm_hparams['standard_lstm_lm_650'].copy() + if not hparam_allow_override: + mutable_args = frozenset(['dropout']) + assert all((k not in kwargs or k in mutable_args) for k in predefined_args), \ + 'Cannot override predefined model settings.' predefined_args.update(kwargs) return _get_rnn_model(StandardRNN, 'standard_lstm_lm_650', dataset_name, vocab, pretrained, ctx, root, **predefined_args) def standard_lstm_lm_1500(dataset_name=None, vocab=None, pretrained=False, ctx=cpu(), - root=os.path.join(get_home_dir(), 'models'), **kwargs): + root=os.path.join(get_home_dir(), 'models'), + hparam_allow_override=False, **kwargs): r"""Standard 2-layer LSTM language model with tied embedding and output weights. Both embedding and hidden dimensions are 1500. @@ -391,20 +447,19 @@ def standard_lstm_lm_1500(dataset_name=None, vocab=None, pretrained=False, ctx=c root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. MXNET_HOME defaults to '~/.mxnet'. + hparam_allow_override : bool, default False + If set to True, pre-defined hyper-parameters of the model + (e.g. the number of layers, hidden units) can be overriden. Returns ------- gluon.Block, gluonnlp.Vocab """ - predefined_args = {'embed_size': 1500, - 'hidden_size': 1500, - 'mode': 'lstm', - 'num_layers': 2, - 'tie_weights': True, - 'dropout': 0.65} - mutable_args = ['dropout'] - assert all((k not in kwargs or k in mutable_args) for k in predefined_args), \ - 'Cannot override predefined model settings.' + predefined_args = standard_lstm_lm_hparams['standard_lstm_lm_1500'].copy() + if not hparam_allow_override: + mutable_args = frozenset(['dropout']) + assert all((k not in kwargs or k in mutable_args) for k in predefined_args), \ + 'Cannot override predefined model settings.' 
predefined_args.update(kwargs) return _get_rnn_model(StandardRNN, 'standard_lstm_lm_1500', dataset_name, vocab, pretrained, ctx, root, **predefined_args) @@ -514,8 +569,21 @@ def forward(self, inputs, begin_state): # pylint: disable=arguments-differ out = out.reshape((length, batch_size, -1)) return out, state +big_rnn_lm_2048_512_hparams = { + 'embed_size': 512, + 'hidden_size': 2048, + 'projection_size': 512, + 'num_layers': 1, + 'embed_dropout': 0.1, + 'encode_dropout': 0.1} + +big_rnn_lm_hparams = { + 'big_rnn_lm_2048_512': big_rnn_lm_2048_512_hparams +} + def big_rnn_lm_2048_512(dataset_name=None, vocab=None, pretrained=False, ctx=cpu(), - root=os.path.join(get_home_dir(), 'models'), **kwargs): + root=os.path.join(get_home_dir(), 'models'), + hparam_allow_override=False, **kwargs): r"""Big 1-layer LSTMP language model. Both embedding and projection size are 512. Hidden size is 2048. @@ -539,20 +607,19 @@ def big_rnn_lm_2048_512(dataset_name=None, vocab=None, pretrained=False, ctx=cpu root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. MXNET_HOME defaults to '~/.mxnet'. + hparam_allow_override : bool, default False + If set to True, pre-defined hyper-parameters of the model + (e.g. the number of layers, hidden units) can be overriden. Returns ------- gluon.Block, gluonnlp.Vocab """ - predefined_args = {'embed_size': 512, - 'hidden_size': 2048, - 'projection_size': 512, - 'num_layers': 1, - 'embed_dropout': 0.1, - 'encode_dropout': 0.1} - mutable_args = ['embed_dropout', 'encode_dropout'] - assert all((k not in kwargs or k in mutable_args) for k in predefined_args), \ - 'Cannot override predefined model settings.' + predefined_args = big_rnn_lm_hparams['big_rnn_lm_2048_512'].copy() + if not hparam_allow_override: + mutable_args = frozenset(['embed_dropout', 'encode_dropout']) + assert all((k not in kwargs or k in mutable_args) for k in predefined_args), \ + 'Cannot override predefined model settings.' predefined_args.update(kwargs) return _get_rnn_model(BigRNN, 'big_rnn_lm_2048_512', dataset_name, vocab, pretrained, ctx, root, **predefined_args) diff --git a/src/gluonnlp/model/transformer.py b/src/gluonnlp/model/transformer.py index 08e58e10f5..decad26d45 100644 --- a/src/gluonnlp/model/transformer.py +++ b/src/gluonnlp/model/transformer.py @@ -947,9 +947,23 @@ def _get_transformer_model(model_cls, model_name, dataset_name, src_vocab, tgt_v _load_pretrained_params(net, model_name, dataset_name, root, ctx) return net, src_vocab, tgt_vocab +transformer_en_de_hparams = { + 'num_units': 512, + 'hidden_size': 2048, + 'dropout': 0.1, + 'epsilon': 0.1, + 'num_layers': 6, + 'num_heads': 8, + 'scaled': True, + 'share_embed': True, + 'embed_size': 512, + 'tie_weights': True, + 'embed_initializer': None +} def transformer_en_de_512(dataset_name=None, src_vocab=None, tgt_vocab=None, pretrained=False, - ctx=cpu(), root=os.path.join(get_home_dir(), 'models'), **kwargs): + ctx=cpu(), root=os.path.join(get_home_dir(), 'models'), + hparam_allow_override=False, **kwargs): r"""Transformer pretrained model. Embedding size is 400, and hidden layer size is 1150. @@ -966,26 +980,20 @@ def transformer_en_de_512(dataset_name=None, src_vocab=None, tgt_vocab=None, pre root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. MXNET_HOME defaults to '~/.mxnet'. + hparam_allow_override : bool, default False + If set to True, pre-defined hyper-parameters of the model + (e.g. the number of layers, hidden units) can be overriden. 
Returns ------- gluon.Block, gluonnlp.Vocab, gluonnlp.Vocab """ - predefined_args = {'num_units': 512, - 'hidden_size': 2048, - 'dropout': 0.1, - 'epsilon': 0.1, - 'num_layers': 6, - 'num_heads': 8, - 'scaled': True, - 'share_embed': True, - 'embed_size': 512, - 'tie_weights': True, - 'embed_initializer': None} - mutable_args = frozenset(['num_units', 'hidden_size', 'dropout', 'epsilon', 'num_layers', - 'num_heads', 'scaled']) - assert all((k not in kwargs or k in mutable_args) for k in predefined_args), \ - 'Cannot override predefined model settings.' + predefined_args = transformer_en_de_hparams.copy() + if not hparam_allow_override: + mutable_args = frozenset(['num_units', 'hidden_size', 'dropout', 'epsilon', 'num_layers', + 'num_heads', 'scaled']) + assert all((k not in kwargs or k in mutable_args) for k in predefined_args), \ + 'Cannot override predefined model settings.' predefined_args.update(kwargs) encoder, decoder, one_step_ahead_decoder = get_transformer_encoder_decoder( units=predefined_args['num_units'], hidden_size=predefined_args['hidden_size'], diff --git a/tests/unittest/test_models.py b/tests/unittest/test_models.py index 4a2bafdcf3..9e837c3713 100644 --- a/tests/unittest/test_models.py +++ b/tests/unittest/test_models.py @@ -658,6 +658,70 @@ def forward(self, inpt): assert not mx.test_utils.almost_equal(grads[name].asnumpy(), param.grad().asnumpy()) +# helper method used by test_hparam_allow_override_parameter_in_get_model_api +def verify_get_model_with_hparam_allow_override(models, hparam_allow_override, predefined_args_dict, + mutable_args, dataset_name): + + for model in models: + predefined_args = predefined_args_dict[model].copy() + if hparam_allow_override: + params_that_should_throw_exception = set() + else: + params_that_should_throw_exception = set(predefined_args.keys()) - set(mutable_args) + params_that_threw_exception = set() + for param in predefined_args: + try: + nlp.model.get_model(model, dataset_name=dataset_name, + hparam_allow_override=hparam_allow_override, **{param: predefined_args[param]}) + except: + # we're expecting get_model to fail if hparam_allow_override is False + # and the parameter is not in the set of mutable parameters + expected = not hparam_allow_override and not param in mutable_args + assert expected, 'Unexpected exception when creating model ' + model + ' with '\ + 'parameter ' + param + '.\n' + params_that_threw_exception.add(param) + + assert params_that_threw_exception == params_that_should_throw_exception + + +@pytest.mark.parametrize('hparam_allow_override', [False, True]) +def test_hparam_allow_override_parameter_in_get_model_api(hparam_allow_override): + models = ['awd_lstm_lm_1150', 'awd_lstm_lm_600'] + mutable_args_of_models = ['dropout', 'weight_drop', 'drop_h', 'drop_i', 'drop_e'] + predefined_args_dict = nlp.model.language_model.awd_lstm_lm_hparams.copy() + verify_get_model_with_hparam_allow_override(models, hparam_allow_override, predefined_args_dict, + mutable_args_of_models, 'wikitext-2') + + models = ['standard_lstm_lm_200', 'standard_lstm_lm_650', 'standard_lstm_lm_1500'] + mutable_args_of_models = ['dropout'] + predefined_args_dict = nlp.model.language_model.standard_lstm_lm_hparams.copy() + verify_get_model_with_hparam_allow_override(models, hparam_allow_override, predefined_args_dict, + mutable_args_of_models, 'wikitext-2') + + models = ['big_rnn_lm_2048_512'] + mutable_args_of_models = ['embed_dropout', 'encode_dropout'] + predefined_args_dict = nlp.model.language_model.big_rnn_lm_hparams.copy() + 
verify_get_model_with_hparam_allow_override(models, hparam_allow_override, predefined_args_dict, + mutable_args_of_models, 'wikitext-2') + + models = ['transformer_en_de_512'] + mutable_args_of_models = ['num_units', 'hidden_size', 'dropout', 'epsilon', 'num_layers', + 'num_heads', 'scaled'] + predefined_args_dict = { + 'transformer_en_de_512': nlp.model.transformer.transformer_en_de_hparams.copy() + } + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + verify_get_model_with_hparam_allow_override(models, hparam_allow_override, predefined_args_dict, + mutable_args_of_models, 'WMT2014') + + models = ['distilbert_6_768_12'] + mutable_args_of_models = ['use_residual', 'dropout', 'word_embed'] + predefined_args_dict = nlp.model.bert.bert_hparams.copy() + verify_get_model_with_hparam_allow_override(models, hparam_allow_override, predefined_args_dict, + mutable_args_of_models, 'distilbert_book_corpus_wiki_en_uncased') + + def test_gelu(): x = mx.random.uniform(shape=(3, 4, 5)) net = nlp.model.GELU() From d5d9eb4de85a72d0595ba531484625855d9a8975 Mon Sep 17 00:00:00 2001 From: "WANG, Chen" <16307110064@fudan.edu.cn> Date: Sat, 30 May 2020 01:10:44 +0800 Subject: [PATCH 24/42] [BUGFIX] Change CONLL dataset source to HTTPS (#1236) --- src/gluonnlp/data/conll.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gluonnlp/data/conll.py b/src/gluonnlp/data/conll.py index 951b91e9e7..28ec562670 100644 --- a/src/gluonnlp/data/conll.py +++ b/src/gluonnlp/data/conll.py @@ -140,7 +140,7 @@ def __init__(self, segment='train', 'dc57527f1f60eeafad03da51235185141152f849')} super(CoNLL2000, self).__init__(segment, root) - base_url = 'http://www.clips.uantwerpen.be/conll2000/chunking/' + base_url = 'https://www.clips.uantwerpen.be/conll2000/chunking/' codec = 'utf-8' From 4d1c087f5748c36bcc41da912ec357a564cb1276 Mon Sep 17 00:00:00 2001 From: Avinash Madasu Date: Sun, 31 May 2020 22:42:06 +0530 Subject: [PATCH 25/42] [BUGFIX] Remove test set from validation (#1228) * [BUGFIX] Remove test set from validation * [BUGFIX] Add argument whether to validate test data --- scripts/machine_translation/train_gnmt.py | 18 +++++++++++------- .../machine_translation/train_transformer.py | 10 ---------- 2 files changed, 11 insertions(+), 17 deletions(-) diff --git a/scripts/machine_translation/train_gnmt.py b/scripts/machine_translation/train_gnmt.py index da1c61f2d9..fee6b2fa4b 100644 --- a/scripts/machine_translation/train_gnmt.py +++ b/scripts/machine_translation/train_gnmt.py @@ -100,6 +100,8 @@ help='directory path to save the final model and training log') parser.add_argument('--gpu', type=int, default=None, help='id of the gpu to use. 
Set it to empty means to use cpu.') +parser.add_argument('--validate_on_test_data', type=bool, default=False, + help='To perform validation on test data') args = parser.parse_args() print(args) logging_config(args.save_dir) @@ -246,16 +248,18 @@ def train(): valid_bleu_score, _, _, _, _ = compute_bleu([val_tgt_sentences], valid_translation_out) logging.info('[Epoch {}] valid Loss={:.4f}, valid ppl={:.4f}, valid bleu={:.2f}' .format(epoch_id, valid_loss, np.exp(valid_loss), valid_bleu_score * 100)) - test_loss, test_translation_out = evaluate(test_data_loader) - test_bleu_score, _, _, _, _ = compute_bleu([test_tgt_sentences], test_translation_out) - logging.info('[Epoch {}] test Loss={:.4f}, test ppl={:.4f}, test bleu={:.2f}' - .format(epoch_id, test_loss, np.exp(test_loss), test_bleu_score * 100)) dataprocessor.write_sentences(valid_translation_out, os.path.join(args.save_dir, 'epoch{:d}_valid_out.txt').format(epoch_id)) - dataprocessor.write_sentences(test_translation_out, - os.path.join(args.save_dir, - 'epoch{:d}_test_out.txt').format(epoch_id)) + if args.validate_on_test_data: + test_loss, test_translation_out = evaluate(test_data_loader) + test_bleu_score, _, _, _, _ = compute_bleu([test_tgt_sentences], test_translation_out) + logging.info('[Epoch {}] test Loss={:.4f}, test ppl={:.4f}, test bleu={:.2f}' + .format(epoch_id, test_loss, np.exp(test_loss), test_bleu_score * 100)) + + dataprocessor.write_sentences(test_translation_out, + os.path.join(args.save_dir, + 'epoch{:d}_test_out.txt').format(epoch_id)) if valid_bleu_score > best_valid_bleu: best_valid_bleu = valid_bleu_score save_path = os.path.join(args.save_dir, 'valid_best.params') diff --git a/scripts/machine_translation/train_transformer.py b/scripts/machine_translation/train_transformer.py index baa8249c04..7f6c9d2236 100644 --- a/scripts/machine_translation/train_transformer.py +++ b/scripts/machine_translation/train_transformer.py @@ -350,19 +350,9 @@ def train(): bpe=bpe) logging.info('[Epoch {}] valid Loss={:.4f}, valid ppl={:.4f}, valid bleu={:.2f}' .format(epoch_id, valid_loss, np.exp(valid_loss), valid_bleu_score * 100)) - test_loss, test_translation_out = evaluate(test_data_loader, ctx[0]) - test_bleu_score, _, _, _, _ = compute_bleu([test_tgt_sentences], test_translation_out, - tokenized=tokenized, tokenizer=args.bleu, - split_compound_word=split_compound_word, - bpe=bpe) - logging.info('[Epoch {}] test Loss={:.4f}, test ppl={:.4f}, test bleu={:.2f}' - .format(epoch_id, test_loss, np.exp(test_loss), test_bleu_score * 100)) dataprocessor.write_sentences(valid_translation_out, os.path.join(args.save_dir, 'epoch{:d}_valid_out.txt').format(epoch_id)) - dataprocessor.write_sentences(test_translation_out, - os.path.join(args.save_dir, - 'epoch{:d}_test_out.txt').format(epoch_id)) if valid_bleu_score > best_valid_bleu: best_valid_bleu = valid_bleu_score save_path = os.path.join(args.save_dir, 'valid_best.params') From aa43acbefc5f44ae2c96086bce920ad676ddef34 Mon Sep 17 00:00:00 2001 From: "WANG, Chen" <16307110064@fudan.edu.cn> Date: Sun, 21 Jun 2020 11:07:42 +0800 Subject: [PATCH 26/42] [website] Add a Catalog page for the GluonNLP trained models (#1184) * Add menu for catalog folder * Add wikitext-2 table * Remove the catalog folder and add the training command * Add reference to the paper * Update reference link * Convert train shell into links * Add log information * Separate the log from the command column * Convert hyperlink to anonymous hyperlink * Add Cache Language Model Part * Add machine translation page * Add 
Sentiment Analysis Tables * Add fine-tuned tables * Add reference for GNMT and TextCNN --- scripts/catalog.rst | 336 ++++++++++++++++++++++++++++++++++++++++++++ scripts/index.rst | 2 + 2 files changed, 338 insertions(+) create mode 100644 scripts/catalog.rst diff --git a/scripts/catalog.rst b/scripts/catalog.rst new file mode 100644 index 0000000000..aa8062374a --- /dev/null +++ b/scripts/catalog.rst @@ -0,0 +1,336 @@ +Model Catalog +============= + + + + +Language Model +-------------- +`Language Model Model Zoo Index <./language_model/index.html>`_ + +Word Language Model +~~~~~~~~~~~~~~~~~~~ + +Dataset: Wikitext-2 + ++---------------------------------------+-----------------+-----------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ +| Pre-trained Model | Test Perplexity |Training Command | log | ++=======================================+=================+=============================================================================================================================+=============================================================================================================================+ +| standard_lstm_lm_200_wikitext-2 [1]_ | 101.64 |`command `__ | `log `__ | ++---------------------------------------+-----------------+-----------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ +| standard_lstm_lm_650_wikitext-2 [1]_ | 86.91 |`command `__ | `log `__ | ++---------------------------------------+-----------------+-----------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ +| standard_lstm_lm_1500_wikitext-2 [1]_ | 82.29 |`command `__ | `log `__ | ++---------------------------------------+-----------------+-----------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ +| awd_lstm_lm_600_wikitext-2 [1]_ | 80.67 |`command `__ | `log `__ | ++---------------------------------------+-----------------+-----------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ +| awd_lstm_lm_1150_wikitext-2 [1]_ | 65.62 |`command `__ | `log `__ | ++---------------------------------------+-----------------+-----------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ + + +Cache Language Model +~~~~~~~~~~~~~~~~~~~~ + +Dataset: Wikitext-2 + 
++---------------------------------------------+-----------------+----------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------+ +| Pre-trained Model | Test Perplexity |Training Command | log | ++=============================================+=================+==================================================================================================================================+===============================================================================================================================+ +| cache_awd_lstm_lm_1150_wikitext-2 [2]_ | 51.46 |`command `__ |`log `__ | ++---------------------------------------------+-----------------+----------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------+ +| cache_awd_lstm_lm_600_wikitext-2 [2]_ | 62.19 |`command `__ |`log `__ | ++---------------------------------------------+-----------------+----------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------+ +| cache_standard_lstm_lm_1500_wikitext-2 [2]_ | 62.79 |`command `__ |`log `__ | ++---------------------------------------------+-----------------+----------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------+ +| cache_standard_lstm_lm_650_wikitext-2 [2]_ | 65.85 |`command `__ |`log `__ | ++---------------------------------------------+-----------------+----------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------+ +| cache_standard_lstm_lm_200_wikitext-2 [2]_ | 73.74 |`command `__ |`log `__ | ++---------------------------------------------+-----------------+----------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------+ + + + +Large Scale Word Language Model +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Dataset: Google’s 1 billion words dataset + ++-------------------------+-----------------+-------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------+ +| Pre-trained Model | Test Perplexity |Training Command | log | ++=========================+=================+===================================================================================================================+================================================================================================================+ +| LSTM-2048-512 [3]_ | 43.62 |`command `__ |`log `__ | 
++-------------------------+-----------------+-------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------+ + + +Machine Translation +------------------- +`Machine Translation Model Zoo Index <./machine_translation/index.html>`_ + + +Google Neural Machine Translation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Dataset: IWLST2015-en-vi + ++---------------------+-----------+-------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------+ +| Pre-trained Model | Test BLEU |Training Command | log | ++=====================+===========+===========================================================================================+================================================================================================================+ +| GNMT [4]_ | 26.2 | `command `__ | | ++---------------------+-----------+-------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------+ + + +Transformers +~~~~~~~~~~~~ + +Dataset: WMT14-en-de +Requisite: sacremoses package: pip install scaremoses --user + ++------------------------------+-----------+-------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------+ +| Pre-trained Model | Test BLEU |Training Command | log | ++==============================+===========+===================================================================================================================+================================================================================================================+ +| transformer_en_de_512_WMT2014| 27.65 | `command `__ |`log `__ | ++------------------------------+-----------+-------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------+ + + +Sentiment Analysis +------------------ +`Sentiment Analysis Model Zoo Index <./sentiment_analysis/index.html>`_ + +Through Fine-tuning Word Language Model +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Dataset: IMDB + ++------------------------------+---------------+---------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------+ +| Model | Test Accuracy |Training Command | log | ++==============================+===============+=====================================================================================================================+==================================================================================================================+ +| lstm from scratch | 85.60% | `command `__ | `log `__ | 
++------------------------------+---------------+---------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------+ +| lstm with pre-trained model | 86.46% | `command `__ | `log `__ | ++------------------------------+---------------+---------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------+ + +TextCNN +~~~~~~~ + +Dataset: MR + ++--------------------------+---------------------------+------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------+ +| Model | Cross-Validation Accuracy | Training Command | Log | ++==========================+===========================+==================================================================================================================+===============================================================================================================+ +| TextCNN-rand [5]_ | 75.80% | `command `__ | `log `__ | ++--------------------------+---------------------------+------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------+ +| TextCNN-static [5]_ | 79.40% | `command `__ | `log `__ | ++--------------------------+---------------------------+------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------+ +| TextCNN-non-static [5]_ | 80.00% | `command `__ | `log `__ | ++--------------------------+---------------------------+------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------+ +|TextCNN-multichannel [5]_ | 80.00% | `command `__ | `log `__ | ++--------------------------+---------------------------+------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------+ + +Dataset: Subj + ++---------------------------+---------------------------+------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------+ +| Model | Cross-Validation Accuracy | Training Command | Log | ++===========================+===========================+==================================================================================================================+===============================================================================================================+ +| TextCNN-rand [5]_ | 89.30% | `command `__ | `log `__ | 
++---------------------------+---------------------------+------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------+ +| TextCNN-static [5]_ | 91.80% | `command `__ | `log `__ | ++---------------------------+---------------------------+------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------+ +| TextCNN-non-static [5]_ | 91.90% | `command `__ | `log `__ | ++---------------------------+---------------------------+------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------+ +| TextCNN-multichannel [5]_ | 92.10% | `command `__ | `log `__ | ++---------------------------+---------------------------+------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------+ + +Dataset: CR + ++---------------------------+---------------------------+------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------+ +| Model | Cross-Validation Accuracy | Training Command | Log | ++===========================+===========================+==================================================================================================================+===============================================================================================================+ +| TextCNN-rand [5]_ | 79.50% | `command `__ | `log `__ | ++---------------------------+---------------------------+------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------+ +| TextCNN-static [5]_ | 83.10% | `command `__ | `log `__ | ++---------------------------+---------------------------+------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------+ +| TextCNN-non-static [5]_ | 82.90% | `command `__ | `log `__ | ++---------------------------+---------------------------+------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------+ +| TextCNN-multichannel [5]_ | 83.30% | `command `__ | `log `__ | ++---------------------------+---------------------------+------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------+ + +Dataset: MPQA + 
++---------------------------+---------------------------+------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------+ +| Model | Cross-Validation Accuracy | Training Command | Log | ++===========================+===========================+==================================================================================================================+===============================================================================================================+ +| TextCNN-rand [5]_ | 85.30% | `command `__ | `log `__ | ++---------------------------+---------------------------+------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------+ +| TextCNN-static [5]_ | 89.60% | `command `__ | `log `__ | ++---------------------------+---------------------------+------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------+ +| TextCNN-non-static [5]_ | 89.20% | `command `__ | `log `__ | ++---------------------------+---------------------------+------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------+ +| TextCNN-multichannel [5]_ | 89.60% | `command `__ | `log `__ | ++---------------------------+---------------------------+------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------+ + +Dataset: SST-1 + ++---------------------------+---------------------------+------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------+ +| Model | Cross-Validation Accuracy | Training Command | Log | ++===========================+===========================+==================================================================================================================+===============================================================================================================+ +| TextCNN-rand [5]_ | 44.30% | `command `__ | `log `__ | ++---------------------------+---------------------------+------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------+ +| TextCNN-static [5]_ | 48.10% | `command `__ | `log `__ | ++---------------------------+---------------------------+------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------+ +| TextCNN-non-static [5]_ | 47.00% | `command `__ | `log `__ | 
++---------------------------+---------------------------+------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------+ +| TextCNN-multichannel [5]_ | 48.10% | `command `__ | `log `__ | ++---------------------------+---------------------------+------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------+ + +Dataset: SST-2 + ++---------------------------+---------------------------+------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------+ +| Model | Cross-Validation Accuracy | Training Command | Log | ++===========================+===========================+==================================================================================================================+===============================================================================================================+ +| TextCNN-rand [5]_ | 82.10% | `command `__ | `log `__ | ++---------------------------+---------------------------+------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------+ +| TextCNN-static [5]_ | 87.10% | `command `__ | `log `__ | ++---------------------------+---------------------------+------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------+ +| TextCNN-non-static [5]_ | 85.60% | `command `__ | `log `__ | ++---------------------------+---------------------------+------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------+ +| TextCNN-multichannel [5]_ | 85.80% | `command `__ | `log `__ | ++---------------------------+---------------------------+------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------+ + +Dataset: TREC + ++---------------------------+---------------------------+------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------+ +| Model | Cross-Validation Accuracy | Training Command | Log | ++===========================+===========================+==================================================================================================================+===============================================================================================================+ +| TextCNN-rand [5]_ | 90.20% | `command `__ | `log `__ | 
++---------------------------+---------------------------+------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------+ +| TextCNN-static [5]_ | 91.40% | `command `__ | `log `__ | ++---------------------------+---------------------------+------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------+ +| TextCNN-non-static [5]_ | 93.20% | `command `__ | `log `__ | ++---------------------------+---------------------------+------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------+ +| TextCNN-multichannel [5]_ | 93.20% | `command `__ | `log `__ | ++---------------------------+---------------------------+------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------+ + +Finetuning +---------- +`BERT Model Zoo Index <./bert/index.html>`_ + +Task: Sentence Classification +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Dataset: MRPC + ++------------------+---------------------+------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------+ +| Pretrained Model | Validation Accuracy | Training Command | Log | ++==================+=====================+==================================================================================================================+===============================================================================================================+ +| BERT-base | 88.70% | `command `__ | `log `__ | ++------------------+---------------------+------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------+ + +Dataset: RTE + ++------------------+---------------------+------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------+ +| Pretrained Model | Validation Accuracy | Training Command | Log | ++==================+=====================+==================================================================================================================+===============================================================================================================+ +| BERT-base | 70.80% | `command `__ | `log `__ | ++------------------+---------------------+------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------+ + +Dataset: SST-2 + 
++------------------+---------------------+------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------+ +| Pretrained Model | Validation Accuracy | Training Command | Log | ++==================+=====================+==================================================================================================================+===============================================================================================================+ +| BERT-base | 93% | `command `__ | `log `__ | ++------------------+---------------------+------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------+ +| RoBERTa-base | 95.3% | `command `__ | `log `__ | ++------------------+---------------------+------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------+ + + +Dataset: MNLI-M/MM + ++------------------+---------------------+------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------+ +| Pretrained Model | Validation Accuracy | Training Command | Log | ++==================+=====================+==================================================================================================================+===============================================================================================================+ +| BERT-base | 84.55%/84.66% | `command `__ | `log `__ | ++------------------+---------------------+------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------+ +| RoBERTa-base | 87.69%/87.23% | `command `__ | `log `__ | ++------------------+---------------------+------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------+ + +Dataset: XNLI(Chinese) + ++------------------+---------------------+------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------+ +| Pretrained Model | Validation Accuracy | Training Command | Log | ++==================+=====================+==================================================================================================================+===============================================================================================================+ +| BERT-base | 78.27% | `command `__ | `log `__ | ++------------------+---------------------+------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------+ + +Task: Question Answering 
+~~~~~~~~~~~~~~~~~~~~~~~~ + +Dataset: SQuAD 1.1 + ++------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------+ +| Pretrained Model | F1/EM | Training Command | Log | ++==================+=====================+====================================================================================================================+=================================================================================================================+ +| BERT-base | 88.53%/80.98% |`command `__ |`log `__ | ++------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------+ +| BERT-large | 90.97%/84.05% |`command `__ |`log `__ | ++------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------+ + +Dataset: SQuAD 2.0 + ++------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------+ +| Pretrained Model | F1/EM | Training Command | Log | ++==================+=====================+====================================================================================================================+=================================================================================================================+ +| BERT-large | 77.96%/81.02% |`command `__ |`log `__ | ++------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------+ + +Task: Named Entity Recognition +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Requisite: python3 and seqeval package: pip3 install seqeval --user + +Dataset: CoNLL-2003 + ++------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------+ +| Pretrained Model | F1 | Training Command | Log | ++==================+=====================+====================================================================================================================+=================================================================================================================+ +| BERT-large | 92.20% | |`log `__ | ++------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------+ + +Task: Joint Intent Classification and Slot Labelling +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Requisite: python3 and seqeval & tqdm packages: pip3 install seqeval --user and pip3 
install tqdm --user + +Dataset: ATIS + ++------------------+---------------------+------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------+ +| Pretrained Model | F1/Accuracy | Training Command | Log | ++==================+=====================+==================================================================================================================+===============================================================================================================+ +| BERT-base | 95.83%/98.66% | | | ++------------------+---------------------+------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------+ + +Dataset: SNIPS + ++------------------+---------------------+------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------+ +| Pretrained Model | F1/Accuracy | Training Command | Log | ++==================+=====================+==================================================================================================================+===============================================================================================================+ +| BERT-base | 96.06%/98.71% | | | ++------------------+---------------------+------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------+ + + + + +.. [1] Merity, S., et al. \ + "`Regularizing and optimizing LSTM language models `_". \ + ICLR 2018 +.. [2] Grave, E., et al. \ + "`Improving neural language models with a continuous cache `_".\ + ICLR 2017 +.. [3] Jozefowicz, Rafal, et al. \ + "`Exploring the limits of language modeling `_".\ + arXiv preprint arXiv:1602.02410 (2016). +.. [4] Wu, Y., Schuster, M., Chen, Z., Le, Q. V., Norouzi, M., Macherey, W., ... & Klingner, J. (2016). \ + "`Google's neural machine translation system: Bridging the gap between human and machine translation. `_". \ + arXiv preprint arXiv:1609.08144. +.. [5] Kim, Y. (2014). \ + "`Convolutional neural networks for sentence classification `_". \ + arXiv preprint arXiv:1408.5882. 
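For reference, the training commands in the tables above save parameter checkpoints that can be reloaded for inference with the same GluonNLP APIs used by the scripts later in this series (``get_model`` and ``BERTClassifier``). The following is a minimal sketch, not taken verbatim from any script above; the checkpoint filename ``model_bert_SST.params``, the example sentence, and the use of ``num_classes=2`` for the binary SST task are placeholders/assumptions for illustration only.

.. code-block:: python

    import mxnet as mx
    import gluonnlp as nlp
    from gluonnlp.model import get_model, BERTClassifier

    ctx = mx.cpu()
    # Backbone without the pre-training heads; the pooler output feeds the classifier.
    bert, vocab = get_model(name='bert_12_768_12',
                            dataset_name='book_corpus_wiki_en_uncased',
                            pretrained=False, use_pooler=True,
                            use_decoder=False, use_classifier=False, ctx=ctx)
    net = BERTClassifier(bert, num_classes=2, dropout=0.1)  # SST is a binary task
    # 'model_bert_SST.params' is a placeholder for a checkpoint saved by finetune_classifier.py
    net.load_parameters('model_bert_SST.params', ctx=ctx)

    # Uncased vocabulary, so lowercase the input before subword tokenization.
    tokenizer = nlp.data.BERTTokenizer(vocab=vocab, lower=True)
    tokens = [vocab.cls_token] + tokenizer('a deeply moving film') + [vocab.sep_token]
    token_ids = mx.nd.array([vocab[tokens]], ctx=ctx)
    token_types = mx.nd.zeros_like(token_ids)
    valid_length = mx.nd.array([len(tokens)], ctx=ctx)

    # Output shape is (batch_size, num_classes); the argmax gives the predicted label.
    logits = net(token_ids, token_types, valid_length)
    print(logits.argmax(axis=1))

The same loading pattern applies to the question-answering checkpoints above, with ``BertForQA`` (from the BERT scripts) in place of ``BERTClassifier``.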
diff --git a/scripts/index.rst b/scripts/index.rst index ca7d84ac00..6266a06b54 100644 --- a/scripts/index.rst +++ b/scripts/index.rst @@ -79,6 +79,8 @@ Model Zoo :hidden: :maxdepth: 1 + + catalog word_embeddings/index language_model/index machine_translation/index From 1e66d8085bc29ac976da78fedd6e294c0d3b0880 Mon Sep 17 00:00:00 2001 From: "WANG, Chen" Date: Wed, 24 Jun 2020 06:58:53 +0800 Subject: [PATCH 27/42] Lift timeout restriction per notebook to 90 minutes (#1252) --- docs/md2ipynb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/md2ipynb.py b/docs/md2ipynb.py index 21e466b79c..ef9edf475a 100644 --- a/docs/md2ipynb.py +++ b/docs/md2ipynb.py @@ -12,7 +12,7 @@ args = parser.parse_args() # timeout for each notebook, in sec -timeout = 40 * 60 +timeout = 90 * 60 # the files will be ignored for execution ignore_execution = [] From fb6290aa8b0b02625bf5dc2701e11bdaf53e9c8d Mon Sep 17 00:00:00 2001 From: "WANG, Chen" Date: Tue, 7 Jul 2020 01:05:22 +0800 Subject: [PATCH 28/42] Update mxnet intersphinx to release site (#1256) --- docs/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/conf.py b/docs/conf.py index 1543c0ed56..f707e1277f 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -210,7 +210,7 @@ intersphinx_mapping = { 'python': ('https://docs.python.org/{.major}'.format(sys.version_info), None), - 'mxnet': ('https://beta.mxnet.io/', None), + 'mxnet': ('https://mxnet.apache.org/api/python/docs/', None), 'numpy': ('http://docs.scipy.org/doc/numpy/', None), 'scipy': ('http://docs.scipy.org/doc/scipy/reference', None), 'matplotlib': ('http://matplotlib.org/', None), From 03a394597b69aac75f60d78f2fb2c2db00e9c411 Mon Sep 17 00:00:00 2001 From: "WANG, Chen" Date: Wed, 8 Jul 2020 04:45:13 +0800 Subject: [PATCH 29/42] Lift timeout restriction on CI configuration (#1255) --- ci/jenkins/Jenkinsfile_py3-master_gpu_doc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/jenkins/Jenkinsfile_py3-master_gpu_doc b/ci/jenkins/Jenkinsfile_py3-master_gpu_doc index 82d6cc5fee..745608d052 100644 --- a/ci/jenkins/Jenkinsfile_py3-master_gpu_doc +++ b/ci/jenkins/Jenkinsfile_py3-master_gpu_doc @@ -70,7 +70,7 @@ core_logic: { conda activate ./conda/cpu/py3-master python3 ci/batch/submit-job.py --region us-east-1 --wait \ - --timeout 1800 --saved-output ./docs/examples --conda-env docker/py3 \ + --timeout 3600 --saved-output ./docs/examples --conda-env docker/py3 \ --name GluonNLP-${env.BRANCH_NAME}-${env.BUILD_NUMBER} \ --save-path batch/${env.BRANCH_NAME}/${env.BUILD_NUMBER}/docs/examples \ --work-dir . --source-ref refs/pull/${env.CHANGE_ID}/head \ @@ -103,7 +103,7 @@ core_logic: { conda activate ./conda/cpu/py3-master python3 ci/batch/submit-job.py --region us-east-1 --wait \ - --timeout 1800 --saved-output ./docs/examples --conda-env docker/py3 \ + --timeout 3600 --saved-output ./docs/examples --conda-env docker/py3 \ --name GluonNLP-${env.BRANCH_NAME}-${env.BUILD_NUMBER} \ --save-path batch/${env.BRANCH_NAME}/${env.BUILD_NUMBER}/docs/examples \ --work-dir . 
--source-ref ${env.BRANCH_NAME} \ From 528283dfe714ac3ce56ddbef62643f07ecb0c11a Mon Sep 17 00:00:00 2001 From: xcgoner Date: Wed, 5 Aug 2020 19:10:53 -0500 Subject: [PATCH 30/42] [BUGFIX] fix NTA implementation (#1277) * fix NTA * update language model webpage * update language model webpage * trigger ci test again * trigger ci test again --- scripts/language_model/index.rst | 28 +++++++++---------- scripts/language_model/word_language_model.py | 2 +- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/scripts/language_model/index.rst b/scripts/language_model/index.rst index 9a69f347e0..15ca929351 100644 --- a/scripts/language_model/index.rst +++ b/scripts/language_model/index.rst @@ -36,9 +36,9 @@ The dataset used for training the models is wikitext-2. +---------------+----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+ | Weight_drop | 0.5 | 0.2 | 0 | 0 | 0 | +---------------+----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+ -| Val PPL | 68.71 | 84.89 | 86.51 | 90.96 | 107.59 | +| Val PPL | 71.78 | 80.11 | 86.28 | 91.30 | 108.17 | +---------------+----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+ -| Test PPL | 65.62 | 80.67 | 82.29 | 86.91 | 101.64 | +| Test PPL | 68.55 | 76.14 | 81.99 | 85.82 | 102.49 | 
+---------------+----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+ | Command | [1] | [2] | [3] | [4] | [5] | +---------------+----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+ @@ -47,31 +47,31 @@ The dataset used for training the models is wikitext-2. For all the above model settings, we set Tied = True and NTASGD = True . -[1] awd_lstm_lm_1150_wikitext-2 (Val PPL 68.71 Test PPL 65.62 ) +[1] awd_lstm_lm_1150_wikitext-2 (Val PPL 71.78 Test PPL 68.55 ) .. code-block:: console $ python word_language_model.py --gpu 0 --tied --ntasgd --lr_update_interval 30 --lr_update_factor 0.1 --save awd_lstm_lm_1150_wikitext-2 -[2] awd_lstm_lm_600_wikitext-2 (Val PPL 84.89 Test PPL 80.67) +[2] awd_lstm_lm_600_wikitext-2 (Val PPL 80.11 Test PPL 76.14) .. code-block:: console $ python word_language_model.py --gpu 0 --emsize 200 --nhid 600 --epochs 750 --dropout 0.2 --dropout_h 0.1 --dropout_i 0.3 --dropout_e 0.05 --weight_drop 0.2 --tied --ntasgd --lr_update_interval 30 --lr_update_factor 0.1 --save awd_lstm_lm_600_wikitext-2 -[3] standard_lstm_lm_1500_wikitext-2 (Val PPL 86.51 Test PPL 82.29) +[3] standard_lstm_lm_1500_wikitext-2 (Val PPL 86.28 Test PPL 81.99) .. code-block:: console $ python word_language_model.py --gpu 0 --emsize 1500 --nhid 1500 --nlayers 2 --lr 20 --epochs 750 --batch_size 20 --bptt 35 --dropout 0.65 --dropout_h 0 --dropout_i 0 --dropout_e 0 --weight_drop 0 --tied --wd 0 --alpha 0 --beta 0 --ntasgd --lr_update_interval 30 --lr_update_factor 0.1 --save standard_lstm_lm_1500_wikitext-2 -[4] standard_lstm_lm_650_wikitext-2 (Val PPL 90.96 Test PPL 86.91) +[4] standard_lstm_lm_650_wikitext-2 (Val PPL 91.30 Test PPL 85.82) .. code-block:: console $ python word_language_model.py --gpu 0 --emsize 650 --nhid 650 --nlayers 2 --lr 20 --epochs 750 --batch_size 20 --bptt 35 --dropout 0.5 --dropout_h 0 --dropout_i 0 --dropout_e 0 --weight_drop 0 --tied --wd 0 --alpha 0 --beta 0 --ntasgd --lr_update_interval 30 --lr_update_factor 0.1 --save standard_lstm_lm_650_wikitext-2 -[5] standard_lstm_lm_200_wikitext-2 (Val PPL 107.59 Test PPL 101.64) +[5] standard_lstm_lm_200_wikitext-2 (Val PPL 108.17 Test PPL 102.49) .. code-block:: console @@ -93,9 +93,9 @@ The dataset used for training the models is wikitext-2. 
+=====================+===================================================================================================================================+==================================================================================================================================+========================================================================================================================================+=======================================================================================================================================+=======================================================================================================================================+ | Pre-trained setting | Refer to: awd_lstm_lm_1150_wikitext-2 | Refer to: awd_lstm_lm_600_wikitext-2 | Refer to: standard_lstm_lm_1500_wikitext-2 | Refer to: standard_lstm_lm_650_wikitext-2 | Refer to: standard_lstm_lm_200_wikitext-2 | +---------------------+-----------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ -| Val PPL | 53.41 | 64.51 | 65.54 | 68.47 | 77.51 | +| Val PPL | 58.18 | 64.09 | 73.19 | 69.27 | 81.68 | +---------------------+-----------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ -| Test PPL | 51.46 | 62.19 | 62.79 | 65.85 | 73.74 | +| Test PPL | 56.08 | 61.62 | 70.91 | 66.39 | 77.83 | +---------------------+-----------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ | Command | [1] | [2] | [3] | [4] | [5] | 
+---------------------+-----------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ @@ -104,31 +104,31 @@ The dataset used for training the models is wikitext-2. For all the above model settings, we set lambdas = 0.1279, theta = 0.662, window = 2000 and bptt= 2000 . -[1] cache_awd_lstm_lm_1150_wikitext-2 (Val PPL 53.41 Test PPL 51.46) +[1] cache_awd_lstm_lm_1150_wikitext-2 (Val PPL 58.18 Test PPL 56.08) .. code-block:: console $ python cache_language_model.py --gpus 0 --model_name awd_lstm_lm_1150 -[2] cache_awd_lstm_lm_600_wikitext-2 (Val PPL 64.51 Test PPL 62.19) +[2] cache_awd_lstm_lm_600_wikitext-2 (Val PPL 64.09 Test PPL 61.62) .. code-block:: console $ python cache_language_model.py --gpus 0 --model_name awd_lstm_lm_600 -[3] cache_standard_lstm_lm_1500_wikitext-2 (Val PPL 65.54 Test PPL 62.79) +[3] cache_standard_lstm_lm_1500_wikitext-2 (Val PPL 73.19 Test PPL 70.91) .. code-block:: console $ python cache_language_model.py --gpus 0 --model_name standard_lstm_lm_1500 -[4] cache_standard_lstm_lm_650_wikitext-2 (Val PPL 68.47 Test PPL 65.85) +[4] cache_standard_lstm_lm_650_wikitext-2 (Val PPL 69.27 Test PPL 66.39) .. code-block:: console $ python cache_language_model.py --gpus 0 --model_name standard_lstm_lm_650 -[5] cache_standard_lstm_lm_200_wikitext-2 (Val PPL 77.51 Test PPL 73.74) +[5] cache_standard_lstm_lm_200_wikitext-2 (Val PPL 81.68 Test PPL 77.83) .. 
code-block:: console diff --git a/scripts/language_model/word_language_model.py b/scripts/language_model/word_language_model.py index 12df344d79..2833361d4f 100644 --- a/scripts/language_model/word_language_model.py +++ b/scripts/language_model/word_language_model.py @@ -426,7 +426,7 @@ def train(): trainer.learning_rate)) if args.ntasgd and avg_trigger == 0: - if t > n and val_L > min(valid_losses[-n:]): + if t > n and val_L > min(valid_losses[:-n]): if param_dict_avg is None: param_dict_avg = {k.split(model._prefix)[1]: v.data(context[0]).copy() for k, v in parameters.items()} From d75185ec7eb1eb082ee92992be8677666aaf7ec7 Mon Sep 17 00:00:00 2001 From: Haibin Lin Date: Wed, 12 Aug 2020 14:36:22 -0700 Subject: [PATCH 31/42] fix bias concat (#1296) Co-authored-by: Lin --- src/gluonnlp/model/bert.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/gluonnlp/model/bert.py b/src/gluonnlp/model/bert.py index ed29c8aaae..5bd8b1a8ef 100644 --- a/src/gluonnlp/model/bert.py +++ b/src/gluonnlp/model/bert.py @@ -134,7 +134,11 @@ def hybrid_forward(self, F, qkv, valid_len, query_bias, key_bias, value_bias, value_weight = value_weight.reshape(shape=(self._num_heads, -1, 0), reverse=True) in_weight = F.concat(query_weight, key_weight, value_weight, dim=-2) in_weight = in_weight.reshape(shape=(-1, 0), reverse=True) - in_bias = F.concat(query_bias, key_bias, value_bias, dim=0) + # concat bias + query_bias = query_bias.reshape(shape=(self._num_heads, -1), reverse=True) + key_bias = key_bias.reshape(shape=(self._num_heads, -1), reverse=True) + value_bias = value_bias.reshape(shape=(self._num_heads, -1), reverse=True) + in_bias = F.stack(query_bias, key_bias, value_bias, axis=1).reshape(-1) # qkv_proj shape = (seq_length, batch_size, num_heads * head_dim * 3) qkv_proj = F.FullyConnected(data=qkv, weight=in_weight, bias=in_bias, From 2c9c1f5ca3c2b014835c39011de2e241d87ba28e Mon Sep 17 00:00:00 2001 From: Leonard Lausen Date: Tue, 1 Sep 2020 18:25:00 -0700 Subject: [PATCH 32/42] Fix env/docker/py3.yml (#1325) * Fix env/docker/py3.yml * mxnet-cu101==1.6.0.post0 * Update Jenkinsfile_py3-master_gpu_doc * Switch to Python 3.6 for Batch * Update Jenkinsfile_py3-master_gpu_doc * Update py3.yml * Update py3.yml * Update py3.yml --- env/docker/py3.yml | 53 +++++++++++++++++++++++----------------------- env/gpu/py3.yml | 2 +- 2 files changed, 28 insertions(+), 27 deletions(-) diff --git a/env/docker/py3.yml b/env/docker/py3.yml index 11b34e4d25..5f99c7e013 100644 --- a/env/docker/py3.yml +++ b/env/docker/py3.yml @@ -1,43 +1,44 @@ channels: - conda-forge dependencies: - - python=3.5 + - python=3.6 - pip - perl - pandoc=2.9.2.1 - tornado=5.1.1 - sphinx=2.4.4 - pip: - - numpy==1.18.4 + - numpy==1.19.1 - notedown==1.5.1 - - sphinx-gallery==0.6.2 + - nbformat==5.0.7 + - sphinx-gallery==0.7.0 - recommonmark==0.6.0 - nbconvert==5.6.1 - - nbsphinx==0.6.1 + - nbsphinx==0.7.1 - ipython - ipykernel - - numba==0.47 - - llvmlite==0.31.0 + - numba==0.51.1 + - llvmlite==0.34.0 - https://github.com/szha/mx-theme/tarball/master - - seaborn - - jieba - - scikit-learn==0.21.3 - - cython - - pytype==2020.5.7 - - pytest==5.2.3 + - seaborn==0.10.1 + - jieba==0.42.1 + - scikit-learn==0.23.2 + - cython==0.29.21 + - pytype==2020.8.17 + - pytest==6.0.1 - pytest-env==0.6.2 - - pytest-cov==2.8.1 - - pytest-xdist==1.30.0 - - pylint==2.4.4 + - pytest-cov==2.10.1 + - pytest-xdist==2.1.0 + - pylint==2.6.0 - pylint-quotes==0.2.1 - - flaky==3.6.1 - - flake8==3.7.9 - - mock<3 - - mxnet-cu101==1.6.0 - - scipy==1.3.2 - - 
regex==2020.4.4 - - nltk==3.4.5 - - sacremoses==0.0.35 - - spacy==2.2.2 - - sentencepiece==0.1.83 - - sphinx-autodoc-typehints==1.7.0 + - flaky==3.7.0 + - flake8==3.8.3 + - mock==4.0.2 + - mxnet-cu101==1.6.0.post0 + - scipy==1.5.2 + - regex==2020.7.14 + - nltk==3.5 + - sacremoses==0.0.43 + - spacy==2.3.2 + - sentencepiece==0.1.91 + - sphinx-autodoc-typehints==1.11.0 diff --git a/env/gpu/py3.yml b/env/gpu/py3.yml index 6eed672474..3238774479 100644 --- a/env/gpu/py3.yml +++ b/env/gpu/py3.yml @@ -33,7 +33,7 @@ dependencies: - flaky==3.6.1 - flake8==3.7.9 - mock<3 - - mxnet-cu101==1.6.0 + - mxnet-cu101==1.6.0.post0 - scipy==1.3.2 - regex==2020.4.4 - nltk==3.4.5 From 7f29267155f9afa0429a323e22fdc918ddb52358 Mon Sep 17 00:00:00 2001 From: MoisesHer <50716238+MoisesHer@users.noreply.github.com> Date: Thu, 10 Sep 2020 09:06:31 -0700 Subject: [PATCH 33/42] Deploy BERT model - Script (#1237) * Add example script to deploy BERT * Add options to better measure performance * Allow specification of path for exported model * Add option to use custom graph pass * Add optimization for MHA in custom graph pass * Correct bug with input shapes in optimize_for * correct typo * fix lint * fix lint * Add documentation * Add documentation for using deploy script * Correct typo/add spaces in documentation * Add setup.py to compile pass, update documentation * Fix bug in path to include dir & fix pylint * Add unitest for deploy bert script * change CUDA version in wheel * test latest wheel * change path to custom pass library * fixing trigger custom pass compilation * fix lint * fix lint * Update mxnet pip version * Only GPU versions changed * fix lint * change wheel to include mkl headers * lint docstring * remove debug print * change include paths * lint * debugging lib_api.h * debugging lib_api.h * debugging * Disable test for now * skip test if mxnet_version < 1.7.0 * use pytest.mark.skipif to skip test * test only BERT-base (fp16/fp32, SST/QA, embeddings) to avoid timeout Co-authored-by: Leonard Lausen --- env/gpu/py3-master.yml | 2 +- scripts/bert/bertpass_gpu.cc | 450 ++++++++++++++++++ scripts/bert/deploy.py | 702 ++++++++++++++++++++++++++++ scripts/bert/finetune_classifier.py | 533 ++++++++++----------- scripts/bert/finetune_squad.py | 652 +++++++++++++------------- scripts/bert/index.rst | 15 +- scripts/bert/setup.py | 48 ++ scripts/tests/test_scripts.py | 22 + 8 files changed, 1828 insertions(+), 596 deletions(-) create mode 100644 scripts/bert/bertpass_gpu.cc create mode 100644 scripts/bert/deploy.py create mode 100644 scripts/bert/setup.py diff --git a/env/gpu/py3-master.yml b/env/gpu/py3-master.yml index a29bbbb149..77a6e17552 100644 --- a/env/gpu/py3-master.yml +++ b/env/gpu/py3-master.yml @@ -33,7 +33,7 @@ dependencies: - flaky==3.6.1 - flake8==3.7.9 - mock<3 - - https://repo.mxnet.io/dist/python/cu100/mxnet_cu100-1.6.0-py2.py3-none-manylinux1_x86_64.whl + - https://repo.mxnet.io/dist/python/cu100/mxnet_cu100-1.7.0b20200809-py2.py3-none-manylinux2014_x86_64.whl - scipy==1.3.2 - regex==2020.4.4 - nltk==3.4.5 diff --git a/scripts/bert/bertpass_gpu.cc b/scripts/bert/bertpass_gpu.cc new file mode 100644 index 0000000000..a773698454 --- /dev/null +++ b/scripts/bert/bertpass_gpu.cc @@ -0,0 +1,450 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2019 by Contributors + * \file subgraph_lib.cc + * \brief subgraph operator implementation library file + */ + +#include +#include +#include +#include +#include +#include "mxnet/lib_api.h" + +class Node; +struct NodeEntry { + Node* node; + int entry; +}; + +class Node { + public: + std::string op,name; + std::vector inputs; + std::vector outputs; + std::unordered_map attrs; +}; + +class Graph { + public: + Graph() {} + static Graph fromString(const std::string& json) { + JsonParser parser; + JsonVal val = parser.parse_to_json(json); + return fromJson(val); + } + ~Graph() { + for(int i=0; i nodeMap; + // loop over nodes + for(int i=0; iop = node.map[JsonVal("op")].str; + n->name = node.map[JsonVal("name")].str; + + // if op is null its an input to the graph + if(n->op.compare("null") == 0) + g.inputs.push_back(n); + + // set attrs + JsonVal attributes = node.map[JsonVal("attrs")]; + for(auto& kv : attributes.map) { + n->attrs[kv.first.str] = kv.second.str; + } + + // set node inputs + JsonVal node_inputs = node.map[JsonVal("inputs")]; + n->inputs.resize(node_inputs.list.size()); + for(int j=0; jinputs[j]; + //get pointer to other node + entry.node = nodeMap[input.list[0].num]; + //get the other node's output index + entry.entry = input.list[1].num; + //set other nodes output as connected to this node + entry.node->outputs.push_back({n,j}); + } + nodeMap[i] = n; + } + + JsonVal& heads = val.map[JsonVal("heads")]; + g.outputs.resize(heads.list.size()); + for(int i=0; i nodeMap; + std::vector sorted = topological_sort(); + for(int i=sorted.size()-1; i>=0; i--) { + nodeMap[sorted[i]] = sorted.size()-1-i; + } + + val.map[JsonVal("node_row_ptr")] = JsonVal(LIST); + JsonVal& node_row_ptr = val.map[JsonVal("node_row_ptr")]; + for(int i=0; i=0; i--) { + nodes_.list.push_back(JsonVal(MAP)); + Node* n = sorted[i]; + JsonVal& n_ = nodes_.list[nodes_.list.size()-1]; + + n_.map[JsonVal("op")] = JsonVal(n->op); + n_.map[JsonVal("name")] = JsonVal(n->name); + n_.map[JsonVal("inputs")] = JsonVal(LIST); + + JsonVal& inputs_ = n_.map[JsonVal("inputs")]; + for(int j=0; jinputs.size(); j++) { + inputs_.list.push_back(JsonVal(LIST)); + NodeEntry& entry = n->inputs[j]; + JsonVal& in = inputs_.list[j]; + in.list.push_back(JsonVal(nodeMap[entry.node])); + in.list.push_back(JsonVal(entry.entry)); + in.list.push_back(JsonVal(0)); + } + + n_.map[JsonVal("attrs")] = JsonVal(MAP); + JsonVal& attrs_ = n_.map[JsonVal("attrs")]; + for(auto& kv : n->attrs) { + attrs_.map[JsonVal(kv.first)] = JsonVal(kv.second); + } + } + return val; + } + std::string toString() { + JsonParser parser; + return parser.dump(toJson()); + } + + void _dfs_util(Node* n, std::unordered_set* to_visit, + std::function handler) { + to_visit->erase(n); + for(NodeEntry& e : n->outputs) { + Node* o = e.node; + if(to_visit->count(o) != 0) { + _dfs_util(o,to_visit,handler); + } + } + handler(n); + } + + void DFS(std::function handler) { + 
std::unordered_set to_visit; + //put all nodes in set to visit + for(auto& n : nodes) + to_visit.insert(n); + //visit all inputs first + for(auto& i : inputs) + if(to_visit.count(i) != 0) + _dfs_util(i, &to_visit, handler); + //visit any nodes left + while(to_visit.size() > 0) + _dfs_util(*(to_visit.begin()), &to_visit, handler); + } + + std::vector topological_sort() { + std::vector sorted; + auto handler = [&](Node* n) { + sorted.push_back(n); + }; + DFS(handler); + return sorted; + } + + void print() { + std::cout << "########### Graph #############" << std::endl; + std::cout << "inputs: " << inputs.size() << std::endl; + std::cout << "outputs: " << outputs.size() << std::endl; + std::cout << "nodes: " << nodes.size() << std::endl; + std::vector sorted; + auto handler = [&](Node* n) { + sorted.push_back(n); + }; + DFS(handler); + + for(int i=sorted.size()-1; i>=0; i--) { + std::cout << "Node: " << sorted[i]->name << std::endl; + for(int j=0; jinputs.size(); j++) { + std::cout << "\tInput: " << sorted[i]->inputs[j].node->name << " " << sorted[i]->inputs[j].entry << std::endl; + } + for(int j=0; joutputs.size(); j++) { + std::cout << "\tOutput: " << sorted[i]->outputs[j].node->name << " " << sorted[i]->outputs[j].entry << std::endl; + } + } + std::cout << "###############################" << std::endl; + } + + std::vector nodes; + std::vector inputs; + std::vector outputs; + std::map attrs; +}; + +// example Sam: https://gist.github.com/samskalicky/5f44e159e9f1b04237eed8d20e5d9f28 +MXReturnValue custom_pass(const std::string& in_graph, const std::string** out_graph, + const std::unordered_map& options, + const std::unordered_map& args, + const std::unordered_map& aux, + const PassResource& res) { + + for (auto kv : options) + std::cout << "option: " << kv.first << " ==> " << kv.second << std::endl; + + //convert graph from JSON string to Graph/Node data structure + Graph g = Graph::fromString(in_graph); + //g.print(); + + /////////////////////// AddBias + GELU ////////////////////////// + std::string str_ffn1 = "ffn_1_fwd"; + for(Node* n : g.nodes){ + if (n->name.find(str_ffn1) != std::string::npos) { + Node* node_ffn1_fwd = n; + Node* node_ffn1_bias = node_ffn1_fwd->inputs[2].node; + Node* node_gelu = node_ffn1_fwd->outputs[0].node; + + std::size_t pos = n->name.find("fwd"); + std::string base_name = n->name.substr(0,pos-1); + + // remove Bias terms in FC + node_ffn1_fwd->attrs["no_bias"]="True"; + node_ffn1_fwd->inputs.pop_back(); + + // create 2 expand_dims nodes to expand bias dimensions + Node* node_expand_1_bias = new Node(); + node_expand_1_bias->name = base_name + "_expand_1_bias"; + node_expand_1_bias->op = "expand_dims"; + node_expand_1_bias->attrs["axis"]="0"; + node_expand_1_bias->inputs.resize(1); + node_expand_1_bias->inputs[0].node = node_ffn1_bias; + node_expand_1_bias->inputs[0].entry = 0; + Node* node_expand_2_bias = new Node(); + node_expand_2_bias->name = base_name + "_expand_2_bias"; + node_expand_2_bias->op = "expand_dims"; + node_expand_2_bias->attrs["axis"]="0"; + node_expand_2_bias->inputs.resize(1); + node_expand_2_bias->inputs[0].node = node_expand_1_bias; + node_expand_2_bias->inputs[0].entry = 0; + g.nodes.push_back(node_expand_1_bias); + g.nodes.push_back(node_expand_2_bias); + + // create broadcast_like node + Node* node_bcst_like = new Node(); + node_bcst_like->name = base_name + "_broadcast_like"; + node_bcst_like->op = "broadcast_like";; + node_bcst_like->inputs.resize(2); + node_bcst_like->inputs[0].node = node_expand_2_bias; + 
node_bcst_like->inputs[0].entry = 0; + node_bcst_like->inputs[1].node = node_ffn1_fwd; + node_bcst_like->inputs[1].entry = 0; + g.nodes.push_back(node_bcst_like); + + // create BiasAdd Node + Node* node_add_bias = new Node(); + node_add_bias->name = base_name + "_add_bias"; + node_add_bias->op = "elemwise_add"; + node_add_bias->inputs.resize(2); + node_add_bias->inputs[0].node = node_ffn1_fwd; + node_add_bias->inputs[0].entry = 0; + node_add_bias->inputs[1].node = node_bcst_like; + node_add_bias->inputs[1].entry = 0; + g.nodes.push_back(node_add_bias); + + //set BiasAdd node as gelu input + node_gelu->inputs[0].node = node_add_bias; + node_gelu->inputs[0].entry = 0; + } + } + ///////////////////////////////////////////////////////////////// + + + //////////////// MHA remove reshapes & concat /////////////////// + // find shape of weight / bias, number of heads, and count number of MHA layers + std::string query0_weight = "bertencoder0_transformer0_dotproductselfattentioncell0_query_weight"; + std::string mult_qk0 = "bertencoder0_transformer0_dotproductselfattentioncell0_interleaved_matmul_selfatt_qk0"; + std::string str_projection = "_dotproductselfattentioncell0_fullyconnected0"; + int num_mha_layers = 0; + int num_heads = 0; + int head_dimension = 0; + int shape0, shape1; + for(Node* n : g.nodes){ + if (n->name.find(query0_weight) != std::string::npos) { + std::string shape = n->attrs["__shape__"]; + int pos_comma = shape.find(","); + shape0 = stoi(shape.substr(1, pos_comma-1)); + shape1 = stoi(shape.substr(pos_comma+2, shape.length()-pos_comma-3)); + } + if (n->name.find(mult_qk0) != std::string::npos) { + std::string h = n->attrs["heads"]; + num_heads = stoi(h); + } + if (n->name.find(str_projection) != std::string::npos) { + num_mha_layers++; + } + } + head_dimension = shape0 / num_heads; + + // find projection nodes and set new interleaved intputs + for(Node* n : g.nodes){ + if (n->name.find("_dotproductselfattentioncell0_fullyconnected0") != std::string::npos) { + Node* node_projection = n; + std::size_t pos = node_projection->name.find("_fullyconnected0"); + std::string base_name = n->name.substr(0,pos); + + //////////////////// WEIGHTS //////////////////// + // create new arg with interleaved weights + std::string name_qkv_weights_interleaved = base_name + "_qkv_weights_interleaved"; + MXTensor* qkv_weights_interleaved = res.alloc_arg(name_qkv_weights_interleaved, {3*shape0,shape1}, MXContext::CPU(0), kFloat32); + float* qkv_w_data = qkv_weights_interleaved->data(); + // read from previous values and interleave them + MXTensor query_w = args.at(base_name+"_query_weight"); + MXTensor key_w = args.at(base_name+"_key_weight"); + MXTensor value_w = args.at(base_name+"_value_weight"); + float* query_w_data = query_w.data(); + float* key_w_data = key_w.data(); + float* value_w_data = value_w.data(); + for(int h=0; hname = name_qkv_weights_interleaved; + node_qkv_weights->op = "null"; + //add a new node in graph, also as input + g.nodes.push_back(node_qkv_weights); + g.inputs.push_back(node_qkv_weights); + // set connection with new input + node_projection->inputs[1].node = node_qkv_weights; + node_projection->inputs[1].entry = 0; + + //////////////////// BIAS //////////////////// + // create new arg with all bias + std::string name_qkv_bias = base_name + "_qkv_bias"; + MXTensor* qkv_bias = res.alloc_arg(name_qkv_bias, {3*shape0,}, MXContext::CPU(0), kFloat32); + float* qkv_bias_data = qkv_bias->data(); + // read from previous values and join them + MXTensor query_bias = 
args.at(base_name+"_query_bias"); + MXTensor key_bias = args.at(base_name+"_key_bias"); + MXTensor value_bias = args.at(base_name+"_value_bias"); + float* query_bias_data = query_bias.data(); + float* key_bias_data = key_bias.data(); + float* value_bias_data = value_bias.data(); + for(int e=0; ename = name_qkv_bias; + node_qkv_bias->op = "null"; + //add a new node in graph, also as input + g.nodes.push_back(node_qkv_bias); + g.inputs.push_back(node_qkv_bias); + // set connection with new input + node_projection->inputs[2].node = node_qkv_bias; + node_projection->inputs[2].entry = 0; + } + } + ////////////////////////////////////////////////////////////////// + + //convert back to JSON string from Graph/Node + *out_graph = new std::string(g.toString()); + return MX_SUCCESS; + +} + + +REGISTER_PASS(custom_pass) +.setBody(custom_pass); + +MXReturnValue initialize(int version) { + if (version >= 10400) { + std::cout << "MXNet version " << version << " supported" << std::endl; + return MX_SUCCESS; + } else { + std::cout << "MXNet version " << version << " not supported" << std::endl; + return MX_FAIL; + } +} diff --git a/scripts/bert/deploy.py b/scripts/bert/deploy.py new file mode 100644 index 0000000000..8978e0c120 --- /dev/null +++ b/scripts/bert/deploy.py @@ -0,0 +1,702 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint:disable=redefined-outer-name,logging-format-interpolation +""" +Export and deploy the BERT Model for Deployment (testing with Validation datasets) +==================================== + +This script exports the BERT model to a hybrid model serialized as a symbol.json file, +which is suitable for deployment, or use with MXNet Module API. 
+ +@article{devlin2018bert, + title={BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding}, + author={Devlin, Jacob and Chang, Ming- \ + Wei and Lee, Kenton and Toutanova, Kristina}, + journal={arXiv preprint arXiv:1810.04805}, + year={2018} +} +""" + +import argparse +import collections +import json +import logging +import warnings +from functools import partial +import os +import io +import time + +import mxnet as mx +import gluonnlp as nlp +from gluonnlp.model import get_model, BERTClassifier +from gluonnlp.data import SQuAD +from gluonnlp.data.classification import get_task +from model.qa import BertForQA +from finetune_squad import preprocess_dataset as qa_preprocess_data +from finetune_classifier import convert_examples_to_features as classifier_examples2features +from bert_qa_evaluate import get_F1_EM, predict, PredResult + +nlp.utils.check_version('0.9') + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Export hybrid BERT base model.') + + parser.add_argument('--model_parameters', + type=str, + default=None, + help='The model parameter file saved from training.') + + parser.add_argument('--bert_model', + type=str, + default='bert_12_768_12', + choices=['bert_12_768_12', 'bert_24_1024_16'], + help='BERT model name. Options are "bert_12_768_12" and "bert_24_1024_16"') + + parser.add_argument('--task', + type=str, + choices=['QA', 'embedding', 'MRPC', 'QQP', 'QNLI', 'RTE', 'STS-B', 'CoLA', + 'MNLI', 'WNLI', 'SST', 'XNLI', 'LCQMC', 'ChnSentiCorp'], + help='In Classification:' + 'The name of the task to fine-tune. Choices are QA, embedding, ' + 'MRPC, QQP, QNLI, RTE, STS-B, CoLA, MNLI, WNLI, SST, XNLI, LCQMC, ' + 'ChnSentiCorp') + + parser.add_argument('--dataset_name', + type=str, + default='book_corpus_wiki_en_uncased', + choices=['book_corpus_wiki_en_uncased', 'book_corpus_wiki_en_cased', + 'wiki_multilingual_uncased', 'wiki_multilingual_cased', + 'wiki_cn_cased'], + help='BERT dataset name. Options include ' + '"book_corpus_wiki_en_uncased", "book_corpus_wiki_en_cased", ' + '"wiki_multilingual_uncased", "wiki_multilingual_cased", ' + '"wiki_cn_cased"') + + parser.add_argument('--output_dir', + type=str, + default='./output_dir', + help='The directory where the exported model symbol will be created. ' + 'The default is ./output_dir') + + parser.add_argument('--exported_model', + type=str, + default=None, + help='Prefix path of exported model:' + 'Should be prefix for -symbol.json / -0000.params files') + + parser.add_argument('--test_batch_size', + type=int, + default=128, + help='Test batch size. default is 128') + + parser.add_argument('--seq_length', + type=int, + default=128, + help='The maximum total input sequence length after WordPiece tokenization.' + 'Sequences longer than this needs to be truncated, and sequences ' + 'shorter than this needs to be padded. Default is 128') + + parser.add_argument('--dropout', + type=float, + default=0.1, + help='The dropout probability for the classification/regression head.') + + parser.add_argument('--gpu', + type=int, + default=None, + help='Id of the gpu to use. Set it to empty means to use cpu.') + + parser.add_argument('--only_infer', + action='store_true', + help='if set, it does not export the model again.') + + parser.add_argument('--dtype', + type=str, + default='float32', + help='Data type used for training. 
Either float32 or float16') + + parser.add_argument('--custom_pass', + type=str, + default=None, + help='Specify a custom graph pass for the network (library),' + 'allowing to customize the graph') + + parser.add_argument('--max_iters', + type=int, + default=None, + help='If set, it runs the maximum number of iterations specified') + + parser.add_argument('--check_accuracy', + action='store_true', + help='If set, it will check accuracy') + + # Specific for QA + parser.add_argument('--QA_version_2', + action='store_true', + help='In Question-Answering:' + 'SQuAD examples whether contain some that do not have an answer.') + + parser.add_argument('--QA_n_best_size', + type=int, + default=20, + help='In Question-Answering:' + 'The total number of n-best predictions to generate in the ' + 'nbest_predictions.json output file. default is 20') + + parser.add_argument('--QA_max_answer_length', + type=int, + default=30, + help='In Question-Answering:' + 'The maximum length of an answer that can be generated. This is needed ' + 'because the start and end predictions are not conditioned on one another.' + ' default is 30') + + parser.add_argument('--QA_doc_stride', + type=int, + default=128, + help='In Question-Answering:' + 'When splitting up a long document into chunks, how much stride to ' + 'take between chunks. default is 128') + + parser.add_argument('--QA_max_query_length', + type=int, + default=64, + help='In Question-Answering:' + 'The maximum number of tokens for the question. Questions longer than ' + 'this will be truncated to this length. default is 64') + + parser.add_argument('--QA_null_score_diff_threshold', + type=float, + default=0.0, + help='In Question-Answering:' + 'If null_score - best_non_null is greater than the threshold predict null.' + 'Typical values are between -1.0 and -5.0. 
default is 0.0') + + # specific for embedding + parser.add_argument('--oov_way', type=str, default='avg', + help='how to handle subword embeddings\n' + 'avg: average all subword embeddings to represent the original token\n' + 'sum: sum all subword embeddings to represent the original token\n' + 'last: use last subword embeddings to represent the original token\n') + + args = parser.parse_args() + + # create output dir + output_dir = args.output_dir + nlp.utils.mkdir(output_dir) + + #set context and type + if args.gpu is not None: + ctx = mx.gpu(args.gpu) + else: + ctx = mx.cpu() + dtype = args.dtype + + ############################################################################### + # Logging # + ############################################################################### + + log = logging.getLogger('gluonnlp') + log.setLevel(logging.DEBUG) + formatter = logging.Formatter(fmt='%(levelname)s:%(name)s:%(asctime)s %(message)s', + datefmt='%H:%M:%S') + fh = logging.FileHandler(os.path.join(args.output_dir, 'hybrid_export_bert.log'), mode='w') + fh.setLevel(logging.INFO) + fh.setFormatter(formatter) + console = logging.StreamHandler() + console.setLevel(logging.INFO) + console.setFormatter(formatter) + log.addHandler(console) + log.addHandler(fh) + log.info(args) + + ############################################################################### + # Hybridize the model # + ############################################################################### + export_ctx = mx.cpu() + seq_length = args.seq_length + do_lower_case = 'uncased' in args.dataset_name + + if args.task == 'QA': + bert, vocab = get_model( + name=args.bert_model, + dataset_name=args.dataset_name, + pretrained=False, + use_pooler=False, + use_decoder=False, + use_classifier=False, + ctx=export_ctx) + net = BertForQA(bert) + elif args.task == 'embedding': + bert, vocab = get_model( + name=args.bert_model, + dataset_name=args.dataset_name, + pretrained=True, + use_pooler=False, + use_decoder=False, + use_classifier=False, + ctx=export_ctx) + net = bert + else: + specific_task = get_task(args.task) + do_regression = not specific_task.class_labels + if do_regression: + bert, vocab = get_model( + name=args.bert_model, + dataset_name=args.dataset_name, + pretrained=False, + use_pooler=True, + use_decoder=False, + use_classifier=False, + ctx=export_ctx) + net = BERTClassifier(bert, num_classes=1, dropout=args.dropout) + else: + # classification task + bert, vocab = get_model( + name=args.bert_model, + dataset_name=args.dataset_name, + pretrained=False, + use_pooler=True, + use_decoder=False, + use_classifier=False) + num_classes = len(specific_task.class_labels) + net = BERTClassifier(bert, num_classes=num_classes, dropout=args.dropout) + + if args.model_parameters and args.task != 'embedding': + net.load_parameters(args.model_parameters, ctx=export_ctx) + elif args.task != 'embedding': + net.initialize(ctx=export_ctx) + warnings.warn('--model_parameters is not provided. The parameter checkpoint (.params) ' + 'file will be created based on default parameter initialization.') + + net.hybridize(static_alloc=True, static_shape=True) + test_batch_size = args.test_batch_size + +############################################################################### +# Export the model # +############################################################################### +def export(prefix): + """Export the model.""" + log.info('Exporting the model ... 
') + + # dummy input data + inputs = mx.nd.arange(test_batch_size * seq_length) + inputs = inputs.reshape(shape=(test_batch_size, seq_length)) + token_types = mx.nd.zeros_like(inputs) + valid_length = mx.nd.arange(test_batch_size) + batch = inputs, token_types, valid_length + inputs, token_types, valid_length = batch + + net(inputs.as_in_context(export_ctx), + token_types.as_in_context(export_ctx), + valid_length.as_in_context(export_ctx)) + net.export(prefix, epoch=0) + assert os.path.isfile(prefix + '-symbol.json') + assert os.path.isfile(prefix + '-0000.params') + + if args.custom_pass is not None: + # load library + libpath = os.path.abspath(args.custom_pass) + mx.library.load(libpath) + sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, 0) + + arg_array = arg_params + arg_array['data0'] = mx.nd.ones((test_batch_size, seq_length), dtype='float32') + arg_array['data1'] = mx.nd.ones((test_batch_size, seq_length), dtype='float32') + arg_array['data2'] = mx.nd.ones((test_batch_size, ), dtype='float32') + custom_sym = sym.optimize_for('custom_pass', arg_array, aux_params) + + nheads = 12 + if args.bert_model == 'bert_24_1024_16': + nheads = 24 + for i in range(nheads): + basename = 'bertencoder0_transformer' + str(i) + '_dotproductselfattentioncell0' + arg_array.pop(basename + '_query_weight') + arg_array.pop(basename + '_key_weight') + arg_array.pop(basename + '_value_weight') + arg_array.pop(basename + '_query_bias') + arg_array.pop(basename + '_key_bias') + arg_array.pop(basename + '_value_bias') + arg_array.pop('data0') + arg_array.pop('data1') + arg_array.pop('data2') + + mx.model.save_checkpoint(prefix, 0, custom_sym, arg_params, aux_params) + +# Function to preprocess dataset to test, which depends on the task +def preprocess_data(tokenizer, task): + """Preprocess dataset to test.""" + log.info('Loading dev data...') + if task == 'QA': + # question_answering + batchify_fn = nlp.data.batchify.Tuple( + nlp.data.batchify.Stack(), + nlp.data.batchify.Pad(axis=0, pad_val=vocab[vocab.padding_token], round_to=seq_length), + nlp.data.batchify.Pad(axis=0, pad_val=vocab[vocab.padding_token], round_to=seq_length), + nlp.data.batchify.Stack('float32'), + nlp.data.batchify.Stack('float32'), + nlp.data.batchify.Stack('float32')) + if args.QA_version_2: + dev_data = SQuAD('dev', version='2.0') + else: + dev_data = SQuAD('dev', version='1.1') + dev_dataset = qa_preprocess_data(tokenizer, + dev_data, + max_seq_length=seq_length, + doc_stride=args.QA_doc_stride, + max_query_length=args.QA_max_query_length, + input_features=False) + dev_data_transform = qa_preprocess_data(tokenizer, + dev_data, + max_seq_length=seq_length, + doc_stride=args.QA_doc_stride, + max_query_length=args.QA_max_query_length, + input_features=True) + dev_dataloader = mx.gluon.data.DataLoader(dev_data_transform, + batchify_fn=batchify_fn, + num_workers=4, + batch_size=test_batch_size, + shuffle=False, + last_batch='keep') + return dev_dataloader, len(dev_data_transform), dev_dataset + + else: + # classification / regression + classification_task = get_task(task) + + label_dtype = 'int32' if classification_task.class_labels else 'float32' + truncate_length = seq_length - 3 if classification_task.is_pair else seq_length - 2 + trans = partial(classifier_examples2features, tokenizer=tokenizer, + truncate_length=truncate_length, + cls_token=vocab.cls_token, + sep_token=vocab.sep_token, + class_labels=classification_task.class_labels, + label_alias=classification_task.label_alias, vocab=vocab) + + batchify_fn = 
nlp.data.batchify.Tuple( + nlp.data.batchify.Pad(axis=0, pad_val=vocab[vocab.padding_token], + round_to=seq_length), # input + nlp.data.batchify.Pad(axis=0, pad_val=0, round_to=seq_length), # segment + nlp.data.batchify.Stack(), # length + nlp.data.batchify.Stack(label_dtype)) # label + + # data dev. For MNLI, more than one dev set is available + dev_tsv = classification_task.dataset_dev() + dev_tsv_list = dev_tsv if isinstance(dev_tsv, list) else [dev_tsv] + loader_dev_list = [] + nsamples = 0 + for segment, data in dev_tsv_list: + data_dev = mx.gluon.data.SimpleDataset(list(map(trans, data))) + nsamples = nsamples + len(data_dev) + loader_dev = mx.gluon.data.DataLoader(data_dev, + batchify_fn=batchify_fn, + num_workers=4, + batch_size=test_batch_size, + shuffle=False, + last_batch='keep') + loader_dev_list.append((segment, loader_dev)) + return loader_dev_list, nsamples, None + +# Function to calculate final accuracy and print it out. It also save predictions within a file +def compute_accuracy_save_results(task, all_results, SQuAD_dataset=None, segment=None, metric=None): + """Compute accuracy and save predictions.""" + all_predictions = collections.OrderedDict() + if task == 'QA': + assert SQuAD_dataset is not None + if args.QA_version_2: + dev_data = SQuAD('dev', version='2.0') + else: + dev_data = SQuAD('dev', version='1.1') + for features in SQuAD_dataset: + results = all_results[features[0].example_id] + example_qas_id = features[0].qas_id + prediction, _ = predict( + features=features, + results=results, + tokenizer=nlp.data.BERTBasicTokenizer(lower=do_lower_case), + max_answer_length=args.QA_max_answer_length, + null_score_diff_threshold=args.QA_null_score_diff_threshold, + n_best_size=args.QA_n_best_size, + version_2=args.QA_version_2) + all_predictions[example_qas_id] = prediction + if args.QA_version_2: + log.info('Please run evaluate-v2.0.py to get evaluation results for SQuAD 2.0') + else: + F1_EM = get_F1_EM(dev_data, all_predictions) + log.info(F1_EM) + # save results + with io.open(os.path.join(output_dir, task + '-predictions.json'), + 'w', encoding='utf-8') as fout: + data = json.dumps(all_predictions, ensure_ascii=False) + fout.write(data) + + elif task == 'embedding': + final_results = [] + padding_idx, cls_idx, sep_idx = None, None, None + if vocab.padding_token: + padding_idx = vocab[vocab.padding_token] + if vocab.cls_token: + cls_idx = vocab[vocab.cls_token] + if vocab.sep_token: + sep_idx = vocab[vocab.sep_token] + for token_ids, sequence_outputs in all_results: + token_ids = token_ids.astype(int) + tokens = [] + tensors = [] + oov_len = 1 + for token_id, sequence_output in zip(token_ids, sequence_outputs): + # [PAD] token, sequence is finished. 
+ if padding_idx and token_id == padding_idx: + break + # [CLS], [SEP] + if cls_idx and token_id == cls_idx: + continue + if sep_idx and token_id == sep_idx: + continue + token = vocab.idx_to_token[token_id] + tokenizer = nlp.data.BERTTokenizer(vocab, lower=do_lower_case) + if not tokenizer.is_first_subword(token): + tokens.append(token) + if args.oov_way == 'last': + tensors[-1] = sequence_output + else: + tensors[-1] += sequence_output + if args.oov_way == 'avg': + oov_len += 1 + else: # iv, avg last oov + if oov_len > 1: + tensors[-1] /= oov_len + oov_len = 1 + tokens.append(token) + tensors.append(sequence_output) + if oov_len > 1: # if the whole sentence is one oov, handle this special case + tensors[-1] /= oov_len + final_results.append((tokens, tensors)) + + with io.open(os.path.join(output_dir, task + '-output.tsv'), + 'w', encoding='utf-8') as fout: + for embeddings in final_results: + sent, tokens_embedding = embeddings + fout.write(u'Text: \t%s\n' % (str(sent))) + fout.write(u'Tokens embedding: \t%s\n\n' % (str(tokens_embedding))) + + else: + # classification / regression + assert segment is not None + assert metric is not None + specific_task = get_task(task) + metric_nm, metric_val = metric.get() + if not isinstance(metric_nm, list): + metric_nm, metric_val = [metric_nm], [metric_val] + metric_str = 'validation metrics: ' + ', '.join([i + ':%.4f' for i in metric_nm]) + log.info(metric_str, *metric_val) + # save results + final_results = [] + if not specific_task.class_labels: + # regression task + for result in all_results: + for probs in result.asnumpy().reshape(-1).tolist(): + final_results.append('{:.3f}'.format(probs)) + else: + # classification task + for result in all_results: + indices = mx.nd.topk(result, k=1, ret_typ='indices', dtype='int32').asnumpy() + for index in indices: + final_results.append(specific_task.class_labels[int(index)]) + with io.open(os.path.join(output_dir, task + '-' + segment + '-predictions.tsv'), + 'w', encoding='utf-8') as fout: + fout.write(u'index\tprediction\n') + for i, pred in enumerate(final_results): + fout.write(u'%d\t%s\n\n' % (i, str(pred))) + +############################################################################### +# Perform inference # +############################################################################### +def infer(prefix, task): + """Perform inference.""" + assert os.path.isfile(prefix + '-symbol.json') + assert os.path.isfile(prefix + '-0000.params') + + # import with SymbolBlock. Alternatively, you can use Module.load APIs. + imported_net = mx.gluon.nn.SymbolBlock.imports(prefix + '-symbol.json', + ['data0', 'data1', 'data2'], + prefix + '-0000.params', + ctx=ctx) + imported_net.hybridize(static_alloc=True, static_shape=True) + if dtype == 'float16': + imported_net.cast('float16') + tokenizer = nlp.data.BERTTokenizer(vocab=vocab, lower=do_lower_case) + + num_warmup = 2 + + if task == 'QA': + dataloader, _, SQuAD_dataset = preprocess_data(tokenizer, task) + # run warmup iterations + for data in dataloader: + example_ids, token_ids, token_types, valid_length, _, _ = data + out = imported_net(token_ids.as_in_context(ctx), + token_types.as_in_context(ctx), + valid_length.as_in_context(ctx).astype(dtype)) + output = mx.nd.split(out, axis=2, num_outputs=2) + pred_start = output[0].reshape((0, -3)).asnumpy() + pred_end = output[1].reshape((0, -3)).asnumpy() + num_warmup -= 1 + if not num_warmup: + break + # run forward inference + log.info('Start inference ... 
') + total_iters = 0 + total_samples = 0 + total_latency_time = 0.0 + all_results = collections.defaultdict(list) + tic = time.time() + for data in dataloader: + example_ids, token_ids, token_types, valid_length, _, _ = data + tic_latency = time.time() + out = imported_net(token_ids.as_in_context(ctx), + token_types.as_in_context(ctx), + valid_length.as_in_context(ctx).astype(dtype)) + output = mx.nd.split(out, axis=2, num_outputs=2) + pred_start = output[0].reshape((0, -3)).asnumpy() + pred_end = output[1].reshape((0, -3)).asnumpy() + toc_latency = time.time() + total_latency_time += (toc_latency - tic_latency) + total_iters += 1 + total_samples += len(token_ids) + if args.check_accuracy: + example_ids = example_ids.asnumpy().tolist() + for example_id, start, end in zip(example_ids, pred_start, pred_end): + all_results[example_id].append(PredResult(start=start, end=end)) + if args.max_iters and total_iters >= args.max_iters: + break + mx.nd.waitall() + toc = time.time() + log.info('BatchSize={}, NumberIterations={}: '.format(test_batch_size, total_iters)) + log.info('Throughput={:.2f} samples/s, Average Latency={:.4f} ms' + .format(total_samples / (toc - tic), (total_latency_time / total_iters) * 1000)) + if args.check_accuracy: + compute_accuracy_save_results(task, all_results, SQuAD_dataset=SQuAD_dataset) + + elif task == 'embedding': + # Uses SST dataset as example + dataloader_list, _, _ = preprocess_data(tokenizer, 'SST') + _, dataloader = dataloader_list[0] + # run warmup iterations + for data in dataloader: + token_ids, token_types, valid_length, _ = data + sequence_outputs = imported_net(token_ids.as_in_context(ctx), + token_types.as_in_context(ctx), + valid_length.as_in_context(ctx).astype(dtype)) + sequence_outputs.asnumpy() + num_warmup -= 1 + if not num_warmup: + break + # run forward inference + log.info('Start inference ... 
') + total_iters = 0 + total_samples = 0 + total_latency_time = 0.0 + all_results = [] + tic = time.time() + for data in dataloader: + token_ids, token_types, valid_length, _ = data + tic_latency = time.time() + sequence_outputs = imported_net(token_ids.as_in_context(ctx), + token_types.as_in_context(ctx), + valid_length.as_in_context(ctx).astype(dtype)) + sequence_outputs.asnumpy() + toc_latency = time.time() + total_latency_time += (toc_latency - tic_latency) + total_iters += 1 + total_samples += len(token_ids) + if args.check_accuracy: + for token_id, sequence_output in zip(token_ids.asnumpy(), + sequence_outputs.asnumpy()): + all_results.append((token_id, sequence_output)) + if args.max_iters and total_iters >= args.max_iters: + break + mx.nd.waitall() + toc = time.time() + log.info('BatchSize={}, NumberIterations={}: '.format(test_batch_size, total_iters)) + log.info('Throughput={:.2f} samples/s, Average Latency={:.4f} ms' + .format(total_samples / (toc - tic), (total_latency_time / total_iters) * 1000)) + if args.check_accuracy: + compute_accuracy_save_results(task, all_results) + + else: + # classification / regression task + dataloader_list, _, _ = preprocess_data(tokenizer, task) + specific_task = get_task(task) + metric = specific_task.metrics + # run warmup iterations + _, dataloader = dataloader_list[0] + for data in dataloader: + token_ids, token_types, valid_length, label = data + out = imported_net(token_ids.as_in_context(ctx), + token_types.as_in_context(ctx), + valid_length.as_in_context(ctx).astype(dtype)) + out.asnumpy() + num_warmup -= 1 + if not num_warmup: + break + # run forward inference + for segment, dataloader in dataloader_list: + log.info('Start inference ... ') + total_iters = 0 + total_samples = 0 + total_latency_time = 0.0 + all_results = [] + metric.reset() + tic = time.time() + for data in dataloader: + token_ids, token_types, valid_length, label = data + label = label.as_in_context(ctx) + tic_latency = time.time() + out = imported_net(token_ids.as_in_context(ctx), + token_types.as_in_context(ctx), + valid_length.as_in_context(ctx).astype(dtype)) + out.asnumpy() + toc_latency = time.time() + total_latency_time += (toc_latency - tic_latency) + total_iters += 1 + total_samples += len(token_ids) + if args.check_accuracy: + if not do_regression: + label = label.reshape((-1)) + metric.update([label], [out]) + all_results.append(out) + if args.max_iters and total_iters >= args.max_iters: + break + mx.nd.waitall() + toc = time.time() + log.info('Segment {}'.format(segment)) + log.info('BatchSize={}, NumberIterations={}: '.format(test_batch_size, total_iters)) + log.info('Throughput={:.2f} samples/s, Average Latency={:.4f} ms' + .format(total_samples / (toc - tic), + (total_latency_time / total_iters) * 1000)) + if args.check_accuracy: + compute_accuracy_save_results(task, all_results, segment=segment, metric=metric) + +if __name__ == '__main__': + if args.exported_model: + prefix = args.exported_model + else: + prefix = os.path.join(args.output_dir, args.task) + if not args.only_infer: + export(prefix) + infer(prefix, args.task) diff --git a/scripts/bert/finetune_classifier.py b/scripts/bert/finetune_classifier.py index 8a400fb8b9..156c1e0f31 100644 --- a/scripts/bert/finetune_classifier.py +++ b/scripts/bert/finetune_classifier.py @@ -53,267 +53,268 @@ nlp.utils.check_version('0.9', warning_only=True) -parser = argparse.ArgumentParser( - description='BERT fine-tune examples for classification/regression tasks.', - 
formatter_class=argparse.ArgumentDefaultsHelpFormatter) - -parser.add_argument('--optimizer', type=str, default='bertadam', - help='The optimizer to be used for training') -parser.add_argument('--epochs', type=int, default=3, help='number of epochs.') -parser.add_argument( - '--training_steps', type=int, help='The total training steps. ' - 'Note that if specified, epochs will be ignored.') -parser.add_argument( - '--batch_size', - type=int, - default=32, - help='Batch size. Number of examples per gpu in a minibatch.') -parser.add_argument( - '--dev_batch_size', - type=int, - default=8, - help='Batch size for dev set and test set') -parser.add_argument( - '--lr', - type=float, - default=3e-5, - help='Initial learning rate') -parser.add_argument( - '--epsilon', - type=float, - default=1e-6, - help='Small value to avoid division by 0' -) -parser.add_argument( - '--warmup_ratio', - type=float, - default=0.1, - help='ratio of warmup steps used in NOAM\'s stepsize schedule') -parser.add_argument( - '--log_interval', - type=int, - default=10, - help='report interval') -parser.add_argument( - '--max_len', - type=int, - default=128, - help='Maximum length of the sentence pairs') -parser.add_argument( - '--round_to', type=int, default=None, - help='The length of padded sequences will be rounded up to be multiple of this argument.' - 'When round to is set to 8, training throughput may increase for mixed precision' - 'training on GPUs with tensorcores.') -parser.add_argument( - '--seed', type=int, default=2, help='Random seed') -parser.add_argument( - '--accumulate', - type=int, - default=None, - help='The number of batches for gradients accumulation to simulate large batch size. ' - 'Default is None') -parser.add_argument( - '--gpu', type=int, default=None, help='Which gpu for finetuning.') -parser.add_argument( - '--task_name', - type=str, - choices=['MRPC', 'QNLI', 'RTE', 'STS-B', 'CoLA', - 'MNLI', 'WNLI', 'SST', 'XNLI', 'LCQMC', 'ChnSentiCorp'], - help='The name of the task to fine-tune. Choices include MRPC, QQP, ' - 'QNLI, RTE, STS-B, CoLA, MNLI, WNLI, SST.') -parser.add_argument( - '--bert_model', - type=str, - default='bert_12_768_12', - choices=['bert_12_768_12', 'bert_24_1024_16', 'roberta_12_768_12', 'roberta_24_1024_16'], - help='The name of pre-trained BERT model to fine-tune') -parser.add_argument( - '--bert_dataset', - type=str, - default='book_corpus_wiki_en_uncased', - choices=['book_corpus_wiki_en_uncased', 'book_corpus_wiki_en_cased', - 'openwebtext_book_corpus_wiki_en_uncased', 'wiki_multilingual_uncased', - 'wiki_multilingual_cased', 'wiki_cn_cased', - 'openwebtext_ccnews_stories_books_cased'], - help='The dataset BERT pre-trained with.') -parser.add_argument( - '--pretrained_bert_parameters', - type=str, - default=None, - help='Pre-trained bert model parameter file.') -parser.add_argument( - '--model_parameters', - type=str, - default=None, - help='A parameter file for the model that is loaded into the model' - ' before training/inference. 
It is different from the parameter' - ' file written after the model is trained.') -parser.add_argument( - '--output_dir', - type=str, - default='./output_dir', - help='The output directory where the model params will be written.') -parser.add_argument( - '--only_inference', - action='store_true', - help='If set, we skip training and only perform inference on dev and test data.') -parser.add_argument( - '--dtype', - type=str, - default='float32', - choices=['float32', 'float16'], - help='The data type for training.') -parser.add_argument( - '--early_stop', - type=int, - default=None, - help='Whether to perform early stopping based on the metric on dev set. ' - 'The provided value is the patience. ') -parser.add_argument('--deploy', action='store_true', - help='whether load static model for deployment') -parser.add_argument('--model_prefix', type=str, required=False, - help='load static model as hybridblock.') -parser.add_argument('--only_calibration', action='store_true', - help='quantize model') -parser.add_argument('--num_calib_batches', type=int, default=5, - help='number of batches for calibration') -parser.add_argument('--quantized_dtype', type=str, default='auto', - choices=['auto', 'int8', 'uint8'], - help='quantization destination data type for input data') -parser.add_argument('--calib_mode', type=str, default='customize', - choices=['none', 'naive', 'entropy', 'customize'], - help='calibration mode used for generating calibration table ' - 'for the quantized symbol.') - -args = parser.parse_args() - - -log = logging.getLogger() -log.setLevel(logging.INFO) - -logging.captureWarnings(True) -fh = logging.FileHandler('log_{0}.txt'.format(args.task_name)) -formatter = logging.Formatter(fmt='%(levelname)s:%(name)s:%(asctime)s %(message)s', - datefmt='%H:%M:%S') -fh.setLevel(logging.INFO) -fh.setFormatter(formatter) -console = logging.StreamHandler() -console.setLevel(logging.INFO) -console.setFormatter(formatter) -log.addHandler(console) -log.addHandler(fh) -logging.info(args) - -batch_size = args.batch_size -dev_batch_size = args.dev_batch_size -task_name = args.task_name -lr = args.lr -epsilon = args.epsilon -accumulate = args.accumulate -log_interval = args.log_interval * accumulate if accumulate else args.log_interval -if accumulate: - logging.info('Using gradient accumulation. Effective batch size = ' \ - 'batch_size * accumulate = %d', accumulate * batch_size) - -# random seed -np.random.seed(args.seed) -random.seed(args.seed) -mx.random.seed(args.seed) - -ctx = mx.cpu() if args.gpu is None else mx.gpu(args.gpu) - -task = get_task(task_name) - -# data type with mixed precision training -if args.dtype == 'float16': - amp.init() - -# model and loss -only_inference = args.only_inference -model_name = args.bert_model -dataset = args.bert_dataset -pretrained_bert_parameters = args.pretrained_bert_parameters -model_parameters = args.model_parameters - -# load symbolic model -deploy = args.deploy -model_prefix = args.model_prefix - -if only_inference and not model_parameters: - warnings.warn('model_parameters is not set. 
' - 'Randomly initialized model will be used for inference.') - -get_pretrained = not (pretrained_bert_parameters is not None or model_parameters is not None) - -use_roberta = 'roberta' in model_name -get_model_params = { - 'name': model_name, - 'dataset_name': dataset, - 'pretrained': get_pretrained, - 'ctx': ctx, - 'use_decoder': False, - 'use_classifier': False, -} -# RoBERTa does not contain parameters for sentence pair classification -if not use_roberta: - get_model_params['use_pooler'] = True - -bert, vocabulary = nlp.model.get_model(**get_model_params) - -# initialize the rest of the parameters -initializer = mx.init.Normal(0.02) -# STS-B is a regression task. -# STSBTask().class_labels returns None -do_regression = not task.class_labels -if do_regression: - num_classes = 1 - loss_function = gluon.loss.L2Loss() -else: - num_classes = len(task.class_labels) - loss_function = gluon.loss.SoftmaxCELoss() -# reuse the BERTClassifier class with num_classes=1 for regression -if use_roberta: - model = RoBERTaClassifier(bert, dropout=0.0, num_classes=num_classes) -else: - model = BERTClassifier(bert, dropout=0.1, num_classes=num_classes) -# initialize classifier -if not model_parameters: - model.classifier.initialize(init=initializer, ctx=ctx) - -# load checkpointing -output_dir = args.output_dir -if pretrained_bert_parameters: - logging.info('loading bert params from %s', pretrained_bert_parameters) - nlp.utils.load_parameters(model.bert, pretrained_bert_parameters, ctx=ctx, ignore_extra=True, - cast_dtype=True) -if model_parameters: - logging.info('loading model params from %s', model_parameters) - nlp.utils.load_parameters(model, model_parameters, ctx=ctx, cast_dtype=True) -nlp.utils.mkdir(output_dir) - -logging.debug(model) -model.hybridize(static_alloc=True) -loss_function.hybridize(static_alloc=True) - -if deploy: - logging.info('load symbol file directly as SymbolBlock for model deployment') - model = mx.gluon.SymbolBlock.imports('{}-symbol.json'.format(args.model_prefix), - ['data0', 'data1', 'data2'], - '{}-0000.params'.format(args.model_prefix)) - model.hybridize(static_alloc=True, static_shape=True) - -# data processing -do_lower_case = 'uncased' in dataset -if use_roberta: - bert_tokenizer = nlp.data.GPT2BPETokenizer() -else: - bert_tokenizer = BERTTokenizer(vocabulary, lower=do_lower_case) - -# calibration config -only_calibration = args.only_calibration -num_calib_batches = args.num_calib_batches -quantized_dtype = args.quantized_dtype -calib_mode = args.calib_mode +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description='BERT fine-tune examples for classification/regression tasks.', + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + + parser.add_argument('--optimizer', type=str, default='bertadam', + help='The optimizer to be used for training') + parser.add_argument('--epochs', type=int, default=3, help='number of epochs.') + parser.add_argument( + '--training_steps', type=int, help='The total training steps. ' + 'Note that if specified, epochs will be ignored.') + parser.add_argument( + '--batch_size', + type=int, + default=32, + help='Batch size. 
Number of examples per gpu in a minibatch.') + parser.add_argument( + '--dev_batch_size', + type=int, + default=8, + help='Batch size for dev set and test set') + parser.add_argument( + '--lr', + type=float, + default=3e-5, + help='Initial learning rate') + parser.add_argument( + '--epsilon', + type=float, + default=1e-6, + help='Small value to avoid division by 0' + ) + parser.add_argument( + '--warmup_ratio', + type=float, + default=0.1, + help='ratio of warmup steps used in NOAM\'s stepsize schedule') + parser.add_argument( + '--log_interval', + type=int, + default=10, + help='report interval') + parser.add_argument( + '--max_len', + type=int, + default=128, + help='Maximum length of the sentence pairs') + parser.add_argument( + '--round_to', type=int, default=None, + help='The length of padded sequences will be rounded up to be multiple of this argument.' + 'When round to is set to 8, training throughput may increase for mixed precision' + 'training on GPUs with tensorcores.') + parser.add_argument( + '--seed', type=int, default=2, help='Random seed') + parser.add_argument( + '--accumulate', + type=int, + default=None, + help='The number of batches for gradients accumulation to simulate large batch size. ' + 'Default is None') + parser.add_argument( + '--gpu', type=int, default=None, help='Which gpu for finetuning.') + parser.add_argument( + '--task_name', + type=str, + choices=['MRPC', 'QNLI', 'RTE', 'STS-B', 'CoLA', + 'MNLI', 'WNLI', 'SST', 'XNLI', 'LCQMC', 'ChnSentiCorp'], + help='The name of the task to fine-tune. Choices include MRPC, QQP, ' + 'QNLI, RTE, STS-B, CoLA, MNLI, WNLI, SST.') + parser.add_argument( + '--bert_model', + type=str, + default='bert_12_768_12', + choices=['bert_12_768_12', 'bert_24_1024_16', 'roberta_12_768_12', 'roberta_24_1024_16'], + help='The name of pre-trained BERT model to fine-tune') + parser.add_argument( + '--bert_dataset', + type=str, + default='book_corpus_wiki_en_uncased', + choices=['book_corpus_wiki_en_uncased', 'book_corpus_wiki_en_cased', + 'openwebtext_book_corpus_wiki_en_uncased', 'wiki_multilingual_uncased', + 'wiki_multilingual_cased', 'wiki_cn_cased', + 'openwebtext_ccnews_stories_books_cased'], + help='The dataset BERT pre-trained with.') + parser.add_argument( + '--pretrained_bert_parameters', + type=str, + default=None, + help='Pre-trained bert model parameter file.') + parser.add_argument( + '--model_parameters', + type=str, + default=None, + help='A parameter file for the model that is loaded into the model' + ' before training/inference. It is different from the parameter' + ' file written after the model is trained.') + parser.add_argument( + '--output_dir', + type=str, + default='./output_dir', + help='The output directory where the model params will be written.') + parser.add_argument( + '--only_inference', + action='store_true', + help='If set, we skip training and only perform inference on dev and test data.') + parser.add_argument( + '--dtype', + type=str, + default='float32', + choices=['float32', 'float16'], + help='The data type for training.') + parser.add_argument( + '--early_stop', + type=int, + default=None, + help='Whether to perform early stopping based on the metric on dev set. ' + 'The provided value is the patience. 
') + parser.add_argument('--deploy', action='store_true', + help='whether load static model for deployment') + parser.add_argument('--model_prefix', type=str, required=False, + help='load static model as hybridblock.') + parser.add_argument('--only_calibration', action='store_true', + help='quantize model') + parser.add_argument('--num_calib_batches', type=int, default=5, + help='number of batches for calibration') + parser.add_argument('--quantized_dtype', type=str, default='auto', + choices=['auto', 'int8', 'uint8'], + help='quantization destination data type for input data') + parser.add_argument('--calib_mode', type=str, default='customize', + choices=['none', 'naive', 'entropy', 'customize'], + help='calibration mode used for generating calibration table ' + 'for the quantized symbol.') + + args = parser.parse_args() + + + log = logging.getLogger() + log.setLevel(logging.INFO) + + logging.captureWarnings(True) + fh = logging.FileHandler('log_{0}.txt'.format(args.task_name)) + formatter = logging.Formatter(fmt='%(levelname)s:%(name)s:%(asctime)s %(message)s', + datefmt='%H:%M:%S') + fh.setLevel(logging.INFO) + fh.setFormatter(formatter) + console = logging.StreamHandler() + console.setLevel(logging.INFO) + console.setFormatter(formatter) + log.addHandler(console) + log.addHandler(fh) + logging.info(args) + + batch_size = args.batch_size + dev_batch_size = args.dev_batch_size + task_name = args.task_name + lr = args.lr + epsilon = args.epsilon + accumulate = args.accumulate + log_interval = args.log_interval * accumulate if accumulate else args.log_interval + if accumulate: + logging.info('Using gradient accumulation. Effective batch size = ' \ + 'batch_size * accumulate = %d', accumulate * batch_size) + + # random seed + np.random.seed(args.seed) + random.seed(args.seed) + mx.random.seed(args.seed) + + ctx = mx.cpu() if args.gpu is None else mx.gpu(args.gpu) + + task = get_task(task_name) + + # data type with mixed precision training + if args.dtype == 'float16': + amp.init() + + # model and loss + only_inference = args.only_inference + model_name = args.bert_model + dataset = args.bert_dataset + pretrained_bert_parameters = args.pretrained_bert_parameters + model_parameters = args.model_parameters + + # load symbolic model + deploy = args.deploy + model_prefix = args.model_prefix + + if only_inference and not model_parameters: + warnings.warn('model_parameters is not set. ' + 'Randomly initialized model will be used for inference.') + + get_pretrained = not (pretrained_bert_parameters is not None or model_parameters is not None) + + use_roberta = 'roberta' in model_name + get_model_params = { + 'name': model_name, + 'dataset_name': dataset, + 'pretrained': get_pretrained, + 'ctx': ctx, + 'use_decoder': False, + 'use_classifier': False, + } + # RoBERTa does not contain parameters for sentence pair classification + if not use_roberta: + get_model_params['use_pooler'] = True + + bert, vocabulary = nlp.model.get_model(**get_model_params) + + # initialize the rest of the parameters + initializer = mx.init.Normal(0.02) + # STS-B is a regression task. 
+ # STSBTask().class_labels returns None + do_regression = not task.class_labels + if do_regression: + num_classes = 1 + loss_function = gluon.loss.L2Loss() + else: + num_classes = len(task.class_labels) + loss_function = gluon.loss.SoftmaxCELoss() + # reuse the BERTClassifier class with num_classes=1 for regression + if use_roberta: + model = RoBERTaClassifier(bert, dropout=0.0, num_classes=num_classes) + else: + model = BERTClassifier(bert, dropout=0.1, num_classes=num_classes) + # initialize classifier + if not model_parameters: + model.classifier.initialize(init=initializer, ctx=ctx) + + # load checkpointing + output_dir = args.output_dir + if pretrained_bert_parameters: + logging.info('loading bert params from %s', pretrained_bert_parameters) + nlp.utils.load_parameters(model.bert, pretrained_bert_parameters, ctx=ctx, + ignore_extra=True, cast_dtype=True) + if model_parameters: + logging.info('loading model params from %s', model_parameters) + nlp.utils.load_parameters(model, model_parameters, ctx=ctx, cast_dtype=True) + nlp.utils.mkdir(output_dir) + + logging.debug(model) + model.hybridize(static_alloc=True) + loss_function.hybridize(static_alloc=True) + + if deploy: + logging.info('load symbol file directly as SymbolBlock for model deployment') + model = mx.gluon.SymbolBlock.imports('{}-symbol.json'.format(args.model_prefix), + ['data0', 'data1', 'data2'], + '{}-0000.params'.format(args.model_prefix)) + model.hybridize(static_alloc=True, static_shape=True) + + # data processing + do_lower_case = 'uncased' in dataset + if use_roberta: + bert_tokenizer = nlp.data.GPT2BPETokenizer() + else: + bert_tokenizer = BERTTokenizer(vocabulary, lower=do_lower_case) + + # calibration config + only_calibration = args.only_calibration + num_calib_batches = args.num_calib_batches + quantized_dtype = args.quantized_dtype + calib_mode = args.calib_mode def convert_examples_to_features(example, tokenizer=None, truncate_length=512, cls_token=None, sep_token=None, class_labels=None, label_alias=None, vocab=None, @@ -412,11 +413,11 @@ def preprocess_data(tokenizer, task, batch_size, dev_batch_size, max_len, vocab) loader_test_list.append((segment, loader_test)) return loader_train, loader_dev_list, loader_test_list, len(data_train) - -# Get the loader. -logging.info('processing dataset...') -train_data, dev_data_list, test_data_list, num_train_examples = preprocess_data( - bert_tokenizer, task, batch_size, dev_batch_size, args.max_len, vocabulary) +if __name__ == '__main__': + # Get the loader. + logging.info('processing dataset...') + train_data, dev_data_list, test_data_list, num_train_examples = preprocess_data( + bert_tokenizer, task, batch_size, dev_batch_size, args.max_len, vocabulary) def calibration(net, dev_data_list, num_calib_batches, quantized_dtype, calib_mode): """calibration function on the dev dataset.""" diff --git a/scripts/bert/finetune_squad.py b/scripts/bert/finetune_squad.py index 2f21356986..bdf4195c40 100644 --- a/scripts/bert/finetune_squad.py +++ b/scripts/bert/finetune_squad.py @@ -62,331 +62,333 @@ random.seed(6) mx.random.seed(6) -log = logging.getLogger('gluonnlp') -log.setLevel(logging.DEBUG) -formatter = logging.Formatter( - fmt='%(levelname)s:%(name)s:%(asctime)s %(message)s', datefmt='%H:%M:%S') - -parser = argparse.ArgumentParser( - description='BERT QA example.' 
- 'We fine-tune the BERT model on SQuAD dataset.') - -parser.add_argument('--only_predict', - action='store_true', - help='Whether to predict only.') - -parser.add_argument('--model_parameters', - type=str, - default=None, - help='Model parameter file') - -parser.add_argument('--bert_model', - type=str, - default='bert_12_768_12', - help='BERT model name. options are bert_12_768_12 and bert_24_1024_16.') - -parser.add_argument('--bert_dataset', - type=str, - default='book_corpus_wiki_en_uncased', - help='BERT dataset name.' - 'options are book_corpus_wiki_en_uncased and book_corpus_wiki_en_cased.') - -parser.add_argument('--pretrained_bert_parameters', - type=str, - default=None, - help='Pre-trained bert model parameter file. default is None') - -parser.add_argument('--uncased', - action='store_false', - help='if not set, inputs are converted to lower case.') - -parser.add_argument('--output_dir', - type=str, - default='./output_dir', - help='The output directory where the model params will be written.' - ' default is ./output_dir') - -parser.add_argument('--epochs', - type=int, - default=3, - help='number of epochs, default is 3') -parser.add_argument('--training_steps', - type=int, - help='training steps, epochs will be ignored ' - 'if trainin_steps is specified.') -parser.add_argument('--batch_size', - type=int, - default=32, - help='Batch size. Number of examples per gpu in a minibatch. default is 32') - -parser.add_argument('--test_batch_size', - type=int, - default=24, - help='Test batch size. default is 24') - -parser.add_argument('--optimizer', - type=str, - default='bertadam', - help='optimization algorithm. default is bertadam') - -parser.add_argument('--accumulate', - type=int, - default=None, - help='The number of batches for ' - 'gradients accumulation to simulate large batch size. Default is None') - -parser.add_argument('--lr', - type=float, - default=5e-5, - help='Initial learning rate. default is 5e-5') - -parser.add_argument('--warmup_ratio', - type=float, - default=0.1, - help='ratio of warmup steps that linearly increase learning rate from ' - '0 to target learning rate. default is 0.1') - -parser.add_argument('--log_interval', - type=int, - default=50, - help='report interval. default is 50') - -parser.add_argument('--max_seq_length', - type=int, - default=384, - help='The maximum total input sequence length after WordPiece tokenization.' - 'Sequences longer than this will be truncated, and sequences shorter ' - 'than this will be padded. default is 384') - -parser.add_argument( - '--round_to', type=int, default=None, - help='The length of padded sequences will be rounded up to be multiple of this argument.' - 'When round to is set to 8, training throughput may increase for mixed precision' - 'training on GPUs with tensorcores.') - -parser.add_argument('--doc_stride', - type=int, - default=128, - help='When splitting up a long document into chunks, how much stride to ' - 'take between chunks. default is 128') - -parser.add_argument('--max_query_length', - type=int, - default=64, - help='The maximum number of tokens for the question. Questions longer than ' - 'this will be truncated to this length. default is 64') - -parser.add_argument('--n_best_size', - type=int, - default=20, - help='The total number of n-best predictions to generate in the ' - 'nbest_predictions.json output file. default is 20') - -parser.add_argument('--max_answer_length', - type=int, - default=30, - help='The maximum length of an answer that can be generated. 
This is needed ' - 'because the start and end predictions are not conditioned on one another.' - ' default is 30') - -parser.add_argument('--version_2', - action='store_true', - help='SQuAD examples whether contain some that do not have an answer.') - -parser.add_argument('--null_score_diff_threshold', - type=float, - default=0.0, - help='If null_score - best_non_null is greater than the threshold predict null.' - 'Typical values are between -1.0 and -5.0. default is 0.0') - -parser.add_argument('--gpu', - action='store_true', - help='use GPU instead of CPU') - -parser.add_argument('--sentencepiece', - type=str, - default=None, - help='Path to the sentencepiece .model file for both tokenization and vocab.') - -parser.add_argument('--debug', - action='store_true', - help='Run the example in test mode for sanity checks') - -parser.add_argument('--dtype', - type=str, - default='float32', - help='Data type used for training. Either float32 or float16') - -parser.add_argument('--comm_backend', - type=str, - default=None, - help='Communication backend. Set to horovod if horovod is used for ' - 'multi-GPU training') - -parser.add_argument('--deploy', action='store_true', - help='whether load static model for deployment') - -parser.add_argument('--model_prefix', type=str, required=False, - help='load static model as hybridblock.') - -parser.add_argument('--only_calibration', action='store_true', - help='quantize model') - -parser.add_argument('--num_calib_batches', type=int, default=10, - help='number of batches for calibration') - -parser.add_argument('--quantized_dtype', type=str, default='auto', - choices=['auto', 'int8', 'uint8'], - help='quantization destination data type for input data') - -parser.add_argument('--calib_mode', type=str, default='customize', - choices=['none', 'naive', 'entropy', 'customize'], - help='calibration mode used for generating calibration table ' - 'for the quantized symbol.') - -args = parser.parse_args() - -output_dir = args.output_dir -if not os.path.exists(output_dir): - os.mkdir(output_dir) - -fh = logging.FileHandler(os.path.join(args.output_dir, 'finetune_squad.log'), - mode='w') -fh.setLevel(logging.INFO) -fh.setFormatter(formatter) -console = logging.StreamHandler() -console.setLevel(logging.INFO) -console.setFormatter(formatter) -log.addHandler(console) -log.addHandler(fh) - -log.info(args) - -if args.comm_backend == 'horovod': - import horovod.mxnet as hvd - hvd.init() - rank = hvd.rank() - size = hvd.size() - local_rank = hvd.local_rank() -else: - rank = 0 - size = 1 - local_rank = 0 - -if args.dtype == 'float16': - from mxnet.contrib import amp - amp.init() - -model_name = args.bert_model -dataset_name = args.bert_dataset -only_predict = args.only_predict -model_parameters = args.model_parameters -pretrained_bert_parameters = args.pretrained_bert_parameters -if pretrained_bert_parameters and model_parameters: - raise ValueError('Cannot provide both pre-trained BERT parameters and ' - 'BertForQA model parameters.') -lower = args.uncased - -batch_size = args.batch_size -test_batch_size = args.test_batch_size -lr = args.lr -ctx = mx.gpu(local_rank) if args.gpu else mx.cpu() - -accumulate = args.accumulate -log_interval = args.log_interval * accumulate if accumulate else args.log_interval -if accumulate: - log.info('Using gradient accumulation. Effective total batch size = {}'. 
- format(accumulate*batch_size*size)) - -optimizer = args.optimizer -warmup_ratio = args.warmup_ratio - - -version_2 = args.version_2 -null_score_diff_threshold = args.null_score_diff_threshold - -max_seq_length = args.max_seq_length -doc_stride = args.doc_stride -max_query_length = args.max_query_length -n_best_size = args.n_best_size -max_answer_length = args.max_answer_length - -if max_seq_length <= max_query_length + 3: - raise ValueError('The max_seq_length (%d) must be greater than max_query_length ' - '(%d) + 3' % (max_seq_length, max_query_length)) - -# vocabulary and tokenizer -if args.sentencepiece: - logging.info('loading vocab file from sentence piece model: %s', args.sentencepiece) - if dataset_name: - warnings.warn('Both --dataset_name and --sentencepiece are provided. ' - 'The vocabulary will be loaded based on --sentencepiece.') - vocab = nlp.vocab.BERTVocab.from_sentencepiece(args.sentencepiece) - dataset_name = None -else: - vocab = None - -pretrained = not model_parameters and not pretrained_bert_parameters and not args.sentencepiece -bert, vocab = nlp.model.get_model( - name=model_name, - dataset_name=dataset_name, - vocab=vocab, - pretrained=pretrained, - ctx=ctx, - use_pooler=False, - use_decoder=False, - use_classifier=False) - -if args.sentencepiece: - tokenizer = nlp.data.BERTSPTokenizer(args.sentencepiece, vocab, lower=lower) -else: - tokenizer = nlp.data.BERTTokenizer(vocab=vocab, lower=lower) - -batchify_fn = nlp.data.batchify.Tuple( - nlp.data.batchify.Stack(), - nlp.data.batchify.Pad(axis=0, pad_val=vocab[vocab.padding_token], round_to=args.round_to), - nlp.data.batchify.Pad(axis=0, pad_val=vocab[vocab.padding_token], round_to=args.round_to), - nlp.data.batchify.Stack('float32'), - nlp.data.batchify.Stack('float32'), - nlp.data.batchify.Stack('float32')) - -# load symbolic model -deploy = args.deploy -model_prefix = args.model_prefix - -net = BertForQA(bert=bert) -if model_parameters: - # load complete BertForQA parameters - nlp.utils.load_parameters(net, model_parameters, ctx=ctx, cast_dtype=True) -elif pretrained_bert_parameters: - # only load BertModel parameters - nlp.utils.load_parameters(bert, pretrained_bert_parameters, ctx=ctx, - ignore_extra=True, cast_dtype=True) - net.span_classifier.initialize(init=mx.init.Normal(0.02), ctx=ctx) -elif pretrained: - # only load BertModel parameters - net.span_classifier.initialize(init=mx.init.Normal(0.02), ctx=ctx) -else: - # no checkpoint is loaded - net.initialize(init=mx.init.Normal(0.02), ctx=ctx) - -net.hybridize(static_alloc=True) - -loss_function = BertForQALoss() -loss_function.hybridize(static_alloc=True) - -if deploy: - logging.info('load symbol file directly as SymbolBlock for model deployment') - net = mx.gluon.SymbolBlock.imports('{}-symbol.json'.format(args.model_prefix), - ['data0', 'data1', 'data2'], - '{}-0000.params'.format(args.model_prefix)) - net.hybridize(static_alloc=True, static_shape=True) - -# calibration config -only_calibration = args.only_calibration -num_calib_batches = args.num_calib_batches -quantized_dtype = args.quantized_dtype -calib_mode = args.calib_mode +if __name__ == '__main__': + log = logging.getLogger('gluonnlp') + log.setLevel(logging.DEBUG) + formatter = logging.Formatter( + fmt='%(levelname)s:%(name)s:%(asctime)s %(message)s', datefmt='%H:%M:%S') + + parser = argparse.ArgumentParser( + description='BERT QA example.' 
+ 'We fine-tune the BERT model on SQuAD dataset.') + + parser.add_argument('--only_predict', + action='store_true', + help='Whether to predict only.') + + parser.add_argument('--model_parameters', + type=str, + default=None, + help='Model parameter file') + + parser.add_argument('--bert_model', + type=str, + default='bert_12_768_12', + help='BERT model name. options are bert_12_768_12 and bert_24_1024_16.') + + parser.add_argument('--bert_dataset', + type=str, + default='book_corpus_wiki_en_uncased', + help='BERT dataset name.' + 'options are book_corpus_wiki_en_uncased and book_corpus_wiki_en_cased.') + + parser.add_argument('--pretrained_bert_parameters', + type=str, + default=None, + help='Pre-trained bert model parameter file. default is None') + + parser.add_argument('--uncased', + action='store_false', + help='if not set, inputs are converted to lower case.') + + parser.add_argument('--output_dir', + type=str, + default='./output_dir', + help='The output directory where the model params will be written.' + ' default is ./output_dir') + + parser.add_argument('--epochs', + type=int, + default=3, + help='number of epochs, default is 3') + parser.add_argument('--training_steps', + type=int, + help='training steps, epochs will be ignored ' + 'if trainin_steps is specified.') + parser.add_argument('--batch_size', + type=int, + default=32, + help='Batch size. Number of examples per gpu in a minibatch. default is 32') + + parser.add_argument('--test_batch_size', + type=int, + default=24, + help='Test batch size. default is 24') + + parser.add_argument('--optimizer', + type=str, + default='bertadam', + help='optimization algorithm. default is bertadam') + + parser.add_argument('--accumulate', + type=int, + default=None, + help='The number of batches for ' + 'gradients accumulation to simulate large batch size. Default is None') + + parser.add_argument('--lr', + type=float, + default=5e-5, + help='Initial learning rate. default is 5e-5') + + parser.add_argument('--warmup_ratio', + type=float, + default=0.1, + help='ratio of warmup steps that linearly increase learning rate from ' + '0 to target learning rate. default is 0.1') + + parser.add_argument('--log_interval', + type=int, + default=50, + help='report interval. default is 50') + + parser.add_argument('--max_seq_length', + type=int, + default=384, + help='The maximum total input sequence length after WordPiece tokenization.' + 'Sequences longer than this will be truncated, and sequences shorter ' + 'than this will be padded. default is 384') + + parser.add_argument( + '--round_to', type=int, default=None, + help='The length of padded sequences will be rounded up to be multiple of this argument.' + 'When round to is set to 8, training throughput may increase for mixed precision' + 'training on GPUs with tensorcores.') + + parser.add_argument('--doc_stride', + type=int, + default=128, + help='When splitting up a long document into chunks, how much stride to ' + 'take between chunks. default is 128') + + parser.add_argument('--max_query_length', + type=int, + default=64, + help='The maximum number of tokens for the question. Questions longer than ' + 'this will be truncated to this length. default is 64') + + parser.add_argument('--n_best_size', + type=int, + default=20, + help='The total number of n-best predictions to generate in the ' + 'nbest_predictions.json output file. default is 20') + + parser.add_argument('--max_answer_length', + type=int, + default=30, + help='The maximum length of an answer that can be generated. 
' + 'This is needed because the start and end predictions are not ' + 'conditioned on one another. default is 30') + + parser.add_argument('--version_2', + action='store_true', + help='SQuAD examples whether contain some that do not have an answer.') + + parser.add_argument('--null_score_diff_threshold', + type=float, + default=0.0, + help='If null_score - best_non_null is greater than the threshold ' + 'predict null. Typical values are between -1.0 and -5.0. default is 0.0') + + parser.add_argument('--gpu', + action='store_true', + help='use GPU instead of CPU') + + parser.add_argument('--sentencepiece', + type=str, + default=None, + help='Path to the sentencepiece .model file for both tokenization and ' + 'vocab.') + + parser.add_argument('--debug', + action='store_true', + help='Run the example in test mode for sanity checks') + + parser.add_argument('--dtype', + type=str, + default='float32', + help='Data type used for training. Either float32 or float16') + + parser.add_argument('--comm_backend', + type=str, + default=None, + help='Communication backend. Set to horovod if horovod is used for ' + 'multi-GPU training') + + parser.add_argument('--deploy', action='store_true', + help='whether load static model for deployment') + + parser.add_argument('--model_prefix', type=str, required=False, + help='load static model as hybridblock.') + + parser.add_argument('--only_calibration', action='store_true', + help='quantize model') + + parser.add_argument('--num_calib_batches', type=int, default=10, + help='number of batches for calibration') + + parser.add_argument('--quantized_dtype', type=str, default='auto', + choices=['auto', 'int8', 'uint8'], + help='quantization destination data type for input data') + + parser.add_argument('--calib_mode', type=str, default='customize', + choices=['none', 'naive', 'entropy', 'customize'], + help='calibration mode used for generating calibration table ' + 'for the quantized symbol.') + + args = parser.parse_args() + + output_dir = args.output_dir + if not os.path.exists(output_dir): + os.mkdir(output_dir) + + fh = logging.FileHandler(os.path.join(args.output_dir, 'finetune_squad.log'), + mode='w') + fh.setLevel(logging.INFO) + fh.setFormatter(formatter) + console = logging.StreamHandler() + console.setLevel(logging.INFO) + console.setFormatter(formatter) + log.addHandler(console) + log.addHandler(fh) + + log.info(args) + + if args.comm_backend == 'horovod': + import horovod.mxnet as hvd + hvd.init() + rank = hvd.rank() + size = hvd.size() + local_rank = hvd.local_rank() + else: + rank = 0 + size = 1 + local_rank = 0 + + if args.dtype == 'float16': + from mxnet.contrib import amp + amp.init() + + model_name = args.bert_model + dataset_name = args.bert_dataset + only_predict = args.only_predict + model_parameters = args.model_parameters + pretrained_bert_parameters = args.pretrained_bert_parameters + if pretrained_bert_parameters and model_parameters: + raise ValueError('Cannot provide both pre-trained BERT parameters and ' + 'BertForQA model parameters.') + lower = args.uncased + + batch_size = args.batch_size + test_batch_size = args.test_batch_size + lr = args.lr + ctx = mx.gpu(local_rank) if args.gpu else mx.cpu() + + accumulate = args.accumulate + log_interval = args.log_interval * accumulate if accumulate else args.log_interval + if accumulate: + log.info('Using gradient accumulation. Effective total batch size = {}'. 
+ format(accumulate*batch_size*size)) + + optimizer = args.optimizer + warmup_ratio = args.warmup_ratio + + + version_2 = args.version_2 + null_score_diff_threshold = args.null_score_diff_threshold + + max_seq_length = args.max_seq_length + doc_stride = args.doc_stride + max_query_length = args.max_query_length + n_best_size = args.n_best_size + max_answer_length = args.max_answer_length + + if max_seq_length <= max_query_length + 3: + raise ValueError('The max_seq_length (%d) must be greater than max_query_length ' + '(%d) + 3' % (max_seq_length, max_query_length)) + + # vocabulary and tokenizer + if args.sentencepiece: + logging.info('loading vocab file from sentence piece model: %s', args.sentencepiece) + if dataset_name: + warnings.warn('Both --dataset_name and --sentencepiece are provided. ' + 'The vocabulary will be loaded based on --sentencepiece.') + vocab = nlp.vocab.BERTVocab.from_sentencepiece(args.sentencepiece) + dataset_name = None + else: + vocab = None + + pretrained = not model_parameters and not pretrained_bert_parameters and not args.sentencepiece + bert, vocab = nlp.model.get_model( + name=model_name, + dataset_name=dataset_name, + vocab=vocab, + pretrained=pretrained, + ctx=ctx, + use_pooler=False, + use_decoder=False, + use_classifier=False) + + if args.sentencepiece: + tokenizer = nlp.data.BERTSPTokenizer(args.sentencepiece, vocab, lower=lower) + else: + tokenizer = nlp.data.BERTTokenizer(vocab=vocab, lower=lower) + + batchify_fn = nlp.data.batchify.Tuple( + nlp.data.batchify.Stack(), + nlp.data.batchify.Pad(axis=0, pad_val=vocab[vocab.padding_token], round_to=args.round_to), + nlp.data.batchify.Pad(axis=0, pad_val=vocab[vocab.padding_token], round_to=args.round_to), + nlp.data.batchify.Stack('float32'), + nlp.data.batchify.Stack('float32'), + nlp.data.batchify.Stack('float32')) + + # load symbolic model + deploy = args.deploy + model_prefix = args.model_prefix + + net = BertForQA(bert=bert) + if model_parameters: + # load complete BertForQA parameters + nlp.utils.load_parameters(net, model_parameters, ctx=ctx, cast_dtype=True) + elif pretrained_bert_parameters: + # only load BertModel parameters + nlp.utils.load_parameters(bert, pretrained_bert_parameters, ctx=ctx, + ignore_extra=True, cast_dtype=True) + net.span_classifier.initialize(init=mx.init.Normal(0.02), ctx=ctx) + elif pretrained: + # only load BertModel parameters + net.span_classifier.initialize(init=mx.init.Normal(0.02), ctx=ctx) + else: + # no checkpoint is loaded + net.initialize(init=mx.init.Normal(0.02), ctx=ctx) + + net.hybridize(static_alloc=True) + + loss_function = BertForQALoss() + loss_function.hybridize(static_alloc=True) + + if deploy: + logging.info('load symbol file directly as SymbolBlock for model deployment') + net = mx.gluon.SymbolBlock.imports('{}-symbol.json'.format(args.model_prefix), + ['data0', 'data1', 'data2'], + '{}-0000.params'.format(args.model_prefix)) + net.hybridize(static_alloc=True, static_shape=True) + + # calibration config + only_calibration = args.only_calibration + num_calib_batches = args.num_calib_batches + quantized_dtype = args.quantized_dtype + calib_mode = args.calib_mode def train(): """Training function.""" diff --git a/scripts/bert/index.rst b/scripts/bert/index.rst index 158da3be52..5834565915 100644 --- a/scripts/bert/index.rst +++ b/scripts/bert/index.rst @@ -335,14 +335,21 @@ To use sentencepiece vocab for pre-training, please set --sentencepiece=my_vocab Export BERT for Deployment ~~~~~~~~~~~~~~~~~~~~~~~~~~ -Current export.py support exporting BERT 
models. Supported values for --task argument include classification, regression and question answering.
+The script deploy.py allows exporting/importing BERT models. Supported values for the --task argument include QA (question-answering), embedding (see the section below), and classification and regression tasks specified by one of the following datasets: MRPC, QQP, QNLI, RTE, STS-B, CoLA, MNLI, WNLI, SST, XNLI, LCQMC, ChnSentiCorp. The available validation datasets are used to run and test inference.

.. code-block:: console

- $ python export.py --task classification --model_parameters /path/to/saved/ckpt.params --output_dir /path/to/output/dir/ --seq_length 128
+ $ MXNET_SAFE_ACCUMULATION=1 MXNET_FC_TRUE_FP16=1 python deploy.py --task SST --model_parameters /path/to/saved/ckpt.params --output_dir /path/to/output/dir/ --seq_length 128 --gpu 0 --dtype float16
+
+This will export the BERT model and its parameters for a classification (sentiment analysis) task to symbol.json/param files, saved into the directory specified by --output_dir.
+
+Once the model is exported, you can import it by setting --only_infer and specifying the path to your model with --exported_model, followed by the prefix name of the symbol.json/param files.
+
+The batch size can be specified via the --test_batch_size option, and accuracy can be checked by setting --check_accuracy.
+
+When using a GPU and data type FP16 (--dtype float16), we recommend setting MXNET_FC_TRUE_FP16=1 to boost performance.
+Moreover, you can use a custom graph pass for BERT, via --custom_pass [custom_pass_file], to improve performance on GPU. To generate the pass, run setup.py within the BERT scripts directory. These GPU optimizations require MXNet version 1.7 or higher.

-This will export the BERT model for classification to a symbol.json file, saved to the directory specified by --output_dir.
-The --model_parameters argument is optional. If not set, the .params file saved in the output directory will be randomly initialized parameters.

BERT for Sentence or Tokens Embedding
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/scripts/bert/setup.py b/scripts/bert/setup.py
new file mode 100644
index 0000000000..5872faa0f9
--- /dev/null
+++ b/scripts/bert/setup.py
@@ -0,0 +1,48 @@
+#!/usr/bin/env python
+"""
+setup.py: prepares required libraries for BERT scripts
+"""
+import pathlib
+import sys
+import os
+import logging
+from distutils.command.install import install
+from setuptools import setup
+import mxnet
+
+requirements = [
+ 'numpy>=1.16.0',
+]
+
+def CompileBERTCustomPass():
+ """Compiles custom graph pass for BERT into a library. It offers performance improvements"""
+ logging.basicConfig(stream=sys.stderr, level=logging.INFO)
+ log = logging.getLogger()
+ input_pass_file = 'bertpass_gpu.cc'
+ out_lib_file = 'bertpass_lib.so'
+ log.info(' ... 
compiling BERT custom graph pass into %s', out_lib_file) + mxnet_path = pathlib.Path(mxnet.__file__).parent.absolute() + mxnet_include_path = pathlib.Path.joinpath(mxnet_path, 'include') + pass_path = os.path.dirname(os.path.realpath(__file__)) + source = os.path.join(pass_path, input_pass_file) + target = os.path.join(pass_path, out_lib_file) + os.system('g++ -shared -fPIC -std=c++11 ' + str(source) + + ' -o ' + str(target) + ' -I ' + + str(mxnet_include_path)) + +class CompileBERTPass(install): + def run(self): + install.run(self) + self.execute(CompileBERTCustomPass, ()) + +setup( + # Metadata + name='gluonnlp-scripts-bert', + python_requires='>=3.5', + author='Gluon NLP Toolkit Contributors', + author_email='mxnet-gluon@amazon.com', + url='https://github.com/dmlc/gluon-nlp', + description='MXNet Gluon NLP Toolkit - BERT scripts', + license='Apache-2.0', + cmdclass={'install': CompileBERTPass} +) diff --git a/scripts/tests/test_scripts.py b/scripts/tests/test_scripts.py index 9e848a774f..cc858a5d65 100644 --- a/scripts/tests/test_scripts.py +++ b/scripts/tests/test_scripts.py @@ -421,3 +421,25 @@ def test_xlnet_finetune_squad(): process = subprocess.check_call([sys.executable, './scripts/language_model/run_squad.py'] + arguments) time.sleep(5) + +@pytest.mark.skipif(mx.__version__ < '1.7.0', reason="Requires MXNet 1.7 or higher") +@pytest.mark.serial +@pytest.mark.gpu +@pytest.mark.remote_required +@pytest.mark.integration +@pytest.mark.parametrize('bert_model', ['bert_12_768_12']) +@pytest.mark.parametrize('task', ['SST', 'embedding', 'QA']) +@pytest.mark.parametrize('dtype', ['float32', 'float16']) +def test_deploy_bert(bert_model, task, dtype): + subprocess.check_call([sys.executable, './scripts/bert/setup.py', 'install']) + arguments = ['--bert_model', bert_model, '--task', task, '--dtype', dtype, + '--gpu', '0', '--seq_length', '128', '--test_batch_size', '300', + '--custom_pass', 'scripts/bert/bertpass_lib.so', '--check_accuracy'] + if dtype == 'float16': + os.environ['MXNET_FC_TRUE_FP16'] = '1' + os.environ['MXNET_SAFE_ACCUMULATION'] = '1' + process = subprocess.check_call([sys.executable, './scripts/bert/deploy.py'] + + arguments) + os.environ['MXNET_FC_TRUE_FP16'] = '0' + os.environ['MXNET_SAFE_ACCUMULATION'] = '0' + time.sleep(5) From 52e3de1dbe1b680eca7e2d771c5c380d849d5fcf Mon Sep 17 00:00:00 2001 From: Sheng Zha Date: Fri, 18 Sep 2020 13:25:00 -0700 Subject: [PATCH 34/42] fix sig (#1368) --- scripts/machine_translation/gnmt.py | 2 +- src/gluonnlp/model/bert.py | 2 +- src/gluonnlp/model/seq2seq_encoder_decoder.py | 10 +++++----- src/gluonnlp/model/transformer.py | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/scripts/machine_translation/gnmt.py b/scripts/machine_translation/gnmt.py index c31cb1d66f..736f7f8472 100644 --- a/scripts/machine_translation/gnmt.py +++ b/scripts/machine_translation/gnmt.py @@ -133,7 +133,7 @@ def __call__(self, inputs, states=None, valid_length=None): """ return super(GNMTEncoder, self).__call__(inputs, states, valid_length) - def forward(self, inputs, states=None, valid_length=None): #pylint: disable=arguments-differ, missing-docstring + def forward(self, inputs, states=None, valid_length=None): #pylint: missing-docstring # TODO(sxjscience) Accelerate the forward using HybridBlock _, length, _ = inputs.shape new_states = [] diff --git a/src/gluonnlp/model/bert.py b/src/gluonnlp/model/bert.py index 5bd8b1a8ef..881b902467 100644 --- a/src/gluonnlp/model/bert.py +++ b/src/gluonnlp/model/bert.py @@ -340,7 +340,7 @@ def 
__init__(self, *, num_layers=2, units=512, hidden_size=2048, activation=activation, layer_norm_eps=layer_norm_eps) self.transformer_cells.add(cell) - def __call__(self, inputs, states=None, valid_length=None): # pylint: disable=arguments-differ + def __call__(self, inputs, states=None, valid_length=None): """Encode the inputs given the states and valid sequence length. Parameters diff --git a/src/gluonnlp/model/seq2seq_encoder_decoder.py b/src/gluonnlp/model/seq2seq_encoder_decoder.py index 42e9f9f1ba..3851aaa67a 100644 --- a/src/gluonnlp/model/seq2seq_encoder_decoder.py +++ b/src/gluonnlp/model/seq2seq_encoder_decoder.py @@ -53,27 +53,27 @@ class Seq2SeqEncoder(Block): r"""Base class of the encoders in sequence to sequence learning models. """ - def __call__(self, inputs, valid_length=None, states=None): #pylint: disable=arguments-differ + def __call__(self, inputs, states=None, valid_length=None): #pylint: disable=arguments-differ """Encode the input sequence. Parameters ---------- inputs : NDArray The input sequence, Shape (batch_size, length, C_in). + states : list of NDArrays or None, default None + List that contains the initial states of the encoder. valid_length : NDArray or None, default None The valid length of the input sequence, Shape (batch_size,). This is used when the input sequences are padded. If set to None, all elements in the sequence are used. - states : list of NDArrays or None, default None - List that contains the initial states of the encoder. Returns ------- outputs : list Outputs of the encoder. """ - return super(Seq2SeqEncoder, self).__call__(inputs, valid_length, states) + return super(Seq2SeqEncoder, self).__call__(inputs, states, valid_length) - def forward(self, inputs, valid_length=None, states=None): #pylint: disable=arguments-differ + def forward(self, inputs, states=None, valid_length=None): #pylint: disable=arguments-differ raise NotImplementedError diff --git a/src/gluonnlp/model/transformer.py b/src/gluonnlp/model/transformer.py index decad26d45..e1adf4cf82 100644 --- a/src/gluonnlp/model/transformer.py +++ b/src/gluonnlp/model/transformer.py @@ -344,7 +344,7 @@ def __init__(self, *, attention_cell='multi_head', num_layers=2, units=512, hidd scaled=scaled, output_attention=output_attention, prefix='transformer%d_' % i) self.transformer_cells.add(cell) - def __call__(self, inputs, states=None, valid_length=None): #pylint: disable=arguments-differ + def __call__(self, inputs, states=None, valid_length=None): """Encode the inputs given the states and valid sequence length. Parameters From cbf5bd0b8c0e6ce7f4e8e0e2ea08dc796b1cc5c7 Mon Sep 17 00:00:00 2001 From: Leonard Lausen Date: Mon, 5 Oct 2020 18:41:58 -0700 Subject: [PATCH 35/42] Build horovod with gloo (#1383) * Build horovod with gloo * Update prepare_clean_env.sh --- ci/prepare_clean_env.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ci/prepare_clean_env.sh b/ci/prepare_clean_env.sh index 1a224c418a..7411940614 100755 --- a/ci/prepare_clean_env.sh +++ b/ci/prepare_clean_env.sh @@ -11,6 +11,8 @@ export MXNET_HOME=$PWD/tests/data export HOROVOD_WITHOUT_TENSORFLOW=1 export HOROVOD_WITHOUT_PYTORCH=1 export HOROVOD_WITH_MXNET=1 +export HOROVOD_WITH_GLOO=1 +export HOROVOD_WITHOUT_MPI=1 make clean conda env update --prune -p conda/${env_name} -f env/${env_name}.yml @@ -18,6 +20,7 @@ conda activate ./conda/${env_name} conda list printenv +pip install cmake pip install -v -e . 
pip install horovod --no-cache-dir -U python -m spacy download en From 2a554631ced1fe0401938551303db356326d7a98 Mon Sep 17 00:00:00 2001 From: shishirb126 <71469961+shishirb126@users.noreply.github.com> Date: Thu, 8 Oct 2020 12:48:20 -0700 Subject: [PATCH 36/42] [PERFORMANCE] Improve vocab lookup performance by working with a dict() directly (#1382) Co-authored-by: Sheng Zha --- src/gluonnlp/vocab/vocab.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/src/gluonnlp/vocab/vocab.py b/src/gluonnlp/vocab/vocab.py index 5483b3b678..01fee4a5ab 100644 --- a/src/gluonnlp/vocab/vocab.py +++ b/src/gluonnlp/vocab/vocab.py @@ -30,7 +30,7 @@ from .. import _constants as C from .. import embedding as emb -from ..data.utils import Counter, DefaultLookupDict, count_tokens +from ..data.utils import Counter, count_tokens UNK_IDX = 0 _DEPR_PAD = object() @@ -219,10 +219,7 @@ def __init__(self, counter: Optional[Counter] = None, max_size: Optional[int] = # Set up idx_to_token and token_to_idx based on presence of unknown token self._unknown_token = unknown_token self._idx_to_token = [unknown_token] if unknown_token else [] - if unknown_token: - self._token_to_idx = DefaultLookupDict(UNK_IDX) - else: - self._token_to_idx = {} + self._token_to_idx = dict() # Handle special tokens special_tokens = [] @@ -267,10 +264,6 @@ def __init__(self, counter: Optional[Counter] = None, max_size: Optional[int] = if token_to_idx: self._sort_index_according_to_user_specification(token_to_idx) - if unknown_token: - self._token_to_idx._default = \ - self._token_to_idx[unknown_token] # pytype: disable=not-writable - def _index_counter_keys(self, counter, unknown_token, special_tokens, max_size, min_freq): @@ -395,9 +388,17 @@ def __getitem__(self, tokens): """ if not isinstance(tokens, (list, tuple)): - return self._token_to_idx[tokens] + if self._unknown_token: + unknown_token_idx = self._token_to_idx[self._unknown_token] + return self._token_to_idx.get(tokens, unknown_token_idx) + else: + return self._token_to_idx[tokens] else: - return [self._token_to_idx[token] for token in tokens] + if self._unknown_token: + unknown_token_idx = self._token_to_idx[self._unknown_token] + return [self._token_to_idx.get(token, unknown_token_idx) for token in tokens] + else: + return [self._token_to_idx[token] for token in tokens] def __len__(self): return len(self._idx_to_token) From 8a96e603767f7525d56422620e01d5c7884aa4cf Mon Sep 17 00:00:00 2001 From: Sheng Zha Date: Tue, 5 Jan 2021 13:47:39 -0800 Subject: [PATCH 37/42] Update README.rst fix #1465 --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index cf004dc838..250fb7bae7 100644 --- a/README.rst +++ b/README.rst @@ -215,4 +215,4 @@ The bibtex entry for the `reference paper `__ New to Deep Learning or NLP? ============================ -For background knowledge of deep learning or NLP, please refer to the open source book `Dive into Deep Learning `__. +For background knowledge of deep learning or NLP, please refer to the open source book `Dive into Deep Learning `__ (`中文版 `__). 
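
For illustration, here is a minimal standalone sketch of the lookup pattern that PATCH 36 adopts: a plain built-in dict combined with dict.get() and the unknown-token index as the default. The toy vocabulary and the helper name to_indices below are hypothetical; they only mirror the behaviour of Vocab.__getitem__ after the change.

token_to_idx = {'<unk>': 0, '<pad>': 1, 'hello': 2, 'world': 3}  # hypothetical toy vocab
unk_idx = token_to_idx['<unk>']

def to_indices(tokens):
    """Map a token or a list of tokens to indices, falling back to the unknown-token index."""
    if not isinstance(tokens, (list, tuple)):
        return token_to_idx.get(tokens, unk_idx)
    return [token_to_idx.get(token, unk_idx) for token in tokens]

print(to_indices('hello'))            # 2
print(to_indices(['hello', 'moon']))  # [2, 0]  ('moon' is out of vocabulary)

The speedup in the patch comes from working with a built-in dict directly rather than routing every lookup through a custom lookup-dict subclass, as the commit title states.
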
From dbfb1ed6c1d245df0c4abcf6c4065eae8580504b Mon Sep 17 00:00:00 2001 From: barry-jin Date: Wed, 24 Feb 2021 09:43:03 -0800 Subject: [PATCH 38/42] backport ci from master --- .github/workflows/unittests.yml | 65 +++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) create mode 100644 .github/workflows/unittests.yml diff --git a/.github/workflows/unittests.yml b/.github/workflows/unittests.yml new file mode 100644 index 0000000000..9315dc6536 --- /dev/null +++ b/.github/workflows/unittests.yml @@ -0,0 +1,65 @@ +name: continuous build + +on: [push, pull_request] + +defaults: + run: + shell: bash + +jobs: + unittest: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + # TODO Add windows test by using "windows-latest" + os: [ubuntu-latest, macos-latest] + python-version: [ '3.6', '3.7', '3.8'] + exclude: + - os: macos-latest + python-version: 3.6 + - os: macos-latest + python-version: 3.8 + - os: ubuntu-latest + python-version: 3.7 + steps: + - name: Checkout repository + uses: actions/checkout@v2 + + - name: Compilation cache + uses: actions/cache@v2 + with: + path: ~/.ccache + # We include the commit sha in the cache key, as new cache entries are + # only created if there is no existing entry for the key yet. + key: ${{ runner.os }}-ccache-${{ github.sha }} + # Restore any ccache cache entry, if none for + # ${{ runner.os }}-ccache-${{ github.sha }} exists + restore-keys: | + ${{ runner.os }}-ccache + # Install Linux specific dependencies + - name: Install Linux dependencies + if: matrix.os == 'ubuntu-latest' + # TODO https://github.com/apache/incubator-mxnet/issues/18293 + run: | + sudo apt-get install -y libopenblas-dev ninja-build libedit-dev libxml2-dev + - name: Setup python + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + architecture: x64 + - name: Install Other Dependencies + run: | + python -m pip install --upgrade pip + python -m pip install setuptools pytest pytest-cov contextvars + python -m pip install --upgrade cython + python -m pip install mxnet<2 + python -m pip install -U -e .[extras,dev] + + - name: Run Unittests + run: | + pytest -n 4 -m 'not (gpu or serial)' --durations=30 --cov=./ --cov-report=xml tests/unittest + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v1.0.10 + with: + env_vars: OS,PYTHON From 742103c60e7e4fd33a6de9cf6923e2080b23143a Mon Sep 17 00:00:00 2001 From: barry-jin Date: Wed, 24 Feb 2021 09:49:53 -0800 Subject: [PATCH 39/42] update mxnet version --- .github/workflows/unittests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/unittests.yml b/.github/workflows/unittests.yml index 9315dc6536..14bc834437 100644 --- a/.github/workflows/unittests.yml +++ b/.github/workflows/unittests.yml @@ -53,7 +53,7 @@ jobs: python -m pip install --upgrade pip python -m pip install setuptools pytest pytest-cov contextvars python -m pip install --upgrade cython - python -m pip install mxnet<2 + python -m pip install mxnet>=1.6.0 python -m pip install -U -e .[extras,dev] - name: Run Unittests From f49468e7a3e23e72dae877c47937330bedc88bc1 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Wed, 24 Feb 2021 10:01:40 -0800 Subject: [PATCH 40/42] update mxnet version --- .github/workflows/unittests.yml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/unittests.yml b/.github/workflows/unittests.yml index 14bc834437..b7d93930b4 100644 --- a/.github/workflows/unittests.yml +++ b/.github/workflows/unittests.yml @@ 
-52,9 +52,13 @@ jobs: run: | python -m pip install --upgrade pip python -m pip install setuptools pytest pytest-cov contextvars - python -m pip install --upgrade cython - python -m pip install mxnet>=1.6.0 + python -m pip install mxnet==1.7.0 python -m pip install -U -e .[extras,dev] + python -m pip install -v -e . + python -m pip install horovod --no-cache-dir -U + python -m spacy download en + python -m spacy download de + python -m nltk.downloader all - name: Run Unittests run: | From 43a012c34c09c7d2fbdc2c7ba1a911c8e9d94bc8 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Wed, 24 Feb 2021 10:08:29 -0800 Subject: [PATCH 41/42] update mxnet version --- .github/workflows/unittests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/unittests.yml b/.github/workflows/unittests.yml index b7d93930b4..f73a2a5811 100644 --- a/.github/workflows/unittests.yml +++ b/.github/workflows/unittests.yml @@ -52,7 +52,7 @@ jobs: run: | python -m pip install --upgrade pip python -m pip install setuptools pytest pytest-cov contextvars - python -m pip install mxnet==1.7.0 + python -m pip install --pre "mxnet>=1.9.0b20210220" -f https://dist.mxnet.io/python python -m pip install -U -e .[extras,dev] python -m pip install -v -e . python -m pip install horovod --no-cache-dir -U From 6b263d4fb2993f420f61a5228a1d5787d8ded8de Mon Sep 17 00:00:00 2001 From: barry-jin Date: Wed, 24 Feb 2021 10:12:22 -0800 Subject: [PATCH 42/42] remove some dependecies --- .github/workflows/unittests.yml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/.github/workflows/unittests.yml b/.github/workflows/unittests.yml index f73a2a5811..a879d21589 100644 --- a/.github/workflows/unittests.yml +++ b/.github/workflows/unittests.yml @@ -37,12 +37,6 @@ jobs: # ${{ runner.os }}-ccache-${{ github.sha }} exists restore-keys: | ${{ runner.os }}-ccache - # Install Linux specific dependencies - - name: Install Linux dependencies - if: matrix.os == 'ubuntu-latest' - # TODO https://github.com/apache/incubator-mxnet/issues/18293 - run: | - sudo apt-get install -y libopenblas-dev ninja-build libedit-dev libxml2-dev - name: Setup python uses: actions/setup-python@v2 with:
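
As a footnote to the deploy.py inference loops added earlier in this series, the following is a minimal sketch (not part of any patch above) of the latency/throughput bookkeeping pattern they use: per-batch latency is accumulated around each forward call, a blocking .asnumpy() forces the batch to finish, and mx.nd.waitall() drains any remaining asynchronous work before the end-to-end timer is read. The names net and dataloader and the batch layout are assumptions for the sketch.

import time
import mxnet as mx

def benchmark(net, dataloader, ctx=mx.cpu(), dtype='float32', max_iters=None):
    """Measure throughput and average latency of a hybridized Gluon block (sketch)."""
    total_iters, total_samples, total_latency = 0, 0, 0.0
    tic = time.time()
    for token_ids, token_types, valid_length in dataloader:  # assumed batch layout
        tic_latency = time.time()
        out = net(token_ids.as_in_context(ctx),
                  token_types.as_in_context(ctx),
                  valid_length.as_in_context(ctx).astype(dtype))
        out.asnumpy()                      # block until this batch has finished
        total_latency += time.time() - tic_latency
        total_iters += 1
        total_samples += len(token_ids)
        if max_iters and total_iters >= max_iters:
            break
    mx.nd.waitall()                        # drain any outstanding asynchronous work
    toc = time.time()
    return {'throughput_samples_per_s': total_samples / (toc - tic),
            'avg_latency_ms': total_latency / total_iters * 1000}

Timing only around the forward call, while synchronizing with asnumpy()/waitall(), keeps MXNet's asynchronous execution from making the reported latency look artificially low.
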