From 0fbdb69249dd4e9ca84fafef8e88103b330bfaed Mon Sep 17 00:00:00 2001
From: ebsmothers
Date: Fri, 12 Jul 2024 10:07:09 -0700
Subject: [PATCH] update models docs (#1167)

---
 docs/source/api_ref_models.rst               | 47 +++++++++++++++----
 .../models/code_llama2/_model_builders.py    |  3 ++
 torchtune/models/llama2/__init__.py          |  2 -
 torchtune/models/llama2/_model_builders.py   |  3 ++
 torchtune/models/llama3/__init__.py          |  2 -
 torchtune/models/mistral/_model_builders.py  |  2 +-
 torchtune/models/phi3/_sentencepiece.py      |  8 ++--
 7 files changed, 48 insertions(+), 19 deletions(-)

diff --git a/docs/source/api_ref_models.rst b/docs/source/api_ref_models.rst
index b5a8ba60ed..c1c047545f 100644
--- a/docs/source/api_ref_models.rst
+++ b/docs/source/api_ref_models.rst
@@ -11,9 +11,18 @@ llama3
 
 All models from the `Llama3 family `_.
 
+To download the Llama3-8B-Instruct model:
+
+.. code-block:: bash
+
+    tune download meta-llama/Meta-Llama-3-8B-Instruct --hf-token
+
+To download the Llama3-70B-Instruct model:
+
 .. code-block:: bash
 
-    tune download meta-llama/Meta-Llama-3-8B-Instruct --hf-token
+    tune download meta-llama/Meta-Llama-3-70B-Instruct --hf-token
+    --ignore-patterns "original/consolidated*"
 
 .. autosummary::
@@ -34,11 +43,23 @@ llama2
 
 All models from the `Llama2 family `_.
 
-Pre-trained models can be downloaded from the Hugging Face Hub with the following command:
+To download the Llama2-7B model:
+
+.. code-block:: bash
+
+    tune download meta-llama/Llama-2-7b-hf --hf-token
+
+To download the Llama2-13B model:
 
 .. code-block:: bash
 
-    tune download meta-llama/Llama-2-7b-hf --hf-token
+    tune download meta-llama/Llama-2-13b-hf --hf-token
+
+To download the Llama2-70B model:
+
+.. code-block:: bash
+
+    tune download meta-llama/Llama-2-70b-hf --hf-token
 
 .. autosummary::
     :toctree: generated/
@@ -61,11 +82,11 @@ code llama
 
 Models from the `Code Llama family `_.
 
-Pre-trained models can be downloaded from the Hugging Face Hub with the following command:
+To download the CodeLlama-7B model:
 
 .. code-block:: bash
 
-    tune download codellama/CodeLlama-7b-hf --hf-token
+    tune download codellama/CodeLlama-7b-hf --hf-token
 
 .. autosummary::
     :toctree: generated/
@@ -87,7 +108,7 @@ phi-3
 
 Models from the `Phi-3 mini family `_.
 
-Pre-trained models can be download from the Hugging Face Hub with the following command:
+To download the Phi-3 Mini 4k instruct model:
 
 .. code-block:: bash
@@ -108,11 +129,11 @@ mistral
 
 All models from `Mistral AI family `_.
 
-Pre-trained models can be downloaded from the Hugging Face Hub with the following command:
+To download the Mistral 7B v0.1 model:
 
 .. code-block:: bash
 
-    tune download mistralai/Mistral-7B-v0.1
+    tune download mistralai/Mistral-7B-v0.1 --hf-token
 
 .. autosummary::
     :toctree: generated/
@@ -132,11 +153,17 @@ gemma
 
 Models of size 2B and 7B from the `Gemma family `_.
 
-Pre-trained models can be downloaded from the Hugging Face Hub with the following command:
+To download the Gemma 2B model:
+
+.. code-block:: bash
+
+    tune download google/gemma-2b --hf-token --ignore-patterns ""
+
+To download the Gemma 7B model:
 
 .. code-block:: bash
 
-    tune download google/gemma-2b --hf-token --ignore-patterns ""
+    tune download google/gemma-7b --hf-token --ignore-patterns "gemma-7b.gguf"
 
 .. autosummary::
     :toctree: generated/
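The RST changes above pair each model family with a `tune download` command; the builders listed in the accompanying autosummary blocks are what consume those checkpoints. A minimal sketch of that relationship is shown below, assuming torchtune is installed and using `llama3_8b`, one of the documented builders; downloaded weights are loaded separately by the recipes and checkpointers, not by the builder itself.

.. code-block:: python

    # Sketch only: instantiate one of the builders documented in api_ref_models.rst.
    # This creates a randomly initialized Llama3-8B TransformerDecoder, so it
    # allocates real memory and is illustrative rather than something to run casually.
    from torchtune.models.llama3 import llama3_8b

    model = llama3_8b()
    print(f"{sum(p.numel() for p in model.parameters()):,} parameters")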
diff --git a/torchtune/models/code_llama2/_model_builders.py b/torchtune/models/code_llama2/_model_builders.py
index 47b17b724b..fca17367d8 100644
--- a/torchtune/models/code_llama2/_model_builders.py
+++ b/torchtune/models/code_llama2/_model_builders.py
@@ -59,6 +59,7 @@ def lora_code_llama2_7b(
             Default: False
         lora_rank (int): rank of each low-rank approximation
         lora_alpha (float): scaling factor for the low-rank approximation
+        lora_dropout (float): dropout probability for LoRA linear layers. Default: 0.05
         quantize_base (bool): Whether to quantize base model weights
 
     Returns:
@@ -139,6 +140,7 @@ def lora_code_llama2_13b(
             Default: False
         lora_rank (int): rank of each low-rank approximation
         lora_alpha (float): scaling factor for the low-rank approximation
+        lora_dropout (float): dropout probability for LoRA linear layers. Default: 0.05
         quantize_base (bool): Whether to quantize base model weights
 
     Returns:
@@ -220,6 +222,7 @@ def lora_code_llama2_70b(
             Default: False
         lora_rank (int): rank of each low-rank approximation
         lora_alpha (float): scaling factor for the low-rank approximation
+        lora_dropout (float): dropout probability for LoRA linear layers. Default: 0.05
         quantize_base (bool): Whether to quantize base model weights
 
     Returns:
diff --git a/torchtune/models/llama2/__init__.py b/torchtune/models/llama2/__init__.py
index e45227ec96..ccdf875fe1 100644
--- a/torchtune/models/llama2/__init__.py
+++ b/torchtune/models/llama2/__init__.py
@@ -18,7 +18,6 @@
     qlora_llama2_70b,
     qlora_llama2_7b,
 )
-from ._model_utils import scale_hidden_dim_for_mlp
 
 __all__ = [
     "llama2",
@@ -33,5 +32,4 @@
     "qlora_llama2_13b",
     "qlora_llama2_70b",
     "qlora_llama2_7b",
-    "scale_hidden_dim_for_mlp",
 ]
diff --git a/torchtune/models/llama2/_model_builders.py b/torchtune/models/llama2/_model_builders.py
index 459e872639..6cae9e62ea 100644
--- a/torchtune/models/llama2/_model_builders.py
+++ b/torchtune/models/llama2/_model_builders.py
@@ -83,6 +83,7 @@ def lora_llama2_7b(
         lora_rank (int): rank of each low-rank approximation
         lora_alpha (float): scaling factor for the low-rank approximation
         quantize_base (bool): Whether to quantize base model weights
+        lora_dropout (float): dropout probability for LoRA linear layers. Default: 0.05
 
     Returns:
         TransformerDecoder: Instantiation of Llama2 7B model with LoRA applied
@@ -162,6 +163,7 @@ def lora_llama2_13b(
             Default: False
         lora_rank (int): rank of each low-rank approximation
         lora_alpha (float): scaling factor for the low-rank approximation
+        lora_dropout (float): dropout probability for LoRA linear layers. Default: 0.05
         quantize_base (bool): Whether to quantize base model weights
 
     Returns:
@@ -243,6 +245,7 @@ def lora_llama2_70b(
             Default: False
         lora_rank (int): rank of each low-rank approximation
         lora_alpha (float): scaling factor for the low-rank approximation
+        lora_dropout (float): dropout probability for LoRA linear layers. Default: 0.05
         quantize_base (bool): Whether to quantize base model weights
 
     Returns:
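The docstring additions above document the existing `lora_dropout` argument on the Llama2 and Code-Llama2 LoRA builders. A hedged usage sketch follows, assuming torchtune is installed and the builder signatures match these docstrings; the values shown are the documented defaults, and the same keyword applies to the `lora_code_llama2_*` builders touched earlier in this patch.

.. code-block:: python

    # Sketch only: call a LoRA builder with the newly documented argument.
    # Building the full 7B model is memory-heavy; treat this as illustrative.
    from torchtune.models.llama2 import lora_llama2_7b

    model = lora_llama2_7b(
        lora_attn_modules=["q_proj", "v_proj"],  # attention projections that get LoRA adapters
        lora_rank=8,
        lora_alpha=16,
        lora_dropout=0.05,  # documented default per the docstrings above
    )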
diff --git a/torchtune/models/llama3/__init__.py b/torchtune/models/llama3/__init__.py
index 44b66eed8f..702c19383d 100644
--- a/torchtune/models/llama3/__init__.py
+++ b/torchtune/models/llama3/__init__.py
@@ -15,7 +15,6 @@
     qlora_llama3_70b,
     qlora_llama3_8b,
 )
-from ._model_utils import scale_hidden_dim_for_mlp
 
 __all__ = [
     "llama3",
@@ -27,5 +26,4 @@
     "lora_llama3_70b",
     "qlora_llama3_8b",
     "qlora_llama3_70b",
-    "scale_hidden_dim_for_mlp",
 ]
diff --git a/torchtune/models/mistral/_model_builders.py b/torchtune/models/mistral/_model_builders.py
index 3fb919fec6..891a969436 100644
--- a/torchtune/models/mistral/_model_builders.py
+++ b/torchtune/models/mistral/_model_builders.py
@@ -126,7 +126,7 @@ def mistral_classifier_7b() -> TransformerDecoder:
 
     Returns:
-        TransformerClassifier: Instantiation of Mistral 7B classifier model
+        TransformerDecoder: Instantiation of Mistral 7B classifier model
     """
     return mistral_classifier(
         num_classes=1,
diff --git a/torchtune/models/phi3/_sentencepiece.py b/torchtune/models/phi3/_sentencepiece.py
index 390afa4f39..d68c5143fd 100644
--- a/torchtune/models/phi3/_sentencepiece.py
+++ b/torchtune/models/phi3/_sentencepiece.py
@@ -136,13 +136,13 @@ def tokenize_messages(
                 Message(role="system", content="system message\n", masked=True),
                 Message(role="user", content="user prompt\n", masked=True),
                 Message(role="assistant", content="assistant response\n"),
-            ]
-            # tokenize_messages encodes messages separately and concats
+            ]
+
+            >>> # tokenize_messages encodes messages separately and concats
             >>> tokenizer.tokenize_messages(messages, max_seq_len)[0]
             [1, 1788, 2643, 13, 1792, 9508, 13, 465, 22137, 2933, 2]
-
-            # Same result as encoding the full string in one go
+            >>> # Same result as encoding the full string in one go
             >>> tokenizer.encode(''.join([message.content for message in messages]))
             [1, 1788, 2643, 13, 1792, 9508, 13, 465, 22137, 2933, 2]
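The mistral hunk above corrects the documented return type of `mistral_classifier_7b` to `TransformerDecoder`. A hedged check of that statement is sketched below, assuming torchtune is installed and the builder is exported from `torchtune.models.mistral` as its builder module suggests; constructing the full 7B classifier is memory-heavy, so this is illustrative rather than a lightweight test.

.. code-block:: python

    # Sketch only: the classifier builder returns a TransformerDecoder, as the
    # corrected docstring states; per the hunk above it is built with a
    # num_classes=1 output head rather than a separate classifier class.
    from torchtune.models.mistral import mistral_classifier_7b
    from torchtune.modules import TransformerDecoder

    model = mistral_classifier_7b()
    assert isinstance(model, TransformerDecoder)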