update models docs (pytorch#1167)
ebsmothers authored and maximegmd committed Jul 13, 2024
1 parent 7c5369a commit 40b356f
Showing 14 changed files with 78 additions and 47 deletions.
47 changes: 37 additions & 10 deletions docs/source/api_ref_models.rst
@@ -11,9 +11,18 @@ llama3

All models from the `Llama3 family <https://llama.meta.com/llama3/>`_.

To download the Llama3-8B-Instruct model:

.. code-block:: bash
tune download meta-llama/Meta-Llama-3-8B-Instruct --hf-token <HF_TOKEN>
To download the Llama3-70B-Instruct model:

.. code-block:: bash
tune download meta-llama/Meta-Llama-3-8B-Instruct --hf-token <ACCESS_TOKEN>
tune download meta-llama/Meta-Llama-3-70B-Instruct --hf-token <HF_TOKEN>
--ignore-patterns "original/consolidated*"
.. autosummary::
@@ -35,11 +44,23 @@ llama2

All models from the `Llama2 family <https://llama.meta.com/llama2/>`_.

Pre-trained models can be downloaded from the Hugging Face Hub with the following command:
To download the Llama2-7B model:

.. code-block:: bash
tune download meta-llama/Llama-2-7b-hf --hf-token <HF_TOKEN>
To download the Llama2-13B model:

.. code-block:: bash
tune download meta-llama/Llama-2-7b-hf --hf-token <ACCESS_TOKEN>
tune download meta-llama/Llama-2-13b-hf --hf-token <HF_TOKEN>
To download the Llama2-70B model:

.. code-block:: bash
tune download meta-llama/Llama-2-70b-hf --hf-token <HF_TOKEN>
.. autosummary::
:toctree: generated/
@@ -63,11 +84,11 @@ code llama

Models from the `Code Llama family <https://arxiv.org/pdf/2308.12950>`_.

Pre-trained models can be downloaded from the Hugging Face Hub with the following command:
To download the CodeLlama-7B model:

.. code-block:: bash
tune download codellama/CodeLlama-7b-hf --hf-token <ACCESS_TOKEN>
tune download codellama/CodeLlama-7b-hf --hf-token <HF_TOKEN>
.. autosummary::
:toctree: generated/
@@ -89,7 +110,7 @@ phi-3

Models from the `Phi-3 mini family <https://news.microsoft.com/source/features/ai/the-phi-3-small-language-models-with-big-potential/>`_.

Pre-trained models can be download from the Hugging Face Hub with the following command:
To download the Phi-3 Mini 4k instruct model:

.. code-block:: bash
@@ -111,11 +132,11 @@ mistral

All models from `Mistral AI family <https://mistral.ai/technology/#models>`_.

Pre-trained models can be downloaded from the Hugging Face Hub with the following command:
To download the Mistral 7B v0.1 model:

.. code-block:: bash
tune download mistralai/Mistral-7B-v0.1
tune download mistralai/Mistral-7B-v0.1 --hf-token <HF_TOKEN>
.. autosummary::
:toctree: generated/
@@ -136,11 +157,17 @@ gemma

Models of size 2B and 7B from the `Gemma family <https://blog.google/technology/developers/gemma-open-models/>`_.

Pre-trained models can be downloaded from the Hugging Face Hub with the following command:
To download the Gemma 2B model:

.. code-block:: bash
tune download google/gemma-2b --hf-token <HF_TOKEN> --ignore-patterns ""
To download the Gemma 7B model:

.. code-block:: bash
tune download google/gemma-2b --hf-token <ACCESS_TOKEN> --ignore-patterns ""
tune download google/gemma-7b --hf-token <HF_TOKEN> --ignore-patterns "gemma-7b.gguf"
.. autosummary::
:toctree: generated/
2 changes: 0 additions & 2 deletions torchtune/models/clip/__init__.py
@@ -6,7 +6,6 @@

from ._component_builders import clip_vision_encoder

from ._model_builders import clip_vit_224_transform # noqa
from ._position_embeddings import (
TiledTokenPositionalEmbedding,
TilePositionalEmbedding,
@@ -18,5 +17,4 @@
"TokenPositionalEmbedding",
"TiledTokenPositionalEmbedding",
"TilePositionalEmbedding",
"clip_vit_224_transform",
]
14 changes: 7 additions & 7 deletions torchtune/models/clip/_component_builders.py
@@ -23,32 +23,32 @@ def clip_vision_encoder(
"""
Builds the vision encoder associated with the clip model. This includes:
- TransformerEncoderLayer
- num_layers TransformerEncoderLayers
- positional embeddings
- CLS projection (optional)
For details, please check the documentation of
:class:`torchtune.modules.vision_transformer.VisionTransformer`.
Args:
tile_size (int): The size of your image tiles, if the image was tile-cropped in advance. Otherwise,
the size of the input image. In this case, the function will consider your image as a single tile.
patch_size (int): The size of each patch. Used to divide the tiles into patches.
E.g. for ``patch_size=40``, a tile of shape (400, 400) will have 10x10 grid of patches
with shape (40, 40) each.
embed_dim (int): The dimensionality of each patch embedding (token).
num_layers (int): The number of transformer layers.
num_heads (int): The number of attention heads in each transformer layer.
cls_output_dim (int): The dimensionality of the output tensor from the CLS projection module.
out_indices (Optional[List[int]]): The indices of hidden layers to return.
If provided, it will return the intermediate results of the transformer layers
before they go through a next layer. For example, ``out_indices=[0,3]`` will
return the tokens before they go through the first and fourth layers.
output_cls_projection (bool): If True, only the CLS token projection will be outputted,
instead of all tokens. Defaults to False.
tile_size (int): The size of your image tiles, if the image was tile-cropped in advance. Otherwise,
the size of the input image. In this case, the function will consider your image as a single tile.
patch_size (int): The size of each patch. Used to divide the tiles into patches.
E.g. for ``patch_size=40``, a tile of shape (400, 400) will have 10x10 grid of patches
with shape (40, 40) each.
max_num_tiles (int): The maximum number of tiles that can be processed. This is used to
determine the size of the positional embeddings.
in_channels (int): The number of image input channels.
cls_output_dim (int): The dimensionality of the output tensor from the CLS projection module.
Returns:
A `VisionTransformer` object.
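For reference, a minimal usage sketch of the builder documented above. Only the argument names listed in the docstring diff are used; the values are illustrative assumptions, and any other arguments of the builder are left at their defaults.

    # Hedged sketch: values are illustrative, not a recommended configuration.
    from torchtune.models.clip import clip_vision_encoder

    encoder = clip_vision_encoder(
        tile_size=224,                # tile (or full image) side length in pixels
        patch_size=14,                # a 224x224 tile becomes a 16x16 grid of 14x14 patches
        embed_dim=768,                # dimensionality of each patch embedding (token)
        num_layers=12,                # number of transformer layers
        num_heads=12,                 # attention heads per transformer layer
        cls_output_dim=512,           # output size of the CLS projection module
        out_indices=[3, 7, 11],       # also return hidden states before these layers
        output_cls_projection=False,  # return all tokens, not only the CLS projection
        max_num_tiles=4,              # sizes the tiled positional embeddings
        in_channels=3,                # RGB input
    )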
3 changes: 1 addition & 2 deletions torchtune/models/clip/_model_builders.py
@@ -1,7 +1,6 @@
from torchtune.models.clip._transforms import CLIPImageTransform

def clip_vit_224_transform():

def _clip_vit_224_transform():
image_transform = CLIPImageTransform(
image_mean=[0.48145466, 0.4578275, 0.40821073],
image_std=[0.26862954, 0.26130258, 0.27577711],
7 changes: 4 additions & 3 deletions torchtune/models/clip/_position_embeddings.py
@@ -49,11 +49,11 @@ def forward(self, x: torch.Tensor, *args) -> torch.Tensor:

class TiledTokenPositionalEmbedding(nn.Module):
"""
Token positional embedding for tiled images. There are two positional embeddings in this module:
* local_token_positional_embedding: same for every tile, different for every token. Equivalent
* local_token_positional_embedding: same for every tile, different for every token. Equivalent \
to :class:`torchtune.models.clip._position_embeddings.TokenPositionalEmbedding`, but gated.
* global_token_positional_embedding: different for every tile, different for every token.
Notice that tile is different from patch (token). For details, please check the documentation of
@@ -101,7 +101,8 @@ def forward(self, x: torch.Tensor, aspect_ratio: torch.Tensor) -> torch.Tensor:
Args:
x (torch.Tensor): Tensor with shape (bsz * n_imgs, n_tiles, n_tokens, embed_dim).
aspect_ratio (torch.Tensor): Tensor with shape (bsz * n_imgs, 2),
representing the aspect ratio of the image before tile-cropping, e.g. (2,1).
where aspect_ratio[k] represents the aspect ratio of the k^th image
of the batch before tile-cropping, e.g. aspect_ratio[k] = (2,1).
Returns:
torch.Tensor: The input tensor with added positional embeddings.
"""
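To make the documented shapes concrete, a small sketch of the forward() contract described above; the constructor arguments are assumptions inferred from the surrounding docs rather than a verified signature.

    # Hedged sketch: constructor arguments are assumed; forward() shapes follow the docstring.
    import torch
    from torchtune.models.clip import TiledTokenPositionalEmbedding

    tile_size, patch_size, embed_dim, max_num_tiles = 224, 32, 512, 4
    n_tokens = (tile_size // patch_size) ** 2 + 1  # patches per tile plus the CLS token
    bsz_x_n_imgs, n_tiles = 2, 4

    pos_emb = TiledTokenPositionalEmbedding(
        max_num_tiles=max_num_tiles,
        embed_dim=embed_dim,
        tile_size=tile_size,
        patch_size=patch_size,
    )

    x = torch.randn(bsz_x_n_imgs, n_tiles, n_tokens, embed_dim)
    # aspect_ratio[k] = (rows, cols) of tiles for the k-th image before tile-cropping
    aspect_ratio = torch.tensor([[2, 2], [2, 2]])

    out = pos_emb(x, aspect_ratio)  # same shape as x, with positional embeddings added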
3 changes: 3 additions & 0 deletions torchtune/models/code_llama2/_model_builders.py
@@ -59,6 +59,7 @@ def lora_code_llama2_7b(
Default: False
lora_rank (int): rank of each low-rank approximation
lora_alpha (float): scaling factor for the low-rank approximation
lora_dropout (float): dropout probability for LoRA linear layers. Default: 0.05
quantize_base (bool): Whether to quantize base model weights
Returns:
@@ -139,6 +140,7 @@ def lora_code_llama2_13b(
Default: False
lora_rank (int): rank of each low-rank approximation
lora_alpha (float): scaling factor for the low-rank approximation
lora_dropout (float): dropout probability for LoRA linear layers. Default: 0.05
quantize_base (bool): Whether to quantize base model weights
Returns:
@@ -220,6 +222,7 @@ def lora_code_llama2_70b(
Default: False
lora_rank (int): rank of each low-rank approximation
lora_alpha (float): scaling factor for the low-rank approximation
lora_dropout (float): dropout probability for LoRA linear layers. Default: 0.05
quantize_base (bool): Whether to quantize base model weights
Returns:
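A hedged usage sketch for the LoRA builders whose docstrings gain lora_dropout above. Only lora_rank, lora_alpha, lora_dropout, and quantize_base are confirmed by the diff; the other argument names are assumptions about the builder's signature.

    # Hedged sketch: lora_attn_modules is an assumed argument name; the rest follow the docstring.
    from torchtune.models.code_llama2 import lora_code_llama2_7b

    model = lora_code_llama2_7b(
        lora_attn_modules=["q_proj", "v_proj"],  # assumed: attention projections that get LoRA
        lora_rank=8,          # rank of each low-rank approximation
        lora_alpha=16.0,      # scaling factor for the low-rank approximation
        lora_dropout=0.05,    # dropout probability for LoRA linear layers (documented default)
        quantize_base=False,  # whether to quantize base model weights
    )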
5 changes: 3 additions & 2 deletions torchtune/models/gemma/_tokenizer.py
@@ -92,12 +92,13 @@ def tokenize_messages(
Message(role="user", content="user prompt\n", masked=True),
Message(role="assistant", content="assistant response\n"),
]
# tokenize_messages encodes messages separately and concats
>>> # tokenize_messages encodes messages separately and concats
>>> tokenizer.tokenize_messages(messages, max_seq_len)[0]
[1, 1788, 2643, 13, 1792, 9508, 13, 465, 22137, 2933, 2]
# Same result as encoding the full string in one go
>>> # Same result as encoding the full string in one go
>>> tokenizer.encode(''.join([message.content for message in messages]))
[1, 1788, 2643, 13, 1792, 9508, 13, 465, 22137, 2933, 2]
2 changes: 0 additions & 2 deletions torchtune/models/llama2/__init__.py
@@ -18,7 +18,6 @@
qlora_llama2_70b,
qlora_llama2_7b,
)
from ._model_utils import scale_hidden_dim_for_mlp
from ._tokenizer import Llama2Tokenizer

__all__ = [
@@ -35,5 +34,4 @@
"qlora_llama2_13b",
"qlora_llama2_70b",
"qlora_llama2_7b",
"scale_hidden_dim_for_mlp",
]
3 changes: 3 additions & 0 deletions torchtune/models/llama2/_model_builders.py
@@ -79,6 +79,7 @@ def lora_llama2_7b(
lora_rank (int): rank of each low-rank approximation
lora_alpha (float): scaling factor for the low-rank approximation
quantize_base (bool): Whether to quantize base model weights
lora_dropout (float): dropout probability for LoRA linear layers. Default: 0.05
Returns:
TransformerDecoder: Instantiation of Llama2 7B model with LoRA applied
@@ -158,6 +159,7 @@ def lora_llama2_13b(
Default: False
lora_rank (int): rank of each low-rank approximation
lora_alpha (float): scaling factor for the low-rank approximation
lora_dropout (float): dropout probability for LoRA linear layers. Default: 0.05
quantize_base (bool): Whether to quantize base model weights
Returns:
@@ -239,6 +241,7 @@ def lora_llama2_70b(
Default: False
lora_rank (int): rank of each low-rank approximation
lora_alpha (float): scaling factor for the low-rank approximation
lora_dropout (float): dropout probability for LoRA linear layers. Default: 0.05
quantize_base (bool): Whether to quantize base model weights
Returns:
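The llama2 builders above gain the same lora_dropout documentation. As a hedged sketch, the qlora_* builders exported in the llama2 __init__ diff earlier in this commit are assumed to be the corresponding lora_* builders with quantize_base=True preset, so a QLoRA model can be built either way:

    # Hedged sketch: the equivalence below and the lora_attn_modules argument name are assumptions.
    from torchtune.models.llama2 import lora_llama2_7b, qlora_llama2_7b

    qlora_model = lora_llama2_7b(
        lora_attn_modules=["q_proj", "v_proj"],
        lora_rank=8,
        lora_alpha=16.0,
        lora_dropout=0.05,
        quantize_base=True,  # quantize base model weights
    )

    # Assumed equivalent: the qlora_ variant presets quantize_base=True.
    qlora_model_alt = qlora_llama2_7b(
        lora_attn_modules=["q_proj", "v_proj"],
        lora_rank=8,
        lora_alpha=16.0,
        lora_dropout=0.05,
    )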
15 changes: 8 additions & 7 deletions torchtune/models/llama2/_tokenizer.py
@@ -92,10 +92,11 @@ def tokenize_messages(
r"""Tokenize a list of messages one at a time then concatenate them,
returning a list of tokens and a list of masks.
Note: llama2 sentencepiece has problems where in general
encode(s1 + s2) != encode(s1) + encode(s2) due to whitespace handling.
We can get around this by prepending s2 with a known token and slicing the
beginning off the tokenized s2.
Note:
sentencepiece has problems where in general
encode(s1 + s2) != encode(s1) + encode(s2) due to whitespace handling.
We can get around this by prepending s2 with a known token and slicing the
beginning off the tokenized s2.
Example:
>>> tokenizer = Llama2Tokenizer(tokenizer_path)
@@ -104,12 +105,12 @@
Message(role="user", content="user prompt\n", masked=True),
Message(role="assistant", content="assistant response\n"),
]
# tokenize_messages encodes messages separately and concats
>>> # tokenize_messages encodes messages separately and concats
>>> tokenizer.tokenize_messages(messages, max_seq_len)[0]
[1, 1788, 2643, 13, 1792, 9508, 13, 465, 22137, 2933, 2]
# Same result as encoding the full string in one go
>>> # Same result as encoding the full string in one go
>>> tokenizer.encode(''.join([message.content for message in messages]))
[1, 1788, 2643, 13, 1792, 9508, 13, 465, 22137, 2933, 2]
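The reworded note above describes a general sentencepiece pitfall. Below is a self-contained sketch of the prepend-and-slice workaround it refers to; the model path and the choice of "\n" as the known prefix are assumptions for illustration only.

    # Hedged sketch of the prepend-a-known-token-and-slice trick described above.
    from sentencepiece import SentencePieceProcessor

    spm = SentencePieceProcessor(model_file="tokenizer.model")  # assumed path

    def encode_continuation(s2: str, prefix: str = "\n") -> list:
        """Encode s2 as a continuation so that encode(s1) + encode_continuation(s2)
        approximates encode(s1 + s2) despite sentencepiece whitespace handling."""
        prefix_ids = spm.encode(prefix)
        # Encode the known prefix together with s2, then slice off the prefix's tokens.
        return spm.encode(prefix + s2)[len(prefix_ids):]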
2 changes: 0 additions & 2 deletions torchtune/models/llama3/__init__.py
@@ -15,7 +15,6 @@
qlora_llama3_70b,
qlora_llama3_8b,
)
from ._model_utils import scale_hidden_dim_for_mlp
from ._tokenizer import Llama3Tokenizer

__all__ = [
@@ -29,5 +28,4 @@
"lora_llama3_70b",
"qlora_llama3_8b",
"qlora_llama3_70b",
"scale_hidden_dim_for_mlp",
]
2 changes: 1 addition & 1 deletion torchtune/models/mistral/_model_builders.py
@@ -121,7 +121,7 @@ def mistral_classifier_7b() -> TransformerDecoder:
Returns:
TransformerClassifier: Instantiation of Mistral 7B classifier model
TransformerDecoder: Instantiation of Mistral 7B classifier model
"""
return mistral_classifier(
num_classes=1,
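For context on the return-type fix above, a hedged sketch, assuming mistral_classifier_7b is exported from torchtune.models.mistral:

    # Hedged sketch: the corrected docstring states the builder returns a
    # TransformerDecoder configured as a single-logit classifier (num_classes=1).
    from torchtune.models.mistral import mistral_classifier_7b
    from torchtune.modules import TransformerDecoder

    classifier = mistral_classifier_7b()
    assert isinstance(classifier, TransformerDecoder)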
14 changes: 8 additions & 6 deletions torchtune/models/mistral/_tokenizer.py
@@ -107,10 +107,11 @@ def tokenize_messages(
r"""Tokenize a list of messages one at a time then concatenate them,
returning a list of tokens and a list of masks.
Note: sentencepiece has problems where in general
encode(s1 + s2) != encode(s1) + encode(s2) due to whitespace handling.
We can get around this by prepending s2 with a known token and slicing the
beginning off the tokenized s2.
Note:
sentencepiece has problems where in general
encode(s1 + s2) != encode(s1) + encode(s2) due to whitespace handling.
We can get around this by prepending s2 with a known token and slicing the
beginning off the tokenized s2.
Example:
>>> tokenizer = MistralTokenizer(tokenizer_path)
@@ -119,12 +120,13 @@
Message(role="user", content="user prompt\n", masked=True),
Message(role="assistant", content="assistant response\n"),
]
# tokenize_messages encodes messages separately and concats
>>> # tokenize_messages encodes messages separately and concats
>>> tokenizer.tokenize_messages(messages, max_seq_len)[0]
[1, 1788, 2643, 13, 1792, 9508, 13, 465, 22137, 2933, 2]
# Same result as encoding the full string in one go
>>> # Same result as encoding the full string in one go
>>> tokenizer.encode(''.join([message.content for message in messages]))
[1, 1788, 2643, 13, 1792, 9508, 13, 465, 22137, 2933, 2]
6 changes: 3 additions & 3 deletions torchtune/models/phi3/_tokenizer.py
@@ -118,12 +118,12 @@ def tokenize_messages(
Message(role="user", content="user prompt\n", masked=True),
Message(role="assistant", content="assistant response\n"),
]
# tokenize_messages encodes messages separately and concats
>>> # tokenize_messages encodes messages separately and concats
>>> tokenizer.tokenize_messages(messages, max_seq_len)[0]
[1, 1788, 2643, 13, 1792, 9508, 13, 465, 22137, 2933, 2]
# Same result as encoding the full string in one go
>>> # Same result as encoding the full string in one go
>>> tokenizer.encode(''.join([message.content for message in messages]))
[1, 1788, 2643, 13, 1792, 9508, 13, 465, 22137, 2933, 2]
