[bug] fix sharding multimodal #1889

Merged · 11 commits · Oct 24, 2024

Changes from 2 commits
11 changes: 7 additions & 4 deletions recipes/full_finetune_distributed.py
@@ -404,15 +404,18 @@ def _setup_model(
         # Shard transformer decoder layers (or AC-wrapped versions)
         # Alternatively we could condition on the module type (TransformerDecoder or CheckpointWrapper)
         # But directly using the name is more concise
-        def _is_layer_fqn(s: str) -> bool:
+        def _is_layer_name(name: str, module: nn.Module) -> bool:
             """
             Return True for layers.i and False for all other module names
             Covers sharding for both AC-wrapped and non-AC-wrapped modules in one shot
             """
-            s_list = s.split(".")
-            return len(s_list) == 2 and s_list[0] == "layers" and str.isdigit(s_list[1])
+            name_list = name.split(".")
[Review thread]

Reviewer:
1/ Should we not be using some sort of struct / dataclass to represent layer names consistently across the code so that such bugs can be prevented?
2/ Can we at least move this logic to a util that can be used across recipes rather than having the same logic in 4 places?

Contributor:
Yeah, agreed on both of these points. On the first one, I think the proper thing to do is to use the module rather than the layer name. Activation checkpointing makes this a bit of a headache because it modifies the module (so we can't just do something like isinstance(m, TransformerDecoderLayer)). So this string-split version was just a hack to handle both cases in one go and is (clearly, as evidenced by this very bug) not the scalable way to do it.

@felipemello1 (Contributor, Author) · Oct 23, 2024:
2/ 100%, I think we have other opportunities like this in our recipes to reduce tech debt. I will add the utility tomorrow morning.
1/ This is harder. If we had a robust way to test all of our flags with different models/recipes, that would be easy. But as it is now, I would have to test them manually.

+            if len(name_list) < 2:
+                return False
+            else:
+                return name_list[-2] == "layers" and str.isdigit(name_list[-1])
 
-        fsdp_shard_conditions = [lambda n, m: _is_layer_fqn(n)]
+        fsdp_shard_conditions = [_is_layer_name]
 
         # If wrapping any layers separately, we can add another shard condition
         # A layer will be sharded if any of the fsdp_shard_conditions are met
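
A small standalone sketch (not code from the PR) of why relaxing the check matters for multimodal sharding: fused multimodal models nest the decoder as a submodule, so transformer-layer FQNs carry a prefix rather than being a bare "layers.<i>". The example FQNs below are illustrative assumptions.

# Compare the old and new predicates on a few illustrative FQNs.
# "decoder.layers.3" stands in for the kind of nested name a fused multimodal
# model produces; the exact FQNs are assumptions, not copied from torchtune.

def old_is_layer_fqn(s: str) -> bool:
    s_list = s.split(".")
    return len(s_list) == 2 and s_list[0] == "layers" and str.isdigit(s_list[1])


def new_is_layer_name(name: str) -> bool:
    name_list = name.split(".")
    if len(name_list) < 2:
        return False
    return name_list[-2] == "layers" and str.isdigit(name_list[-1])


for fqn in ["layers.0", "decoder.layers.3", "layers", "output"]:
    print(fqn, "old:", old_is_layer_fqn(fqn), "new:", new_is_layer_name(fqn))

# Expected output:
# layers.0 old: True new: True
# decoder.layers.3 old: False new: True   <- the nested case this PR targets
# layers old: False new: False
# output old: False new: False
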
9 changes: 4 additions & 5 deletions recipes/lora_dpo_distributed.py
@@ -336,11 +336,10 @@ def _is_layer_name(name: str, module: nn.Module) -> bool:
             Covers sharding for both AC-wrapped and non-AC-wrapped modules in one shot
             """
             name_list = name.split(".")
-            return (
-                len(name_list) == 2
-                and name_list[0] == "layers"
-                and str.isdigit(name_list[1])
-            )
+            if len(name_list) < 2:
+                return False
+            else:
+                return name_list[-2] == "layers" and str.isdigit(name_list[-1])
 
         training.shard_model(
             model=model,
9 changes: 4 additions & 5 deletions recipes/lora_finetune_distributed.py
@@ -458,11 +458,10 @@ def _is_layer_name(name: str, module: nn.Module) -> bool:
             Covers sharding for both AC-wrapped and non-AC-wrapped modules in one shot
             """
             name_list = name.split(".")
-            return (
-                len(name_list) == 2
-                and name_list[0] == "layers"
-                and str.isdigit(name_list[1])
-            )
+            if len(name_list) < 2:
+                return False
+            else:
+                return name_list[-2] == "layers" and str.isdigit(name_list[-1])
 
         training.shard_model(
             model=model,
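
The review thread on recipes/full_finetune_distributed.py above asked for this duplicated predicate to move into a shared utility, since the same logic now appears in several recipes. A minimal sketch of what such a helper could look like; the name and home module are assumptions, not part of this PR.

# Hypothetical shared helper, e.g. somewhere under torchtune/training/, that the
# distributed recipes could import instead of each defining _is_layer_name locally.
import torch.nn as nn


def is_transformer_layer(name: str, module: nn.Module) -> bool:
    """Return True for FQNs ending in 'layers.<i>' so that both AC-wrapped and
    non-AC-wrapped transformer layers are matched by the shard conditions."""
    parts = name.split(".")
    return len(parts) >= 2 and parts[-2] == "layers" and parts[-1].isdigit()


# Sketch of recipe usage (argument names follow the diffs in this PR):
#     fsdp_shard_conditions = [is_transformer_layer]
#     training.shard_model(model=model, shard_conditions=fsdp_shard_conditions, ...)

A module-type-based condition, as suggested in the thread, would additionally have to look through the activation-checkpoint wrapper to recover the original layer class; that variant is not sketched here.
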
37 changes: 22 additions & 15 deletions recipes/qat_distributed.py
@@ -233,9 +233,11 @@ def setup(self, cfg: DictConfig) -> None:
 
         self._optimizer = self._setup_optimizer(
             cfg_optimizer=cfg.optimizer,
-            opt_state_dict=checkpoint_dict[training.OPT_KEY]
-            if self._resume_from_checkpoint
-            else None,
+            opt_state_dict=(
+                checkpoint_dict[training.OPT_KEY]
+                if self._resume_from_checkpoint
+                else None
+            ),
[Comment from @felipemello1 (Contributor, Author) on lines +236 to +240]: pre-commit hook

         )
 
         # initialize loss
@@ -428,15 +430,18 @@ def _setup_model(
         # Shard transformer decoder layers (or AC-wrapped versions)
         # Alternatively we could condition on the module type (TransformerDecoder or CheckpointWrapper)
         # But directly using the name is more concise
-        def _is_layer_fqn(s: str) -> bool:
+        def _is_layer_name(name: str, module: nn.Module) -> bool:
             """
             Return True for layers.i and False for all other module names
             Covers sharding for both AC-wrapped and non-AC-wrapped modules in one shot
             """
-            s_list = s.split(".")
-            return len(s_list) == 2 and s_list[0] == "layers" and str.isdigit(s_list[1])
+            name_list = name.split(".")
+            if len(name_list) < 2:
+                return False
+            else:
+                return name_list[-2] == "layers" and str.isdigit(name_list[-1])
 
-        fsdp_shard_conditions = [lambda n, m: _is_layer_fqn(n)]
+        fsdp_shard_conditions = [_is_layer_name]
 
         # If wrapping any layers separately, we can add another shard condition
         # A layer will be sharded if any of the fsdp_shard_conditions are met
@@ -525,14 +530,16 @@ def _setup_data(
             sampler=sampler,
             # dropping last avoids shape issues with compile + flex attention
             drop_last=True,
-            collate_fn=partial(
-                padded_collate_sft,
-                padding_idx=self._tokenizer.pad_id,
-                ignore_idx=self._loss_fn.ignore_index,
-            )
-            if not packed
-            else partial(
-                padded_collate_packed,
+            collate_fn=(
+                partial(
+                    padded_collate_sft,
+                    padding_idx=self._tokenizer.pad_id,
+                    ignore_idx=self._loss_fn.ignore_index,
+                )
+                if not packed
+                else partial(
+                    padded_collate_packed,
+                )
[Comment from @felipemello1 (Contributor, Author) on lines +514 to +523]: pre-commit hook

Contributor:
Yeah, what is the deal with these? It seems like collate_fn gets updated on every PR. We need to figure out what's going on with our linter that's causing this to keep happening.
             ),
         )

3 changes: 3 additions & 0 deletions torchtune/__init__.py
@@ -12,6 +12,9 @@
 # We have to do this because it is not currently possible to
 # properly support both nightly and stable installs of PyTorch + torchao
 # in pyproject.toml.
+import torch
+
+torch.backends.cuda.enable_cudnn_sdp(False)
 try:
     import torchao  # noqa
 except ImportError as e:
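
For context on the new import-time call (this note and snippet are not from the PR discussion): torch.backends.cuda.enable_cudnn_sdp(False) disables the cuDNN backend for scaled dot-product attention globally. A minimal sketch, assuming a recent PyTorch (roughly 2.4+) and an available CUDA device, of how to inspect the toggle and of a more narrowly scoped alternative:

import torch
from torch.nn.attention import SDPBackend, sdpa_kernel

# Global toggle, as torchtune now does at import time; the query call below
# reports the current state.
torch.backends.cuda.enable_cudnn_sdp(False)
print(torch.backends.cuda.cudnn_sdp_enabled())  # False

# Scoped alternative: restrict SDPA to these backends only inside the context,
# leaving global settings untouched.
q = k = v = torch.randn(1, 8, 128, 64, device="cuda", dtype=torch.bfloat16)
with sdpa_kernel([SDPBackend.FLASH_ATTENTION, SDPBackend.EFFICIENT_ATTENTION, SDPBackend.MATH]):
    out = torch.nn.functional.scaled_dot_product_attention(q, k, v)
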
9 changes: 9 additions & 0 deletions torchtune/training/_distributed.py
@@ -608,16 +608,25 @@ def shard_model(
             the forward pass. Setting this to True corresponds to the FULL_SHARD sharding strategy
             from FSDP1, while setting it to False corresponds to the SHARD_GRAD_OP sharding strategy.
 
+    Raises:
[Review comment] Contributor:
Can we add a follow-up task to write a unit test for this function? It's pretty straightforward to test, and given how heavily we leverage it, I don't love that it's currently untested.
+        ValueError: If no layer modules were sharded. Please check if shard conditions is working as expected.
     """
     fsdp_kwargs = {"reshard_after_forward": reshard_after_forward}
     if cpu_offload:
         fsdp_kwargs["offload_policy"] = CPUOffloadPolicy()
 
     # Shard the model with FSDP, iterating in reverse to start with
     # lowest-level modules first
+    num_layers_sharded = 0
     for n, m in reversed(list(model.named_modules())):
         if any([shard_condition(n, m) for shard_condition in shard_conditions]):
             fully_shard(m, **fsdp_kwargs)
+            num_layers_sharded += 1
 
+    if num_layers_sharded == 0:
+        raise ValueError(
+            "No layer modules were sharded. Please check if shard conditions is working as expected."
+        )
 
     # Finally shard the entire model to account for any stragglers
     fully_shard(model, **fsdp_kwargs)
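
Following up on the reviewer's request for a unit test, a minimal sketch of what one could look like. It assumes shard_model's keyword arguments match the diff above and monkeypatches the module-level fully_shard so no process group is needed; this is a sketch, not a test that exists in the repo.

import pytest
import torch.nn as nn

import torchtune.training._distributed as dist_mod


def test_shard_model_raises_when_no_condition_matches(monkeypatch):
    # Stub out FSDP2's fully_shard so the test runs without torch.distributed init.
    monkeypatch.setattr(dist_mod, "fully_shard", lambda module, **kwargs: module)

    model = nn.Sequential(nn.Linear(4, 4), nn.Linear(4, 4))

    # A shard condition that never matches should trigger the new ValueError.
    with pytest.raises(ValueError, match="No layer modules were sharded"):
        dist_mod.shard_model(
            model=model,
            shard_conditions=[lambda name, module: False],
            cpu_offload=False,
            reshard_after_forward=True,
        )
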