chore: remove PyTorch 2.5.0 checks #1877

Open
wants to merge 1 commit into base: main
2 changes: 1 addition & 1 deletion docs/source/tutorials/memory_optimizations.rst
@@ -108,7 +108,7 @@ checkpointing, where all activations will either be recomputed later in the back

To enable activation offloading, use the ``enable_activation_offloading`` config entry or flag
in our lora finetuning single device recipe, e.g. ``enable_activation_offloading=True``. To allow
usage of streams, make sure you are on a torch version later than PyTorch 2.5.0.dev20240907.
usage of streams, make sure you are on a torch version equal to or later than PyTorch.
Contributor

Suggested change
usage of streams, make sure you are on a torch version equal to or later than PyTorch.
usage of streams, make sure you are on a torch version equal to or later than PyTorch 2.5.0.


.. _glossary_grad_accm:

6 changes: 3 additions & 3 deletions recipes/lora_finetune_distributed.py
@@ -74,9 +74,9 @@ class LoRAFinetuneRecipeDistributed(FTRecipeInterface):
back during the backward pass. As always, there is a tradeoff--these savings in memory can
come at the cost of training performance and CPU resources. To recover some runtime cost,
we've added an option to enable offloading on a different stream to permit overlapping with
the computation. This option is currently only available on PyTorch nightly 2.5.0.dev20240907
or later and will be enabled by default if an acceptable torch version is found. Activation
offloading can be used in conjunction with activation checkpointing.
the computation. This option is currently only available on PyTorch 2.5.0 or later and will be
enabled by default if an acceptable torch version is found. Activation offloading can be used in
conjunction with activation checkpointing.

- Precision. Full fp32 and bf16 training are supported. Precision is controlled using the ``dtype``
flag. When ``dtype=bf16``, all activations, gradients and optimizer states are in bfloat16. In
4 changes: 0 additions & 4 deletions tests/torchtune/modules/test_attention_utils.py
@@ -82,10 +82,6 @@ def test_packed_block_causal_mask_sdpa(self, seq_lens):
)
torch.testing.assert_close(actual, expected)

@pytest.mark.skipif(
not _SUPPORTS_FLEX_ATTENTION,
reason="Please install a nightly build of torch (>=2.5.0) to run this test.",
)
@gpu_test(gpu_count=1)
def test_packed_block_causal_mask_flex(self):
# create_block_mask requires that seq_len be divisible by 128, the default block size.
6 changes: 3 additions & 3 deletions torchtune/modules/attention_utils.py
@@ -115,9 +115,9 @@ def packed_block_causal_mask(
seq_lens: List[torch.Tensor],
) -> _MaskType:
"""
Create a block causal document mask for a batch of packed sequences. If on
torch version >= 2.5.0, this is done by creating a mask_mod function with the
block causal logic and passing this into :func:`torch.nn.attention.flex_attention.create_block_mask`.
Create a block causal document mask for a batch of packed sequences. If
flex attention is supported by the current hardware, this is done by creating a mask_mod
function with the block causal logic and passing it into :func:`torch.nn.attention.flex_attention.create_block_mask`.
The resultant BlockMask is a compressed representation of the full block causal
mask. If on an older version, a standard 2D block causal mask is created and returned.
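
For reference, a minimal sketch of the flex-attention path this docstring describes, assuming a CUDA device and an illustrative document_ids tensor (the helper names and shapes here are illustrative, not torchtune's internals):

import torch
from torch.nn.attention.flex_attention import create_block_mask

# Illustrative packed batch: one sample of length 256 holding two documents of 128 tokens each.
seq_len = 256
document_ids = torch.zeros(1, seq_len, dtype=torch.long, device="cuda")
document_ids[:, 128:] = 1

def block_causal_mask_mod(b, h, q_idx, kv_idx):
    # Allow attention only to earlier (or same) positions within the same document.
    causal = q_idx >= kv_idx
    same_doc = document_ids[b, q_idx] == document_ids[b, kv_idx]
    return causal & same_doc

# The resulting BlockMask is the compressed representation mentioned above; seq_len must be
# divisible by create_block_mask's default block size of 128.
block_mask = create_block_mask(
    block_causal_mask_mod, B=1, H=None, Q_LEN=seq_len, KV_LEN=seq_len, device="cuda"
)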
6 changes: 1 addition & 5 deletions torchtune/modules/common_utils.py
@@ -149,11 +149,7 @@ def _register_reparametrize_state_dict_hooks(
RuntimeError: If the low RAM reparametrize hook is used on Windows or an incompatible torch version.
"""
if _use_low_cpu_ram:
if torch.__version__ < "2.5.0.dev20240906":
raise RuntimeError(
"Low RAM reparametrize_as_dtype_state_dict_post_hook requires PyTorch 2.5.0.dev20240906 or later."
)
elif sys.platform == "win32":
if sys.platform == "win32":
# mmap.MAP_SHARED is not supported on Windows but this change targets colab.
raise RuntimeError(
"Low RAM reparametrize_as_dtype_state_dict_post_hook is not supported on Windows."
4 changes: 2 additions & 2 deletions torchtune/training/_activation_offloading.py
@@ -33,7 +33,7 @@ class OffloadActivations(saved_tensors_hooks):
use_streams (Optional[bool]): Whether or not to use streams for performance optimization where
the communications get overlapped with the computation. Requires a torch build
after torch-2.5.0.dev20240907. Default: True if a later torch build is found, else False.
after torch-2.5.0.]. Default: True.
Contributor

Suggested change
after torch-2.5.0.]. Default: True.
after torch-2.5.0. Default: True.

max_fwd_stash_size (int): The maximum size of the forward stash, or the maximum number of
consecutive activations to keep alive during the forward pass. This number must be at
@@ -60,7 +60,7 @@ class OffloadActivations(saved_tensors_hooks):
def __init__(
self,
use_pin_memory: bool = True,
use_streams: Optional[bool] = None,
use_streams: Optional[bool] = True,
max_fwd_stash_size: int = 5,
min_offload_size: int = 1024,
) -> None:
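
A minimal usage sketch for this context manager, assuming OffloadActivations is importable from torchtune.training and a CUDA device is available (the model and tensor sizes below are illustrative):

import torch
from torchtune.training import OffloadActivations

model = torch.nn.Linear(4096, 4096, device="cuda")
inputs = torch.randn(8, 4096, device="cuda")

# Activations saved for backward are moved to CPU during the forward pass; with
# use_streams=True the copies overlap with compute on a separate CUDA stream.
with OffloadActivations(use_streams=True, max_fwd_stash_size=5, min_offload_size=1024):
    out = model(inputs)

out.sum().backward()  # offloaded activations are copied back to GPU as needed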
28 changes: 13 additions & 15 deletions torchtune/training/_compile.py
@@ -42,23 +42,21 @@ def compile_model(
backend = os.environ.get("TORCH_COMPILE_BACKEND", "inductor")
if isinstance(model, DeepFusionModel):
model = model.decoder
if torch_version_ge("2.5.0"):
if verbose:
log.info("Compiling model layers with torch.compile...")
for m in reversed(list(model.modules())):
if isinstance(m, TransformerSelfAttentionLayer) or isinstance(
m, TransformerCrossAttentionLayer
):
m.compile(backend=backend)
else:
# Per-layer compilation by default
if verbose:
log.info("Compiling model layers with torch.compile...")
for m in reversed(list(model.modules())):
if isinstance(m, TransformerSelfAttentionLayer) or isinstance(
m, TransformerCrossAttentionLayer
):
m.compile(backend=backend)
# Fallback for models that can't be per-layer compiled
if not torch_version_ge("2.5.0"):
if verbose:
log.info(
"""
Compiling full model with torch.compile...
For faster compile times via per-layer compile, please run on PyTorch nightlies.
"""
log.warning(
Contributor

I'm not sure if we want to retain the fallback logic for older pytorch versions. If so, then the if-else should remain the same and only the warning message should be updated. any thoughts? @ebsmothers @felipemello1

Contributor

Sorry just seeing this now. I think if we claim to not support PyTorch < 2.5 then we shouldn't leave in the full-model compile option at all. For the same reason I'm ambivalent about leaving in the log warning.. really if we want to check someone is at least on the latest stable PyTorch we should just do it in a single consolidated place. So not the end of the world to keep the warning in, but personally I'd just take it out.

"Per-layer compilation may not be fully optimized in PyTorch versions < 2.5.0. "
"Consider upgrading for improved performance."
)
model.compile(backend=backend)
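
A usage sketch, assuming compile_model is re-exported from torchtune.training as in the recipes (the model constructor here is a labeled placeholder, not a torchtune API):

import os
from torchtune.training import compile_model

# Optional override of the compile backend read at the top of this function.
os.environ["TORCH_COMPILE_BACKEND"] = "inductor"

model = build_model()  # placeholder for e.g. a torchtune TransformerDecoder builder
compile_model(model, verbose=True)  # compiles each (cross-)attention layer in place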


def compile_loss(loss: nn.Module, verbose: bool = True) -> None:
5 changes: 2 additions & 3 deletions torchtune/utils/_import_guard.py
@@ -6,12 +6,11 @@

import torch
import torchao
from torchtune.utils._version import _is_fbcode, _nightly_version_ge, torch_version_ge
from torchtune.utils._version import _is_fbcode, _nightly_version_ge

# We can only use flex attention / BlockMask if torch version >= 2.5.0 and GPU is Turing / SM75 and above
_SUPPORTS_FLEX_ATTENTION = (
torch_version_ge("2.5.0")
and torch.cuda.is_available()
torch.cuda.is_available()
and torch.cuda.get_device_capability() >= (7, 5)
)
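
For illustration, a hedged sketch of how a hardware-only guard like this could be consumed in tests (echoing the skipif shown in test_attention_utils.py above; the test body is a placeholder):

import pytest
from torchtune.utils._import_guard import _SUPPORTS_FLEX_ATTENTION

@pytest.mark.skipif(
    not _SUPPORTS_FLEX_ATTENTION,
    reason="Flex attention requires CUDA and compute capability >= 7.5 (Turing or newer).",
)
def test_flex_attention_path():
    ...  # placeholder: exercise the BlockMask / flex attention code path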
