diff --git a/docs/source/tutorials/memory_optimizations.rst b/docs/source/tutorials/memory_optimizations.rst
index 04644093a9..e2c82b2b1c 100644
--- a/docs/source/tutorials/memory_optimizations.rst
+++ b/docs/source/tutorials/memory_optimizations.rst
@@ -108,7 +108,7 @@ checkpointing, where all activations will either be recomputed later in the back

 To enable activation offloading, use the ``enable_activation_offloading`` config entry or flag
 in our lora finetuning single device recipe, e.g. ``enable_activation_offloading=True``. To allow
-usage of streams, make sure you are on a torch version later than PyTorch 2.5.0.dev20240907.
+usage of streams, make sure you are on a torch version equal to or later than PyTorch 2.5.0.

 .. _glossary_grad_accm:

diff --git a/recipes/lora_finetune_distributed.py b/recipes/lora_finetune_distributed.py
index 1569dfee63..383d7e84d5 100644
--- a/recipes/lora_finetune_distributed.py
+++ b/recipes/lora_finetune_distributed.py
@@ -74,9 +74,9 @@ class LoRAFinetuneRecipeDistributed(FTRecipeInterface):
       back during the backward pass. As always, there is a tradeoff--these savings in memory can come
       at the cost of training performance and CPU resources. To recover some runtime cost, we've
       added an option to enable offloading on a different stream to permit overlapping with
-      the computation. This option is currently only available on PyTorch nightly 2.5.0.dev20240907
-      or later and will be enabled by default if an acceptable torch version is found. Activation
-      offloading can be used in conjunction with activation checkpointing.
+      the computation. This option is currently only available on PyTorch 2.5.0 or later and is
+      enabled by default. Activation offloading can be used in conjunction with activation
+      checkpointing.

     - Precision. Full fp32 and bf16 training are supported. Precision is controlled using the ``dtype``
       flag. When ``dtype=bf16``, all activations, gradients and optimizer states are in bfloat16. In
diff --git a/tests/torchtune/modules/test_attention_utils.py b/tests/torchtune/modules/test_attention_utils.py
index 18bb20a87d..0dd980f483 100644
--- a/tests/torchtune/modules/test_attention_utils.py
+++ b/tests/torchtune/modules/test_attention_utils.py
@@ -82,10 +82,6 @@ def test_packed_block_causal_mask_sdpa(self, seq_lens):
         )
         torch.testing.assert_close(actual, expected)

-    @pytest.mark.skipif(
-        not _SUPPORTS_FLEX_ATTENTION,
-        reason="Please install a nightly build of torch (>=2.5.0) to run this test.",
-    )
     @gpu_test(gpu_count=1)
     def test_packed_block_causal_mask_flex(self):
         # create_block_mask requires that seq_len be divisible by 128, the default block size.
diff --git a/torchtune/modules/attention_utils.py b/torchtune/modules/attention_utils.py
index 8afd4eba71..289de8774d 100644
--- a/torchtune/modules/attention_utils.py
+++ b/torchtune/modules/attention_utils.py
@@ -115,9 +115,9 @@ def packed_block_causal_mask(
     seq_lens: List[torch.Tensor],
 ) -> _MaskType:
     """
-    Create a block causal document mask for a batch of packed sequences. If on
-    torch version >= 2.5.0, this is done by creating a mask_mod function with the
-    block causal logic and passing this into :func:`torch.nn.attention.flex_attention.create_block_mask`.
-    The resultant BlockMask is a compressed representation of the full block causal mask. If on
-    an older version, a standard 2D block causal mask is created and returned.
+    Create a block causal document mask for a batch of packed sequences. If flex
+    attention is supported by the current hardware, this is done by creating a mask_mod
+    function with the block causal logic and passing it into :func:`torch.nn.attention.flex_attention.create_block_mask`.
+    The resultant BlockMask is a compressed representation of the full block causal mask. If
+    flex attention is not supported, a standard 2D block causal mask is created and returned.

diff --git a/torchtune/modules/common_utils.py b/torchtune/modules/common_utils.py
index 055252cf72..51a65d686d 100644
--- a/torchtune/modules/common_utils.py
+++ b/torchtune/modules/common_utils.py
@@ -149,11 +149,7 @@ def _register_reparametrize_state_dict_hooks(
         RuntimeError: If the low RAM reparametrize hook is used on Windows or an incompatible torch version.
     """
     if _use_low_cpu_ram:
-        if torch.__version__ < "2.5.0.dev20240906":
-            raise RuntimeError(
-                "Low RAM reparametrize_as_dtype_state_dict_post_hook requires PyTorch 2.5.0.dev20240906 or later."
-            )
-        elif sys.platform == "win32":
+        if sys.platform == "win32":
             # mmap.MAP_SHARED is not supported on Windows but this change targets colab.
             raise RuntimeError(
                 "Low RAM reparametrize_as_dtype_state_dict_post_hook is not supported on Windows."
diff --git a/torchtune/training/_activation_offloading.py b/torchtune/training/_activation_offloading.py
index c536e7f5ee..111f74eb58 100644
--- a/torchtune/training/_activation_offloading.py
+++ b/torchtune/training/_activation_offloading.py
@@ -33,7 +33,7 @@ class OffloadActivations(saved_tensors_hooks):

         use_streams (Optional[bool]): Whether or not to use streams for performance optimization where
             the communications get overlapped with the computation. Requires a torch build
-            after torch-2.5.0.dev20240907. Default: True if a later torch build is found, else False.
+            of torch-2.5.0 or later. Default: True.

         max_fwd_stash_size (int): The maximum size of the forward stash, or the maximum number of
             consecutive activations to keep alive during the forward pass. This number must be at
@@ -60,7 +60,7 @@ class OffloadActivations(saved_tensors_hooks):
     def __init__(
         self,
         use_pin_memory: bool = True,
-        use_streams: Optional[bool] = None,
+        use_streams: Optional[bool] = True,
         max_fwd_stash_size: int = 5,
         min_offload_size: int = 1024,
     ) -> None:
diff --git a/torchtune/training/_compile.py b/torchtune/training/_compile.py
index 668df921c5..c22eaef5c6 100644
--- a/torchtune/training/_compile.py
+++ b/torchtune/training/_compile.py
@@ -42,23 +42,21 @@ def compile_model(
     backend = os.environ.get("TORCH_COMPILE_BACKEND", "inductor")
     if isinstance(model, DeepFusionModel):
         model = model.decoder
-    if torch_version_ge("2.5.0"):
-        if verbose:
-            log.info("Compiling model layers with torch.compile...")
-        for m in reversed(list(model.modules())):
-            if isinstance(m, TransformerSelfAttentionLayer) or isinstance(
-                m, TransformerCrossAttentionLayer
-            ):
-                m.compile(backend=backend)
-    else:
+    # Per-layer compilation by default
+    if verbose:
+        log.info("Compiling model layers with torch.compile...")
+    for m in reversed(list(model.modules())):
+        if isinstance(m, TransformerSelfAttentionLayer) or isinstance(
+            m, TransformerCrossAttentionLayer
+        ):
+            m.compile(backend=backend)
+    # Warn that per-layer compile may not be fully optimized on older torch versions
+    if not torch_version_ge("2.5.0"):
         if verbose:
-            log.info(
-                """
-                Compiling full model with torch.compile...
-                For faster compile times via per-layer compile, please run on PyTorch nightlies.
-                """
+            log.warning(
+                "Per-layer compilation may not be fully optimized in PyTorch versions < 2.5.0. "
+                "Consider upgrading for improved performance."
             )
-        model.compile(backend=backend)


 def compile_loss(loss: nn.Module, verbose: bool = True) -> None:
diff --git a/torchtune/utils/_import_guard.py b/torchtune/utils/_import_guard.py
index 93e7941fbc..9f61e10dfa 100644
--- a/torchtune/utils/_import_guard.py
+++ b/torchtune/utils/_import_guard.py
@@ -6,12 +6,11 @@

 import torch
 import torchao
-from torchtune.utils._version import _is_fbcode, _nightly_version_ge, torch_version_ge
+from torchtune.utils._version import _is_fbcode, _nightly_version_ge

 # We can only use flex attention / BlockMask if torch version >= 2.5.0 and GPU is Turing / SM75 and above
 _SUPPORTS_FLEX_ATTENTION = (
-    torch_version_ge("2.5.0")
-    and torch.cuda.is_available()
+    torch.cuda.is_available()
     and torch.cuda.get_device_capability() >= (7, 5)
 )

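Reviewer note: below is a minimal usage sketch (not part of the diff above) showing how the offloading context is typically wrapped around the forward pass. It assumes `OffloadActivations` is importable from `torchtune.training` and that a CUDA device is available; with this change, `use_streams` simply defaults to `True` since PyTorch 2.5.0 is now the assumed baseline.

```python
# Hypothetical standalone example, not taken from the torchtune recipes.
import torch
from torchtune.training import OffloadActivations  # assumed import path

model = torch.nn.Linear(4096, 4096, device="cuda")
optim = torch.optim.AdamW(model.parameters())
batch = torch.randn(8, 4096, device="cuda")

# use_streams now defaults to True, so CPU<->GPU copies overlap with compute
# on a side stream instead of blocking the default stream.
with OffloadActivations(use_pin_memory=True):
    loss = model(batch).sum()  # tensors saved for backward are offloaded to CPU
loss.backward()  # offloaded activations are brought back as backward needs them
optim.step()
```

In the recipes themselves this is wired up through the ``enable_activation_offloading`` flag rather than by constructing the context manager directly.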