2024-10-25 nightly release (8e013c2)
pytorchbot committed Oct 25, 2024
1 parent 38198c0 commit fd3b4c3
Showing 15 changed files with 460 additions and 178 deletions.
34 changes: 31 additions & 3 deletions .github/workflows/build_linux_wheels.yaml
@@ -27,18 +27,46 @@ jobs:
with-cuda: enable
with-rocm: enable
build-python-only: enable
build:
# TODO: Remove `filter-python-versions` after PyArrow releases v18
filter-python-versions:
needs: generate-matrix
runs-on: ubuntu-latest
outputs:
matrix: ${{ steps.set-matrix.outputs.matrix }}
steps:
- name: Filter matrix to exclude Python 3.13
id: set-matrix
shell: python
env:
input-matrix: ${{ needs.generate-matrix.outputs.matrix }}
run: |
import os
import json
# Grab environment variables
input_matrix = json.loads(os.environ["input-matrix"])
github_output_file = os.environ["GITHUB_OUTPUT"]
# Filter out any builds for 3.13
filtered_matrix = {"include": []}
for build in input_matrix["include"]:
if build["python_version"] != "3.13":
filtered_matrix["include"].append(build)
# Write the new matrix to the default outputs file
with open(github_output_file, "w") as handle:
handle.write(f"matrix={json.dumps(filtered_matrix)}")
build:
needs: filter-python-versions
name: ${{ matrix.repository }}
uses: pytorch/test-infra/.github/workflows/build_wheels_linux.yml@main
if: ${{ needs.generate-matrix.outputs.matrix.python_version }} != '3.13'
strategy:
fail-fast: false
with:
repository: pytorch/torchtune
ref: ""
package-name: torchtune
build-matrix: ${{ needs.generate-matrix.outputs.matrix }}
build-matrix: ${{ needs.filter-python-versions.outputs.matrix }}
pre-script: .github/scripts/pre_build_script.sh
trigger-event: ${{ github.event_name }}
build-platform: 'python-build-package'
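
The new filter-python-versions job sits between generate-matrix and build: it loads the generated matrix from an environment variable, drops every entry whose python_version is 3.13 (a stopgap until PyArrow publishes 3.13 wheels), writes the result to GITHUB_OUTPUT, and the build job consumes the filtered matrix in place of the job-level if: guard removed above. A standalone sketch of the same filtering logic, using a hypothetical sample matrix, can be run locally:

import json

# Hypothetical example of what the generate-matrix job might emit; the real
# matrix comes from pytorch/test-infra and carries more fields per entry.
input_matrix = {
    "include": [
        {"python_version": "3.11", "gpu_arch_type": "cuda"},
        {"python_version": "3.12", "gpu_arch_type": "cuda"},
        {"python_version": "3.13", "gpu_arch_type": "cuda"},
    ]
}

# Same rule as the workflow step: drop every build targeting Python 3.13.
filtered_matrix = {
    "include": [b for b in input_matrix["include"] if b["python_version"] != "3.13"]
}

# The workflow writes this line to GITHUB_OUTPUT so the build job can consume it.
print(f"matrix={json.dumps(filtered_matrix)}")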
2 changes: 1 addition & 1 deletion recipes/configs/llama3_2_vision/11B_lora.yaml
@@ -81,7 +81,7 @@ enable_activation_offloading: False
dtype: bf16

# Logging
output_dir: /tmp/full-llama3.2-vision-finetune
output_dir: /tmp/lora-llama3.2-vision-finetune
metric_logger:
_component_: torchtune.training.metric_logging.DiskLogger
log_dir: /tmp/Llama-3.2-11B-Vision-Instruct/logs
@@ -80,7 +80,7 @@ enable_activation_offloading: False
dtype: bf16

# Logging
output_dir: /tmp/full-llama3.2-vision-finetune
output_dir: /tmp/lora-llama3.2-vision-finetune
metric_logger:
_component_: torchtune.training.metric_logging.DiskLogger
log_dir: /tmp/Llama-3.2-11B-Vision-Instruct/logs
88 changes: 88 additions & 0 deletions recipes/configs/llama3_2_vision/11B_qlora.yaml
@@ -0,0 +1,88 @@
# Config for multi-device QLoRA finetuning in lora_finetune_distributed.py
# using a Llama3.2 11B Vision Instruct model
#
# This config assumes that you've run the following command before launching:
# tune download meta-llama/Llama-3.2-11B-Vision-Instruct --output-dir /tmp/Llama-3.2-11B-Vision-Instruct
#
# To launch on 2 devices, run the following command from root:
# tune run --nproc_per_node 2 lora_finetune_distributed --config llama3_2_vision/11B_qlora
#
# You can add specific overrides through the command line. For example
# to override the checkpointer directory while launching training:
# tune run --nproc_per_node 2 lora_finetune_distributed --config llama3_2_vision/11B_qlora checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
#
# This config works best when the model is being fine-tuned on 2+ GPUs.
# For single device QLoRA finetuning please use 11B_qlora_single_device.yaml

# Model arguments
model:
_component_: torchtune.models.llama3_2_vision.qlora_llama3_2_vision_11b
decoder_trainable: "frozen"
encoder_trainable: "lora"
fusion_trainable: "lora"
lora_attn_modules: ['q_proj', 'v_proj']
apply_lora_to_mlp: False
apply_lora_to_output: False
lora_rank: 8
lora_alpha: 16
lora_dropout: 0.0
image_size: 560 # Make sure this matches the image_size in tokenizer

# Transform
tokenizer:
_component_: torchtune.models.llama3_2_vision.llama3_2_vision_transform
path: /tmp/Llama-3.2-11B-Vision-Instruct/original/tokenizer.model
image_size: 560
max_seq_len: 8192

# Checkpointer
checkpointer:
_component_: torchtune.training.FullModelMetaCheckpointer
checkpoint_dir: /tmp/Llama-3.2-11B-Vision-Instruct/original/
checkpoint_files: [consolidated.pth]
recipe_checkpoint: null
output_dir: /tmp/Llama-3.2-11B-Vision-Instruct/
model_type: LLAMA3_VISION
resume_from_checkpoint: False

# Dataset
dataset:
_component_: torchtune.datasets.multimodal.the_cauldron_dataset
subset: ocrvqa
seed: null
shuffle: True
collate_fn: torchtune.data.padded_collate_tiled_images_and_mask

# Fine-tuning arguments
epochs: 1
max_steps_per_epoch: null
batch_size: 2
gradient_accumulation_steps: 4
optimizer:
_component_: torch.optim.AdamW
fused: True
weight_decay: 0.01
lr: 2e-5
lr_scheduler:
_component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup
num_warmup_steps: 100
loss:
_component_: torchtune.modules.loss.CEWithChunkedOutputLoss
clip_grad_norm: 1.0
compile: False # set it to True for better memory and performance

# Training env
device: cuda

# Memory management
enable_activation_checkpointing: True
enable_activation_offloading: False
dtype: bf16

# Logging
output_dir: /tmp/qlora-llama3.2-vision-finetune
metric_logger:
_component_: torchtune.training.metric_logging.DiskLogger
log_dir: /tmp/Llama-3.2-11B-Vision-Instruct/logs
log_every_n_steps: 1
log_peak_memory_stats: False
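
For reference, each _component_ entry in this config resolves to an importable Python object. A minimal sketch of loading and instantiating the model and tokenizer blocks outside of tune run, assuming torchtune's config.instantiate utility behaves as it does in the built-in recipes (instantiating the full 11B model requires substantial memory):

from omegaconf import OmegaConf
from torchtune import config

# Load this config file; the path matches the location shown in the diff header.
cfg = OmegaConf.load("recipes/configs/llama3_2_vision/11B_qlora.yaml")

# Build the QLoRA Llama 3.2 11B Vision model from the model: block
# (frozen NF4-quantized decoder, LoRA adapters on the encoder and fusion layers).
model = config.instantiate(cfg.model)

# The transform/tokenizer block resolves the same way; it expects the
# downloaded checkpoint files from the header comment to exist under /tmp.
tokenizer = config.instantiate(cfg.tokenizer)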
113 changes: 113 additions & 0 deletions recipes/configs/llama3_2_vision/11B_qlora_single_device.yaml
@@ -0,0 +1,113 @@
# Config for single device QLoRA finetuning in lora_finetune_single_device.py
# using a Llama3.2 11B Vision Instruct model
#
# This config assumes that you've run the following command before launching:
# tune download meta-llama/Llama-3.2-11B-Vision-Instruct --output-dir /tmp/Llama-3.2-11B-Vision-Instruct
#
# To launch on a single device, run the following command from root:
# tune run lora_finetune_single_device --config llama3_2_vision/11B_qlora_single_device
#
# You can add specific overrides through the command line. For example
# to override the checkpointer directory while launching training:
# tune run lora_finetune_single_device --config llama3_2_vision/11B_qlora_single_device checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
#
# This config works only for training on a single device.

# Model arguments
model:
_component_: torchtune.models.llama3_2_vision.qlora_llama3_2_vision_11b
decoder_trainable: "frozen"
encoder_trainable: "lora"
fusion_trainable: "lora"
lora_attn_modules: ['q_proj', 'v_proj']
apply_lora_to_mlp: False
apply_lora_to_output: False
lora_rank: 8
lora_alpha: 16
lora_dropout: 0.0
image_size: 560 # Make sure this matches the image_size in tokenizer

# Transform
tokenizer:
_component_: torchtune.models.llama3_2_vision.llama3_2_vision_transform
path: /tmp/Llama-3.2-11B-Vision-Instruct/original/tokenizer.model
image_size: 560
max_seq_len: 8192

# Checkpointer
checkpointer:
_component_: torchtune.training.FullModelMetaCheckpointer
checkpoint_dir: /tmp/Llama-3.2-11B-Vision-Instruct/original/
checkpoint_files: [consolidated.pth]
recipe_checkpoint: null
output_dir: /tmp/Llama-3.2-11B-Vision-Instruct/
model_type: LLAMA3_VISION
resume_from_checkpoint: False

# Dataset
dataset:
_component_: torchtune.datasets.multimodal.the_cauldron_dataset
subset: ocrvqa
seed: null
shuffle: True
collate_fn: torchtune.data.padded_collate_tiled_images_and_mask

# Fine-tuning arguments
epochs: 1
max_steps_per_epoch: null
batch_size: 2
gradient_accumulation_steps: 16
optimizer:
_component_: torch.optim.AdamW
fused: True
weight_decay: 0.01
lr: 2e-5
optimizer_in_bwd: False
lr_scheduler:
_component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup
num_warmup_steps: 100
loss:
_component_: torchtune.modules.loss.CEWithChunkedOutputLoss
clip_grad_norm: 1.0
compile: False # set it to True for better memory and performance

# Training env
device: cuda

# Memory management
enable_activation_checkpointing: True
enable_activation_offloading: False
dtype: bf16

# Logging
output_dir: /tmp/qlora-llama3.2-vision-finetune
metric_logger:
_component_: torchtune.training.metric_logging.DiskLogger
log_dir: /tmp/Llama-3.2-11B-Vision-Instruct/logs
log_every_n_steps: 1
log_peak_memory_stats: False

# Profiler (disabled)
profiler:
_component_: torchtune.training.setup_torch_profiler
enabled: False

# Output directory of trace artifacts
output_dir: ${output_dir}/profiling_outputs

# `torch.profiler.ProfilerActivity` types to trace
cpu: True
cuda: True

# Trace options passed to `torch.profiler.profile`
profile_memory: True
with_stack: False
record_shapes: True
with_flops: False

# `torch.profiler.schedule` options:
# wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
wait_steps: 1
warmup_steps: 2
active_steps: 1
num_cycles: 1
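
The disabled profiler block maps onto torch.profiler options, with the schedule comment spelling out the name mapping (wait_steps -> wait, and so on). A rough stand-alone equivalent of the same settings, expressed directly against torch.profiler; torchtune's setup_torch_profiler wraps a configuration like this, though its exact behavior may differ:

from torch.profiler import ProfilerActivity, profile, schedule, tensorboard_trace_handler

with profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],  # cpu: True, cuda: True
    schedule=schedule(wait=1, warmup=2, active=1, repeat=1),    # wait/warmup/active_steps, num_cycles
    on_trace_ready=tensorboard_trace_handler(
        "/tmp/qlora-llama3.2-vision-finetune/profiling_outputs"  # resolved ${output_dir}/profiling_outputs
    ),
    profile_memory=True,
    record_shapes=True,
    with_stack=False,
    with_flops=False,
) as prof:
    for step in range(8):
        # ... one training step would run here ...
        prof.step()  # advance the profiler schedule once per iteration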
9 changes: 3 additions & 6 deletions tests/torchtune/modules/low_precision/test_nf4_linear.py
@@ -40,10 +40,6 @@ class TestNF4Linear:
Class for testing our NF4Linear implementation.
"""

def test_bias_unsupported(self):
with pytest.raises(RuntimeError, match="does not currently support biases"):
_ = FrozenNF4Linear(1, 1, bias=True)

@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float32])
def test_parameters(self, dtype):
nf4_linear = FrozenNF4Linear(512, 512, device="cpu", dtype=dtype)
@@ -59,9 +55,10 @@ def test_state_dict(self, dtype):
assert isinstance(state_dict["weight"], NF4Tensor)

@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float32])
def test_output_dtype(self, dtype):
@pytest.mark.parametrize("bias", [True, False])
def test_output_dtype(self, dtype, bias):
# Test to ensure W4 A16 produces A16 / W4A32 produces A32
nf4_linear = FrozenNF4Linear(512, 512, device="cpu", dtype=dtype)
nf4_linear = FrozenNF4Linear(512, 512, device="cpu", dtype=dtype, bias=bias)
inp = torch.randn(2, 512, dtype=dtype, requires_grad=True)
out = nf4_linear(inp)
assert out.dtype == dtype
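
With test_bias_unsupported removed and bias parametrized in test_output_dtype, FrozenNF4Linear now accepts a bias term instead of raising. A minimal usage sketch consistent with the updated test, assuming the public import path mirrors the test module layout:

import torch
from torchtune.modules.low_precision import FrozenNF4Linear  # assumed import path

# bias=True no longer raises; the NF4-quantized weight stays frozen.
layer = FrozenNF4Linear(512, 512, device="cpu", dtype=torch.bfloat16, bias=True)
x = torch.randn(2, 512, dtype=torch.bfloat16)
y = layer(x)
assert y.dtype == torch.bfloat16  # W4A16: output keeps the activation dtype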