refactor: warmstart tutorial now uses FSDP2

le1nux · le1nux · commit 97a6594e0a28 · 2025-05-23T11:07:21.000+02:00
diff --git a/tutorials/warmstart/configs/pre_training_config.yaml b/tutorials/warmstart/configs/pre_training_config.yaml
@@ -1,4 +1,4 @@
-settings:  
+settings:
   experiment_id: ${modalities_env:experiment_id}
   config_file_path: ${modalities_env:config_file_path}
   referencing_keys:
@@ -28,7 +28,7 @@ settings:
   training_target:
     num_target_tokens: 81920 # num_target_steps * world_size * local_train_micro_batch_size * sequence_length * gradient_accumulation_steps
     num_target_steps: 20    # we want to run for exactly 20 steps (although we will only get one checkpoint after 11 steps)
-  training_progress: 
+  training_progress:
     global_num_seen_tokens: 0
     num_seen_steps: 0
     num_seen_samples: 0
@@ -95,7 +95,7 @@ checkpoint_saving:
         k: -1   # -1 to save all checkpoints
     checkpoint_saving_execution:
       component_key: checkpoint_saving_execution
-      variant_key: fsdp1
+      variant_key: dcp
       config:
         checkpoint_path: ${settings.paths.checkpoint_saving_path}
         global_rank: ${settings.cuda_env.global_rank}
@@ -108,12 +108,21 @@ loss_fn:
     target_key: ${settings.referencing_keys.target_key}
     prediction_key: ${settings.referencing_keys.prediction_key}
 
+device_mesh:
+  component_key: device_mesh
+  variant_key: default
+  config:
+    device_type: cuda
+    data_parallel_replicate_degree: 1
+    data_parallel_shard_degree: ${settings.cuda_env.world_size} # i.e., fully sharded
+    world_size: ${settings.cuda_env.world_size}
+
 app_state:
   component_key: app_state
   variant_key: raw
   config:
     model: 
-      instance_key: wrapped_model
+      instance_key: initialized_model
       pass_type: BY_REFERENCE
     optimizer:
       instance_key: optimizer
@@ -122,24 +131,12 @@ app_state:
       instance_key: lr_scheduler
       pass_type: BY_REFERENCE
 
-wrapped_model:
-  component_key: model
-  variant_key: fsdp1_wrapped
-  config:
-    model:
-      instance_key: model
-      pass_type: BY_REFERENCE
-    sync_module_states: true
-    mixed_precision_settings: BF_16
-    sharding_strategy: FULL_SHARD
-    block_names: [GPT2Block]
-
-model:
+initialized_model:
   component_key: model
   variant_key: model_initialized
   config:
     model:
-      instance_key: model_raw
+      instance_key: fsdp_model
       pass_type: BY_REFERENCE
     model_initializer:
       component_key: model_initialization
@@ -151,6 +148,21 @@ model:
         std: 0.02
         num_layers: ${model_raw.config.n_layer}
 
+fsdp_model:
+  component_key: model
+  variant_key: fsdp2_wrapped
+  config:
+    model:
+      instance_key: model_raw
+      pass_type: BY_REFERENCE
+    device_mesh:
+      instance_key: device_mesh
+      pass_type: BY_REFERENCE
+    mixed_precision_settings:
+      param_dtype: BF_16
+      reduce_dtype: BF_16
+    block_names: [GPT2Block]
+
 model_raw:
   component_key: model
   variant_key: gpt2
@@ -171,12 +183,12 @@ model_raw:
     bias: false
     attention_config:
       qkv_transforms:
-      - type_hint: RotaryTransform
-        config:
-          n_embd: ${model_raw.config.n_embd}
-          n_head: ${model_raw.config.n_head_q}
-          seq_length_dim: -2
-          base_freq: 100000
+        - type_hint: RotaryTransform
+          config:
+            n_embd: ${model_raw.config.n_embd}
+            n_head: ${model_raw.config.n_head_q}
+            seq_length_dim: -2
+            base_freq: 100000
     attention_implementation: pytorch_flash
     activation_type: swiglu
     attention_norm_config:
@@ -223,15 +235,15 @@ optimizer:
     weight_decay: 1e-1
     weight_decay_groups_excluded: [embedding, layernorm]
     wrapped_model: 
-      instance_key: wrapped_model
+      instance_key: initialized_model
       pass_type: BY_REFERENCE
 
 gradient_clipper:
   component_key: gradient_clipper
-  variant_key: fsdp1
+  variant_key: fsdp2
   config:
     wrapped_model:
-      instance_key: wrapped_model
+      instance_key: initialized_model
       pass_type: BY_REFERENCE
     norm_type: P2_NORM
     max_norm: 1.0
diff --git a/tutorials/warmstart/configs/warmstart_config.yaml b/tutorials/warmstart/configs/warmstart_config.yaml
@@ -33,12 +33,12 @@ settings:
       component_key: number_conversion
       variant_key: global_num_seen_tokens_from_checkpoint_path
       config:
-        checkpoint_path: ${settings.warmstart_checkpoint_paths.model_checkpoint_path}
+        checkpoint_path: ${settings.warmstart_checkpoint_paths.checkpoint_folder_path}
     num_seen_steps:  # for the batch progress subscriber
       component_key: number_conversion
       variant_key: num_seen_steps_from_checkpoint_path
       config:
-        checkpoint_path: ${settings.warmstart_checkpoint_paths.model_checkpoint_path}
+        checkpoint_path: ${settings.warmstart_checkpoint_paths.checkpoint_folder_path}
     num_seen_samples:
       component_key: number_conversion
       variant_key: num_samples_from_num_tokens
@@ -49,7 +49,7 @@ settings:
       component_key: number_conversion
       variant_key: last_step_from_checkpoint_path
       config:
-        checkpoint_path: ${settings.warmstart_checkpoint_paths.model_checkpoint_path}
+        checkpoint_path: ${settings.warmstart_checkpoint_paths.checkpoint_folder_path}
   warmstart_checkpoint_paths: ${warmstart_env:checkpoint_paths}
 
 collate_fn:
@@ -104,12 +104,9 @@ eval_dataloaders: []
 
 checkpoint_loading:
   component_key: checkpoint_loading
-  variant_key: fsdp1
+  variant_key: dcp
   config:
     global_rank: ${settings.cuda_env.global_rank}
-    block_names: [GPT2Block]
-    mixed_precision_settings: BF_16
-    sharding_strategy: FULL_SHARD
 
 checkpoint_saving:
   component_key: checkpoint_saving
@@ -122,7 +119,7 @@ checkpoint_saving:
         k: -1   # -1 to save all checkpoints
     checkpoint_saving_execution:
       component_key: checkpoint_saving_execution
-      variant_key: fsdp1
+      variant_key: dcp
       config:
         checkpoint_path: ${settings.paths.checkpoint_saving_path}
         global_rank: ${settings.cuda_env.global_rank}
@@ -135,12 +132,30 @@ loss_fn:
     target_key: ${settings.referencing_keys.target_key}
     prediction_key: ${settings.referencing_keys.prediction_key}
 
+device_mesh:
+  component_key: device_mesh
+  variant_key: default
+  config:
+    device_type: cuda
+    data_parallel_replicate_degree: 1
+    data_parallel_shard_degree: ${settings.cuda_env.world_size} # i.e., fully sharded
+    world_size: ${settings.cuda_env.world_size}
+
 app_state:
+  component_key: app_state
+  variant_key: dcp
+  config:
+    raw_app_state:
+      instance_key: app_state_raw
+      pass_type: BY_REFERENCE
+    checkpoint_dir_path: ${settings.warmstart_checkpoint_paths.checkpoint_folder_path}
+
+app_state_raw:
   component_key: app_state
   variant_key: raw
   config:
-    model: 
-      instance_key: wrapped_model
+    model:
+      instance_key: initialized_model
       pass_type: BY_REFERENCE
     optimizer:
       instance_key: optimizer
@@ -149,24 +164,12 @@ app_state:
       instance_key: lr_scheduler
       pass_type: BY_REFERENCE
 
-wrapped_model:
-  component_key: model
-  variant_key: fsdp1_checkpointed
-  config:
-    model:
-      instance_key: model
-      pass_type: BY_REFERENCE
-    checkpoint_loading:
-      instance_key: checkpoint_loading
-      pass_type: BY_REFERENCE
-    checkpoint_path: ${settings.warmstart_checkpoint_paths.model_checkpoint_path}
-
-model:
+initialized_model:
   component_key: model
   variant_key: model_initialized
   config:
     model:
-      instance_key: model_raw
+      instance_key: fsdp_model
       pass_type: BY_REFERENCE
     model_initializer:
       component_key: model_initialization
@@ -178,6 +181,21 @@ model:
         std: 0.02
         num_layers: ${model_raw.config.n_layer}
 
+fsdp_model:
+  component_key: model
+  variant_key: fsdp2_wrapped
+  config:
+    model:
+      instance_key: model_raw
+      pass_type: BY_REFERENCE
+    device_mesh:
+      instance_key: device_mesh
+      pass_type: BY_REFERENCE
+    mixed_precision_settings:
+      param_dtype: BF_16
+      reduce_dtype: BF_16
+    block_names: [GPT2Block]
+
 model_raw:
   component_key: model
   variant_key: gpt2
@@ -198,12 +216,12 @@ model_raw:
     bias: false
     attention_config:
       qkv_transforms:
-      - type_hint: RotaryTransform
-        config:
-          n_embd: ${model_raw.config.n_embd}
-          n_head: ${model_raw.config.n_head_q}
-          seq_length_dim: -2
-          base_freq: 100000
+        - type_hint: RotaryTransform
+          config:
+            n_embd: ${model_raw.config.n_embd}
+            n_head: ${model_raw.config.n_head_q}
+            seq_length_dim: -2
+            base_freq: 100000
     attention_implementation: pytorch_flash
     activation_type: swiglu
     attention_norm_config:
@@ -238,24 +256,9 @@ lr_scheduler:
     total_steps: ${settings.training_target.num_target_steps}
     pct_start: 0.01
     anneal_strategy: cos
-    last_epoch: ${settings.training_progress.last_step}
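+    # last_epoch: ${settings.training_progress.last_step}  # NOTE(review): disabled without explanation; presumably the scheduler state is now restored from the DCP checkpoint via app_state — confirm, then delete this line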
 
 optimizer:
-  component_key: optimizer
-  variant_key: fsdp1_checkpointed
-  config:
-    optimizer:
-      instance_key: optimizer_original
-      pass_type: BY_REFERENCE
-    wrapped_model:
-      instance_key: wrapped_model
-      pass_type: BY_REFERENCE
-    checkpoint_loading:
-      instance_key: checkpoint_loading
-      pass_type: BY_REFERENCE
-    checkpoint_path: ${settings.warmstart_checkpoint_paths.optimizer_checkpoint_path}
-
-optimizer_original:
   component_key: optimizer
   variant_key: adam_w
   config:
@@ -265,15 +268,15 @@ optimizer_original:
     weight_decay: 1e-1
     weight_decay_groups_excluded: [embedding, layernorm]
     wrapped_model: 
-      instance_key: wrapped_model
+      instance_key: initialized_model
       pass_type: BY_REFERENCE
 
 gradient_clipper:
   component_key: gradient_clipper
-  variant_key: fsdp1
+  variant_key: fsdp2
   config:
     wrapped_model:
-      instance_key: wrapped_model
+      instance_key: initialized_model
       pass_type: BY_REFERENCE
     norm_type: P2_NORM
     max_norm: 1.0
diff --git a/tutorials/warmstart/scripts/check_checkpoint_consistency.py b/tutorials/warmstart/scripts/check_checkpoint_consistency.py

-Original file line number
+Diff line change
@@ @@ -1,5 +1,5 @@ @@
 import glob
 -import json
 +import os
 import re
 from pathlib import Path
 def test_checkpoint_files_exist(checkpoint_folder_path: list[Path], expected_checkpoint_names: list[str]):
     # Check if all the checkpoint files exist and have the correct names
 -    checkpoint_paths = glob.glob(str(checkpoint_folder_path / "**/*.bin"), recursive=True)
 +    checkpoint_paths = glob.glob(str(checkpoint_folder_path / "**/*"), recursive=True)
 -    assert len(checkpoint_paths) == 6, "ERROR! Expected 6 checkpoint files."
 +    assert len(checkpoint_paths) == 17, "ERROR! Expected 17 checkpoint paths (files and folders)."  # "**/*" also matches directories
 -    for checkpoint_path in checkpoint_paths:
 -        checkpoint_file_name = Path(checkpoint_path).name
 -        cleaned_checkpoint_file_name = _get_checkpoint_file_name_without_eid(checkpoint_file_name)
+-
 -        assert (
 -            cleaned_checkpoint_file_name in expected_checkpoint_names
 -        ), f"ERROR! {checkpoint_file_name} is not a valid checkpoint file name."
+-
+-
 -def check_last_checkpoint_info_correctness(checkpoint_folder_path: Path, expected_last_checkpoint_names: list[str]):
 -    # Check if the last checkpoint info files reference the correct checkpoint files
+-
 -    checkpoint_info_paths = glob.glob(str(checkpoint_folder_path / "**/*.json"), recursive=True)
+-
 -    assert len(checkpoint_info_paths) == 2, "ERROR! Expected 2 checkpoint info files."
+-
 -    assert len(set(checkpoint_info_paths)) == len(
 -        checkpoint_info_paths
 -    ), "ERROR! Duplicate checkpoint info files found."
+-
 -    for checkpoint_info_path in checkpoint_info_paths:
 -        with open(checkpoint_info_path, "r") as f:
 -            checkpoint_info = json.load(f)
 -        model_checkpoint_path = Path(checkpoint_info["model_checkpoint_path"])
 -        optimizer_checkpoint_path = Path(checkpoint_info["optimizer_checkpoint_path"])
 -        assert model_checkpoint_path.exists(), f"ERROR! {model_checkpoint_path} does not exist."
 -        assert optimizer_checkpoint_path.exists(), f"ERROR! {optimizer_checkpoint_path} does not exist."
+-
 -        cleaned_model_checkpoint_file_name = _get_checkpoint_file_name_without_eid(model_checkpoint_path.name)
 -        cleaned_optimizer_checkpoint_file_name = _get_checkpoint_file_name_without_eid(optimizer_checkpoint_path.name)
+-
 -        assert cleaned_model_checkpoint_file_name in expected_last_checkpoint_names
 -        assert cleaned_optimizer_checkpoint_file_name in expected_last_checkpoint_names
 +    assert len([p for p in checkpoint_paths if p.endswith(".distcp")]) == 6, "ERROR! Expected 6 checkpoint files."  # compare the count; a bare len() only checks non-emptiness
 if __name__ == "__main__":
 -    checkpoint_folder_path = Path("../data/checkpoints")
 +    current_file_path = Path(__file__).resolve()
 +    os.chdir(current_file_path.parent)
 -    expected_checkpoint_names = [
 -        # pretrain checkpoint
 -        "model-seen_steps_11-seen_tokens_45056-target_steps_20-target_tokens_81920.bin",
 -        "optimizer-seen_steps_11-seen_tokens_45056-target_steps_20-target_tokens_81920.bin",
 -        # warmstart checkpoints
 -        "model-seen_steps_15-seen_tokens_61440-target_steps_20-target_tokens_81920.bin",
 -        "optimizer-seen_steps_15-seen_tokens_61440-target_steps_20-target_tokens_81920.bin",
 -        "model-seen_steps_20-seen_tokens_81920-target_steps_20-target_tokens_81920.bin",
 -        "optimizer-seen_steps_20-seen_tokens_81920-target_steps_20-target_tokens_81920.bin",
 -    ]
 +    checkpoint_folder_path = Path("../data/checkpoints")
 -    expected_last_checkpoint_names = [
 +    expected_checkpoint_folder_names = [
         # pretrain checkpoint
 -        "model-seen_steps_11-seen_tokens_45056-target_steps_20-target_tokens_81920.bin",
 -        "optimizer-seen_steps_11-seen_tokens_45056-target_steps_20-target_tokens_81920.bin",
 +        "seen_steps_11-seen_tokens_45056-target_steps_20-target_tokens_81920",
         # warmstart checkpoints
 -        "model-seen_steps_20-seen_tokens_81920-target_steps_20-target_tokens_81920.bin",
 -        "optimizer-seen_steps_20-seen_tokens_81920-target_steps_20-target_tokens_81920.bin",
 +        "seen_steps_15-seen_tokens_61440-target_steps_20-target_tokens_81920",
 +        "seen_steps_20-seen_tokens_81920-target_steps_20-target_tokens_81920",
+    ]
 -    test_checkpoint_files_exist(checkpoint_folder_path, expected_checkpoint_names)
 -    check_last_checkpoint_info_correctness(checkpoint_folder_path, expected_last_checkpoint_names)
 +    test_checkpoint_files_exist(checkpoint_folder_path, expected_checkpoint_folder_names)