From 7c763ef8686773fd19457727f2c9a54924d8a80d Mon Sep 17 00:00:00 2001
From: Jacob-Chmura <jacobpaul.chmura@gmail.com>
Date: Thu, 15 May 2025 22:03:12 -0400
Subject: [PATCH 1/3] Fresh model reload

---
 benchmarks/ppo/ppo_continual.py | 46 ++++++++++++++++-----------------
 1 file changed, 22 insertions(+), 24 deletions(-)

diff --git a/benchmarks/ppo/ppo_continual.py b/benchmarks/ppo/ppo_continual.py
index 8db6aff3..9c36e606 100644
--- a/benchmarks/ppo/ppo_continual.py
+++ b/benchmarks/ppo/ppo_continual.py
@@ -52,13 +52,7 @@ def main(
         quantization_config=quantization_config,
     )
 
-    # Load main model and (optionally) reference model
     model = str(training_args.sft_model_path)
-    policy = AutoModelForCausalLM.from_pretrained(
-        training_args.sft_model_path,
-        trust_remote_code=model_args.trust_remote_code,
-        **model_kwargs,
-    )
     peft_config = get_peft_config(model_args)
     if peft_config is None:
         ref_policy = AutoModelForCausalLM.from_pretrained(
@@ -69,13 +63,6 @@ def main(
     else:
         ref_policy = None
 
-    # Load value model and policy model (main model)
-    value_model = AutoModelForSequenceClassification.from_pretrained(
-        script_args.value_model_path,
-        trust_remote_code=model_args.trust_remote_code,
-        num_labels=1,
-    )
-
     # Load tokenizer and set chat template if needed
     tokenizer = AutoTokenizer.from_pretrained(
         training_args.sft_model_path,
@@ -100,7 +87,6 @@ def main(
     if '.' in clean_dataset_name:
         clean_dataset_name = clean_dataset_name.split('.')[0]
 
-    print(f'Training PPO on {len(continual_dataset)} tasks')
     # check if the reward models are present either in the path or in the hub
     if training_args.reward_model_path is not None:
         for i in range(len(continual_dataset)):
@@ -117,6 +103,24 @@ def main(
 
     # Task Loop
     for i, dataset in enumerate(continual_dataset):
+        # Load main model and (optionally) reference model
+        if i == 0:
+            model_path = training_args.sft_model_path
+        else:
+            model_path = os.path.join(training_args.output_dir, 'last')
+        policy = AutoModelForCausalLM.from_pretrained(
+            pretrained_model_name_or_path=model_path,
+            trust_remote_code=model_args.trust_remote_code,
+            **model_kwargs,
+        )
+
+        # Load value model and policy model (main model)
+        value_model = AutoModelForSequenceClassification.from_pretrained(
+            script_args.value_model_path,
+            trust_remote_code=model_args.trust_remote_code,
+            num_labels=1,
+        )
+
         # Build custom repository name for this task
         custom_repo_name = (
             model.split('/')[-1] + '_' + clean_dataset_name + '_PPO_' + str(i)
@@ -144,10 +148,6 @@ def main(
             eval_dataset=dataset[script_args.dataset_test_split],
             peft_config=peft_config,
         )
-
-        # if i == 0:
-        #     trainer.save_model(os.path.join(training_args.output_dir, 'checkpoint-0'))
-
         # Set current task in trainer for task-based logging
         trainer.set_task(f'task_{i}')
 
@@ -169,14 +169,12 @@ def main(
             trainer.save_metrics('eval', metrics)
 
             # Log metrics to WandB
-            if training_args.local_rank in (None, -1, 0):
-                wb.log({'eval': {'last': metrics}})  # type: ignore[attr-defined]
-                wb.log({f'task/{custom_repo_name}/last': metrics})  # type: ignore[attr-defined]
+            wb.log({'eval': {'last': metrics}})  # type: ignore[attr-defined]
+            wb.log({f'task/{custom_repo_name}/last': metrics})  # type: ignore[attr-defined]
 
         # Save model checkpoint and optionally push
-        if not training_args.push_to_hub:
-            trainer.save_model(os.path.join(training_args.output_dir, 'last'))
-        else:
+        trainer.save_model(os.path.join(training_args.output_dir, 'last'))
+        if training_args.push_to_hub:
             trainer.push_to_hub(
                 model_name=custom_repo_name,
                 dataset_name='Continual_PPO_' + clean_dataset_name + '_' + str(i),

From a782d4cd51734617bd6bc456954eb836d8876534 Mon Sep 17 00:00:00 2001
From: Jacob-Chmura <jacobpaul.chmura@gmail.com>
Date: Thu, 15 May 2025 22:07:04 -0400
Subject: [PATCH 2/3] Revert diff

---
 benchmarks/ppo/ppo_continual.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/benchmarks/ppo/ppo_continual.py b/benchmarks/ppo/ppo_continual.py
index 9c36e606..b62aece9 100644
--- a/benchmarks/ppo/ppo_continual.py
+++ b/benchmarks/ppo/ppo_continual.py
@@ -87,6 +87,7 @@ def main(
     if '.' in clean_dataset_name:
         clean_dataset_name = clean_dataset_name.split('.')[0]
 
+    print(f'Training PPO on {len(continual_dataset)} tasks')
     # check if the reward models are present either in the path or in the hub
     if training_args.reward_model_path is not None:
         for i in range(len(continual_dataset)):
@@ -148,6 +149,10 @@ def main(
             eval_dataset=dataset[script_args.dataset_test_split],
             peft_config=peft_config,
         )
+
+        # if i == 0:
+        #     trainer.save_model(os.path.join(training_args.output_dir, 'checkpoint-0'))
+
         # Set current task in trainer for task-based logging
         trainer.set_task(f'task_{i}')
 
@@ -169,8 +174,9 @@ def main(
             trainer.save_metrics('eval', metrics)
 
             # Log metrics to WandB
-            wb.log({'eval': {'last': metrics}})  # type: ignore[attr-defined]
-            wb.log({f'task/{custom_repo_name}/last': metrics})  # type: ignore[attr-defined]
+            if training_args.local_rank in (None, -1, 0):
+                wb.log({'eval': {'last': metrics}})  # type: ignore[attr-defined]
+                wb.log({f'task/{custom_repo_name}/last': metrics})  # type: ignore[attr-defined]
 
         # Save model checkpoint and optionally push
         trainer.save_model(os.path.join(training_args.output_dir, 'last'))

From a47dab66f0279ed826f91af11c88983fdc8c5710 Mon Sep 17 00:00:00 2001
From: Jacob-Chmura <jacobpaul.chmura@gmail.com>
Date: Thu, 15 May 2025 22:24:48 -0400
Subject: [PATCH 3/3] Dropped shared state in trainer

---
 benchmarks/ppo/continual_ppo_trainer.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/benchmarks/ppo/continual_ppo_trainer.py b/benchmarks/ppo/continual_ppo_trainer.py
index cb3d747f..4865b2b9 100644
--- a/benchmarks/ppo/continual_ppo_trainer.py
+++ b/benchmarks/ppo/continual_ppo_trainer.py
@@ -270,8 +270,9 @@ def __init__(
                 gradient_accumulation_steps=args.gradient_accumulation_steps
             )
             self.accelerator = accelerator
+            self.gather_function = self.accelerator.gather_for_metrics
             ContinualPPOTrainer.shared_accelerator = accelerator
-        else:
+        elif False:
             self.accelerator = ContinualPPOTrainer.shared_accelerator
             self.gather_function = self.accelerator.gather_for_metrics
             if (
@@ -336,7 +337,7 @@ def __init__(
             self.model = PolicyAndValueWrapper(self.policy_model, self.value_model)
             ContinualPPOTrainer.policy_value_models = self.model
             self.model.config = self.policy_model.config  # needed for pushing to hub
-        else:
+        elif False:
             # Subsequent tasks: Reuse existing model
             self.model = ContinualPPOTrainer.policy_value_models
             self.model.config = self.policy_model.config  # needed for pushing to hub
@@ -407,7 +408,7 @@ def __init__(
                 self.model, self.optimizer, self.dataloader
             )
             ContinualPPOTrainer.ds_wrapped_models = self.model
-        else:
+        elif False:
             # For subsequent tasks, only prepare optimizer and dataloader
             self.optimizer, self.dataloader = self.accelerator.prepare(
                 self.optimizer, self.dataloader
@@ -971,6 +972,7 @@ def repeat_generator() -> DataLoader:
             ContinualPPOTrainer.class_ref_model = original_ref_model
 
         # Ensure the class variable is updated
+        # TODO: Double check this is fine to keep
         ContinualPPOTrainer.class_ref_model = self.ref_model
         if self.is_deepspeed_enabled:
             ContinualPPOTrainer.ds_wrapped_models = self.deepspeed