From 7c763ef8686773fd19457727f2c9a54924d8a80d Mon Sep 17 00:00:00 2001 From: Jacob-Chmura Date: Thu, 15 May 2025 22:03:12 -0400 Subject: [PATCH 1/3] Fresh model reload --- benchmarks/ppo/ppo_continual.py | 46 ++++++++++++++++----------------- 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/benchmarks/ppo/ppo_continual.py b/benchmarks/ppo/ppo_continual.py index 8db6aff3..9c36e606 100644 --- a/benchmarks/ppo/ppo_continual.py +++ b/benchmarks/ppo/ppo_continual.py @@ -52,13 +52,7 @@ def main( quantization_config=quantization_config, ) - # Load main model and (optionally) reference model model = str(training_args.sft_model_path) - policy = AutoModelForCausalLM.from_pretrained( - training_args.sft_model_path, - trust_remote_code=model_args.trust_remote_code, - **model_kwargs, - ) peft_config = get_peft_config(model_args) if peft_config is None: ref_policy = AutoModelForCausalLM.from_pretrained( @@ -69,13 +63,6 @@ def main( else: ref_policy = None - # Load value model and policy model (main model) - value_model = AutoModelForSequenceClassification.from_pretrained( - script_args.value_model_path, - trust_remote_code=model_args.trust_remote_code, - num_labels=1, - ) - # Load tokenizer and set chat template if needed tokenizer = AutoTokenizer.from_pretrained( training_args.sft_model_path, @@ -100,7 +87,6 @@ def main( if '.' in clean_dataset_name: clean_dataset_name = clean_dataset_name.split('.')[0] - print(f'Training PPO on {len(continual_dataset)} tasks') # check if the reward models are present either in the path or in the hub if training_args.reward_model_path is not None: for i in range(len(continual_dataset)): @@ -117,6 +103,24 @@ def main( # Task Loop for i, dataset in enumerate(continual_dataset): + # Load main model and (optionally) reference model + if i == 0: + model_path = training_args.sft_model_path + else: + model_path = os.path.join(training_args.output_dir, 'last') + policy = AutoModelForCausalLM.from_pretrained( + pretrained_model_name_or_path=model_path, + trust_remote_code=model_args.trust_remote_code, + **model_kwargs, + ) + + # Load value model and policy model (main model) + value_model = AutoModelForSequenceClassification.from_pretrained( + script_args.value_model_path, + trust_remote_code=model_args.trust_remote_code, + num_labels=1, + ) + # Build custom repository name for this task custom_repo_name = ( model.split('/')[-1] + '_' + clean_dataset_name + '_PPO_' + str(i) @@ -144,10 +148,6 @@ def main( eval_dataset=dataset[script_args.dataset_test_split], peft_config=peft_config, ) - - # if i == 0: - # trainer.save_model(os.path.join(training_args.output_dir, 'checkpoint-0')) - # Set current task in trainer for task-based logging trainer.set_task(f'task_{i}') @@ -169,14 +169,12 @@ def main( trainer.save_metrics('eval', metrics) # Log metrics to WandB - if training_args.local_rank in (None, -1, 0): - wb.log({'eval': {'last': metrics}}) # type: ignore[attr-defined] - wb.log({f'task/{custom_repo_name}/last': metrics}) # type: ignore[attr-defined] + wb.log({'eval': {'last': metrics}}) # type: ignore[attr-defined] + wb.log({f'task/{custom_repo_name}/last': metrics}) # type: ignore[attr-defined] # Save model checkpoint and optionally push - if not training_args.push_to_hub: - trainer.save_model(os.path.join(training_args.output_dir, 'last')) - else: + trainer.save_model(os.path.join(training_args.output_dir, 'last')) + if training_args.push_to_hub: trainer.push_to_hub( model_name=custom_repo_name, dataset_name='Continual_PPO_' + clean_dataset_name + '_' + str(i), From a782d4cd51734617bd6bc456954eb836d8876534 Mon Sep 17 00:00:00 2001 From: Jacob-Chmura Date: Thu, 15 May 2025 22:07:04 -0400 Subject: [PATCH 2/3] Revert diff --- benchmarks/ppo/ppo_continual.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/benchmarks/ppo/ppo_continual.py b/benchmarks/ppo/ppo_continual.py index 9c36e606..b62aece9 100644 --- a/benchmarks/ppo/ppo_continual.py +++ b/benchmarks/ppo/ppo_continual.py @@ -87,6 +87,7 @@ def main( if '.' in clean_dataset_name: clean_dataset_name = clean_dataset_name.split('.')[0] + print(f'Training PPO on {len(continual_dataset)} tasks') # check if the reward models are present either in the path or in the hub if training_args.reward_model_path is not None: for i in range(len(continual_dataset)): @@ -148,6 +149,10 @@ def main( eval_dataset=dataset[script_args.dataset_test_split], peft_config=peft_config, ) + + # if i == 0: + # trainer.save_model(os.path.join(training_args.output_dir, 'checkpoint-0')) + # Set current task in trainer for task-based logging trainer.set_task(f'task_{i}') @@ -169,8 +174,9 @@ def main( trainer.save_metrics('eval', metrics) # Log metrics to WandB - wb.log({'eval': {'last': metrics}}) # type: ignore[attr-defined] - wb.log({f'task/{custom_repo_name}/last': metrics}) # type: ignore[attr-defined] + if training_args.local_rank in (None, -1, 0): + wb.log({'eval': {'last': metrics}}) # type: ignore[attr-defined] + wb.log({f'task/{custom_repo_name}/last': metrics}) # type: ignore[attr-defined] # Save model checkpoint and optionally push trainer.save_model(os.path.join(training_args.output_dir, 'last')) From a47dab66f0279ed826f91af11c88983fdc8c5710 Mon Sep 17 00:00:00 2001 From: Jacob-Chmura Date: Thu, 15 May 2025 22:24:48 -0400 Subject: [PATCH 3/3] Dropped shared state in trainer --- benchmarks/ppo/continual_ppo_trainer.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/benchmarks/ppo/continual_ppo_trainer.py b/benchmarks/ppo/continual_ppo_trainer.py index cb3d747f..4865b2b9 100644 --- a/benchmarks/ppo/continual_ppo_trainer.py +++ b/benchmarks/ppo/continual_ppo_trainer.py @@ -270,8 +270,9 @@ def __init__( gradient_accumulation_steps=args.gradient_accumulation_steps ) self.accelerator = accelerator + self.gather_function = self.accelerator.gather_for_metrics ContinualPPOTrainer.shared_accelerator = accelerator - else: + elif False: self.accelerator = ContinualPPOTrainer.shared_accelerator self.gather_function = self.accelerator.gather_for_metrics if ( @@ -336,7 +337,7 @@ def __init__( self.model = PolicyAndValueWrapper(self.policy_model, self.value_model) ContinualPPOTrainer.policy_value_models = self.model self.model.config = self.policy_model.config # needed for pushing to hub - else: + elif False: # Subsequent tasks: Reuse existing model self.model = ContinualPPOTrainer.policy_value_models self.model.config = self.policy_model.config # needed for pushing to hub @@ -407,7 +408,7 @@ def __init__( self.model, self.optimizer, self.dataloader ) ContinualPPOTrainer.ds_wrapped_models = self.model - else: + elif False: # For subsequent tasks, only prepare optimizer and dataloader self.optimizer, self.dataloader = self.accelerator.prepare( self.optimizer, self.dataloader @@ -971,6 +972,7 @@ def repeat_generator() -> DataLoader: ContinualPPOTrainer.class_ref_model = original_ref_model # Ensure the class variable is updated + # TODO: Double check this is fine to keep ContinualPPOTrainer.class_ref_model = self.ref_model if self.is_deepspeed_enabled: ContinualPPOTrainer.ds_wrapped_models = self.deepspeed