From 4913d63cda4637187ecf5a15a8ad8233f12fa2f3 Mon Sep 17 00:00:00 2001
From: Gordon Zu
Date: Wed, 22 Jan 2025 11:17:45 +0000
Subject: [PATCH 1/3] Add the model databricks/dolly-v2-12b

---
 finetuning_codes/requirements.txt             |  1 +
 .../scripts/train_dolly_v2_12b.sh             | 25 +++++++
 finetuning_codes/utils.py                     | 66 +++++++++++++++++++
 inference_codes/benchmark_client.py           |  7 +-
 4 files changed, 98 insertions(+), 1 deletion(-)
 create mode 100644 finetuning_codes/scripts/train_dolly_v2_12b.sh

diff --git a/finetuning_codes/requirements.txt b/finetuning_codes/requirements.txt
index 3039810..1aeb869 100644
--- a/finetuning_codes/requirements.txt
+++ b/finetuning_codes/requirements.txt
@@ -10,3 +10,4 @@ openai
 tenacity
 trl
 einops
+trl==0.11.4
diff --git a/finetuning_codes/scripts/train_dolly_v2_12b.sh b/finetuning_codes/scripts/train_dolly_v2_12b.sh
new file mode 100644
index 0000000..f58e627
--- /dev/null
+++ b/finetuning_codes/scripts/train_dolly_v2_12b.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+
+START_TIME=$(TZ="Asia/Seoul" date)
+current_time=$(date +"%y%m%d_%H%M%S")
+
+export SAVE_DIR='/root/poc/checkpoints/dolly_finetuned_new_run'
+export LOG_DIR='/root/poc/logs/dolly_finetune_new.log'
+
+TOKENIZERS_PARALLELISM=false TRANSFORMERS_VERBOSITY=info accelerate launch \
+    --config_file config.yaml \
+    train.py \
+    --model-name-or-path databricks/dolly-v2-12b \
+    --dataset-name-or-path fawern/Text-to-sql-query-generation \
+    --lr 0.0001 \
+    --train-batch-size 64 \
+    --eval-batch-size 64 \
+    --block-size 1024 \
+    --num-epochs 10 \
+    --max-steps -1 \
+    --log-interval 20 \
+    --save-path $SAVE_DIR \
+    |& tee $LOG_DIR
+
+echo "Start: $START_TIME"
+echo "End: $(TZ="Asia/Seoul" date)"
diff --git a/finetuning_codes/utils.py b/finetuning_codes/utils.py
index e6c0265..c24ca94 100644
--- a/finetuning_codes/utils.py
+++ b/finetuning_codes/utils.py
@@ -125,6 +125,12 @@ def load_model(args):
                                          torch_dtype='float32', fp32=True)
         tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-tokenizer",
                                                   trust_remote_code=True)
+    elif "gptneoxforcausallm" in configs.architectures[0].lower():
+        #For databricks/dolly-v2-12b, because it uses GPTNeoXForCausalLM architecture
+        model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path,
+                                                     trust_remote_code=True)
+        tokenizer = AutoTokenizer.from_pretrained("databricks/dolly-v2-12b",
+                                                  trust_remote_code=True)
     else:
         model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path, use_cache = False)
         tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
@@ -185,6 +191,20 @@ def load_custom_dataset(args):
         dataset["validation"] = load_dataset(
             args.dataset_name_or_path,
             split="train[90%:95%]").with_format("torch")
+    elif args.dataset_name_or_path == "mlabonne/mini-platypus":
+        dataset = load_dataset(args.dataset_name_or_path).with_format("torch")
+        dataset["train"] = load_dataset(args.dataset_name_or_path,
+                                        split="train[:90%]").with_format("torch")
+        dataset["validation"] = load_dataset(
+            args.dataset_name_or_path,
+            split="train[90%:95%]").with_format("torch")
+    elif args.dataset_name_or_path == "fawern/Text-to-sql-query-generation":
+        dataset = load_dataset(args.dataset_name_or_path).with_format("torch")
+        dataset["train"] = load_dataset(args.dataset_name_or_path,
+                                        split="train[:90%]").with_format("torch")
+        dataset["validation"] = load_dataset(
+            args.dataset_name_or_path,
+            split="train[90%:95%]").with_format("torch")
     else:
         dataset = load_dataset(args.dataset_name_or_path).with_format("torch")
 
@@ -230,6 +250,40 @@ def preprocess(prompt):
         result['position_ids'] = torch.arange(0, len(result['labels']))
         return result
 
+    def preprocess_platypus(prompt):
+        if tokenizer.chat_template is not None:
+            chat = [
+                {
+                    "role": "user",
+                    "content": f"{prompt['instruction']}"
+                },
+                {
+                    "role": "assistant",
+                    "content": f"{prompt['output']}"
+                },
+            ]
+            chat = tokenizer.apply_chat_template(chat, tokenize=False)
+        else:
+            chat = f"##INSTRUCTION {prompt['instruction']}\n\n##RESPONSE {prompt['output']}"
+        result = tokenizer(chat,
+                           truncation=True,
+                           max_length=args.block_size,
+                           padding="max_length")
+        result['labels'] = copy.deepcopy(result['input_ids'])
+        result['position_ids'] = torch.arange(0, len(result['labels']))
+        return result
+
+
+    def preprocess_sql_query_generation(prompt):
+        chat = prompt.get('prompt')
+        result = tokenizer(chat,
+                           truncation=True,
+                           max_length=args.block_size,
+                           padding="max_length")
+        result['labels'] = copy.deepcopy(result['input_ids'])
+        result['position_ids'] = torch.arange(0, len(result['labels']))
+        return result
+
     def preprocess_chatbot(prompt):
         if tokenizer.chat_template is not None:
             chat = [
@@ -297,6 +351,18 @@ def preprocess_agileloop(prompt):
                                                 load_from_cache_file=True)
         dataset['validation'] = dataset['validation'].map(
             preprocess_chatbot, num_proc=1, load_from_cache_file=True)
+    elif args.dataset_name_or_path == "mlabonne/mini-platypus":
+        dataset['train'] = dataset['train'].map(preprocess_platypus,
+                                                num_proc=1,
+                                                load_from_cache_file=True)
+        dataset['validation'] = dataset['validation'].map(
+            preprocess_platypus, num_proc=1, load_from_cache_file=True)
+    elif args.dataset_name_or_path == "fawern/Text-to-sql-query-generation":
+        dataset['train'] = dataset['train'].map(preprocess_sql_query_generation,
+                                                num_proc=1,
+                                                load_from_cache_file=True)
+        dataset['validation'] = dataset['validation'].map(
+            preprocess_sql_query_generation, num_proc=1, load_from_cache_file=True)
     else:
         dataset = dataset.map(preprocess, num_proc=8, load_from_cache_file=True)
 
diff --git a/inference_codes/benchmark_client.py b/inference_codes/benchmark_client.py
index 344d570..90460b6 100644
--- a/inference_codes/benchmark_client.py
+++ b/inference_codes/benchmark_client.py
@@ -398,7 +398,12 @@ def main(args: argparse.Namespace):
         if args.num_prompts == 1:
             maximum_generation_tps = df.sort_values(by = ["Running", "generation_tps"], ascending=False).iloc[0]["generation_tps"]
         else:
-            maximum_generation_tps = df[(df["prompt_tps"] == 0)].sort_values(by = ["Running", "generation_tps"], ascending=False).iloc[0]["generation_tps"]
+            #Fix the issue where there is no row with df["prompt_tps"] == 0.
+            if (df["prompt_tps"] == 0).any():
+                maximum_generation_tps = df[(df["prompt_tps"] == 0)].sort_values(by = ["Running", "generation_tps"], ascending=False).iloc[0]["generation_tps"]
+            else:
+                maximum_generation_tps = df["generation_tps"].max()
+            # maximum_generation_tps = df[(df["prompt_tps"] == 0)].sort_values(by = ["Running", "generation_tps"], ascending=False).iloc[0]["generation_tps"]
         max_generation_tps_list.append(maximum_generation_tps)
         maximum_running_req = df["Running"].max()
         max_running_request_list.append(maximum_running_req)
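Note (not part of the commit above): the new branch in load_model() dispatches on configs.architectures[0], so it only fires for checkpoints that report GPTNeoXForCausalLM. A minimal sketch, assuming the transformers package from requirements.txt is installed, of how to confirm databricks/dolly-v2-12b takes this path before launching the full fine-tune; the assert mirrors the condition used in utils.py:

# Sketch only: verify which load_model() branch the checkpoint will hit.
from transformers import AutoConfig

config = AutoConfig.from_pretrained("databricks/dolly-v2-12b")
print(config.architectures)  # expected to print ['GPTNeoXForCausalLM']
assert "gptneoxforcausallm" in config.architectures[0].lower()
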
From afc851de196af6047c972a8e58cc61f841507991 Mon Sep 17 00:00:00 2001
From: jaeyoung <138426917+whitewave99@users.noreply.github.com>
Date: Mon, 3 Feb 2025 10:46:10 +0900
Subject: [PATCH 2/3] Update train_dolly_v2_12b.sh

Fix the result directory name
---
 finetuning_codes/scripts/train_dolly_v2_12b.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/finetuning_codes/scripts/train_dolly_v2_12b.sh b/finetuning_codes/scripts/train_dolly_v2_12b.sh
index f58e627..401d288 100644
--- a/finetuning_codes/scripts/train_dolly_v2_12b.sh
+++ b/finetuning_codes/scripts/train_dolly_v2_12b.sh
@@ -3,8 +3,8 @@
 START_TIME=$(TZ="Asia/Seoul" date)
 current_time=$(date +"%y%m%d_%H%M%S")
 
-export SAVE_DIR='/root/poc/checkpoints/dolly_finetuned_new_run'
-export LOG_DIR='/root/poc/logs/dolly_finetune_new.log'
+export SAVE_DIR='/root/poc/checkpoints/dolly_finetuned_${current_time}'
+export LOG_DIR='/root/poc/logs/dolly_finetune_${current_time}.log'
 
 TOKENIZERS_PARALLELISM=false TRANSFORMERS_VERBOSITY=info accelerate launch \
     --config_file config.yaml \
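One caveat on the lines as committed: bash does not expand parameters inside single quotes, so SAVE_DIR and LOG_DIR end up containing the literal text ${current_time} rather than the timestamp captured at the top of the script. A minimal sketch, assuming the intent is one checkpoint/log path per run, of the double-quoted form that does substitute:

#!/bin/bash
# Sketch only: double quotes let bash substitute the timestamp captured earlier in the script.
current_time=$(date +"%y%m%d_%H%M%S")
export SAVE_DIR="/root/poc/checkpoints/dolly_finetuned_${current_time}"
export LOG_DIR="/root/poc/logs/dolly_finetune_${current_time}.log"
echo "$SAVE_DIR"   # e.g. /root/poc/checkpoints/dolly_finetuned_250203_104610
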
From a4674b3348e482fd27873462b164b6e8f5fb3d58 Mon Sep 17 00:00:00 2001
From: jaeyoung <138426917+whitewave99@users.noreply.github.com>
Date: Mon, 3 Feb 2025 10:51:22 +0900
Subject: [PATCH 3/3] Update benchmark_client.py

Erase the unnecessary comments
---
 inference_codes/benchmark_client.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/inference_codes/benchmark_client.py b/inference_codes/benchmark_client.py
index 90460b6..73fa23d 100644
--- a/inference_codes/benchmark_client.py
+++ b/inference_codes/benchmark_client.py
@@ -398,12 +398,10 @@ def main(args: argparse.Namespace):
         if args.num_prompts == 1:
             maximum_generation_tps = df.sort_values(by = ["Running", "generation_tps"], ascending=False).iloc[0]["generation_tps"]
         else:
-            #Fix the issue where there is no row with df["prompt_tps"] == 0.
             if (df["prompt_tps"] == 0).any():
                 maximum_generation_tps = df[(df["prompt_tps"] == 0)].sort_values(by = ["Running", "generation_tps"], ascending=False).iloc[0]["generation_tps"]
             else:
                 maximum_generation_tps = df["generation_tps"].max()
-            # maximum_generation_tps = df[(df["prompt_tps"] == 0)].sort_values(by = ["Running", "generation_tps"], ascending=False).iloc[0]["generation_tps"]
         max_generation_tps_list.append(maximum_generation_tps)
         maximum_running_req = df["Running"].max()
         max_running_request_list.append(maximum_running_req)
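For reference, the selection logic that patches 1 and 3 leave in benchmark_client.py can be exercised in isolation. A small sketch with a made-up DataFrame (the column values are hypothetical, not measured results):

# Sketch only: mirrors the max-generation-TPS selection kept in benchmark_client.py.
import pandas as pd

df = pd.DataFrame({
    "Running": [4, 8, 8],
    "prompt_tps": [120.0, 95.0, 110.0],   # hypothetical: no decode-only row in this sample
    "generation_tps": [310.0, 420.0, 395.0],
})

if (df["prompt_tps"] == 0).any():
    # prefer rows where no prompt tokens were being processed (decode-only samples)
    maximum_generation_tps = df[df["prompt_tps"] == 0].sort_values(
        by=["Running", "generation_tps"], ascending=False).iloc[0]["generation_tps"]
else:
    # fallback path added in patch 1: no such row exists, take the overall maximum
    maximum_generation_tps = df["generation_tps"].max()

print(maximum_generation_tps)  # 420.0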