diff --git a/finetuning_codes/requirements.txt b/finetuning_codes/requirements.txt
index fd1f626..aafa216 100644
--- a/finetuning_codes/requirements.txt
+++ b/finetuning_codes/requirements.txt
@@ -10,3 +10,3 @@ openai
 tenacity
-trl==0.14.0
+trl==0.11.4
 einops
diff --git a/finetuning_codes/scripts/train_dolly_v2_12b.sh b/finetuning_codes/scripts/train_dolly_v2_12b.sh
new file mode 100644
index 0000000..401d288
--- /dev/null
+++ b/finetuning_codes/scripts/train_dolly_v2_12b.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+
+START_TIME=$(TZ="Asia/Seoul" date)
+current_time=$(date +"%y%m%d_%H%M%S")
+
+export SAVE_DIR="/root/poc/checkpoints/dolly_finetuned_${current_time}"
+export LOG_DIR="/root/poc/logs/dolly_finetune_${current_time}.log"
+
+TOKENIZERS_PARALLELISM=false TRANSFORMERS_VERBOSITY=info accelerate launch \
+  --config_file config.yaml \
+  train.py \
+  --model-name-or-path databricks/dolly-v2-12b \
+  --dataset-name-or-path fawern/Text-to-sql-query-generation \
+  --lr 0.0001 \
+  --train-batch-size 64 \
+  --eval-batch-size 64 \
+  --block-size 1024 \
+  --num-epochs 10 \
+  --max-steps -1 \
+  --log-interval 20 \
+  --save-path "$SAVE_DIR" \
+  |& tee "$LOG_DIR"
+
+echo "Start: $START_TIME"
+echo "End: $(TZ="Asia/Seoul" date)"
diff --git a/finetuning_codes/utils.py b/finetuning_codes/utils.py
index f993097..ce96b9c 100644
--- a/finetuning_codes/utils.py
+++ b/finetuning_codes/utils.py
@@ -117,6 +117,12 @@ def load_model(args):
                                                   torch_dtype='float32',
                                                   fp32=True)
         tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-tokenizer", trust_remote_code=True)
+    elif "gptneoxforcausallm" in configs.architectures[0].lower():
+        # For databricks/dolly-v2-12b, which uses the GPTNeoXForCausalLM architecture.
+        model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path,
+                                                     trust_remote_code=True)
+        tokenizer = AutoTokenizer.from_pretrained("databricks/dolly-v2-12b",
+                                                  trust_remote_code=True)
     else:
         model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path, use_cache = False)
         tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
@@ -177,6 +183,20 @@ def load_custom_dataset(args):
         dataset["validation"] = load_dataset(
             args.dataset_name_or_path,
             split="train[90%:95%]").with_format("torch")
+    elif args.dataset_name_or_path == "mlabonne/mini-platypus":
+        dataset = load_dataset(args.dataset_name_or_path).with_format("torch")
+        dataset["train"] = load_dataset(args.dataset_name_or_path,
+                                        split="train[:90%]").with_format("torch")
+        dataset["validation"] = load_dataset(
+            args.dataset_name_or_path,
+            split="train[90%:95%]").with_format("torch")
+    elif args.dataset_name_or_path == "fawern/Text-to-sql-query-generation":
+        dataset = load_dataset(args.dataset_name_or_path).with_format("torch")
+        dataset["train"] = load_dataset(args.dataset_name_or_path,
+                                        split="train[:90%]").with_format("torch")
+        dataset["validation"] = load_dataset(
+            args.dataset_name_or_path,
+            split="train[90%:95%]").with_format("torch")
     else:
         dataset = load_dataset(args.dataset_name_or_path).with_format("torch")
 
@@ -222,6 +242,40 @@ def preprocess(prompt):
         result['position_ids'] = torch.arange(0, len(result['labels']))
         return result
 
+    def preprocess_platypus(prompt):
+        if tokenizer.chat_template is not None:
+            chat = [
+                {
+                    "role": "user",
+                    "content": f"{prompt['instruction']}"
+                },
+                {
+                    "role": "assistant",
+                    "content": f"{prompt['output']}"
+                },
+            ]
+            chat = tokenizer.apply_chat_template(chat, tokenize=False)
+        else:
+            chat = f"##INSTRUCTION {prompt['instruction']}\n\n##RESPONSE {prompt['output']}"
+        result = tokenizer(chat,
+                           truncation=True,
+                           max_length=args.block_size,
+                           padding="max_length")
+        result['labels'] = copy.deepcopy(result['input_ids'])
+        result['position_ids'] = torch.arange(0, len(result['labels']))
+        return result
+
+
+    def preprocess_sql_query_generation(prompt):
+        chat = prompt.get('prompt')
+        result = tokenizer(chat,
+                           truncation=True,
+                           max_length=args.block_size,
+                           padding="max_length")
+        result['labels'] = copy.deepcopy(result['input_ids'])
+        result['position_ids'] = torch.arange(0, len(result['labels']))
+        return result
+
     def preprocess_chatbot(prompt):
         if tokenizer.chat_template is not None:
             chat = [
@@ -289,6 +343,18 @@ def preprocess_agileloop(prompt):
                                                 load_from_cache_file=True)
         dataset['validation'] = dataset['validation'].map(
             preprocess_chatbot, num_proc=1, load_from_cache_file=True)
+    elif args.dataset_name_or_path == "mlabonne/mini-platypus":
+        dataset['train'] = dataset['train'].map(preprocess_platypus,
+                                                num_proc=1,
+                                                load_from_cache_file=True)
+        dataset['validation'] = dataset['validation'].map(
+            preprocess_platypus, num_proc=1, load_from_cache_file=True)
+    elif args.dataset_name_or_path == "fawern/Text-to-sql-query-generation":
+        dataset['train'] = dataset['train'].map(preprocess_sql_query_generation,
+                                                num_proc=1,
+                                                load_from_cache_file=True)
+        dataset['validation'] = dataset['validation'].map(
+            preprocess_sql_query_generation, num_proc=1, load_from_cache_file=True)
     else:
         dataset = dataset.map(preprocess, num_proc=8, load_from_cache_file=True)
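For reference, the new fawern/Text-to-sql-query-generation path can be checked in isolation. Below is a minimal sketch of what preprocess_sql_query_generation produces for one record; the sample row is hypothetical, and the small gpt2 tokenizer with block_size=32 stands in for the dolly-v2-12b tokenizer with --block-size 1024.

# Standalone sketch of the new SQL-dataset preprocessing.
# The sample row, the gpt2 tokenizer, and block_size=32 are stand-ins
# for the real dataset row, dolly-v2-12b, and --block-size 1024.
import copy

import torch
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # gpt2 defines no pad token

block_size = 32
prompt = {"prompt": "Question: list all users\nSQL: SELECT * FROM users;"}

# Mirrors preprocess_sql_query_generation: tokenize the raw 'prompt' field,
# copy the input ids as labels, and add explicit position ids.
result = tokenizer(prompt.get("prompt"),
                   truncation=True,
                   max_length=block_size,
                   padding="max_length")
result["labels"] = copy.deepcopy(result["input_ids"])
result["position_ids"] = torch.arange(0, len(result["labels"]))

print(sorted(result.keys()))     # ['attention_mask', 'input_ids', 'labels', 'position_ids']
print(len(result["input_ids"]))  # 32

As in the existing preprocess helpers in utils.py, the labels are a straight copy of the padded input_ids, so padding positions are included in the loss unless masked downstream.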
diff --git a/inference_codes/benchmark_client.py b/inference_codes/benchmark_client.py
index 344d570..73fa23d 100644
--- a/inference_codes/benchmark_client.py
+++ b/inference_codes/benchmark_client.py
@@ -398,7 +398,10 @@ def main(args: argparse.Namespace):
         if args.num_prompts == 1:
             maximum_generation_tps = df.sort_values(by = ["Running", "generation_tps"], ascending=False).iloc[0]["generation_tps"]
         else:
-            maximum_generation_tps = df[(df["prompt_tps"] == 0)].sort_values(by = ["Running", "generation_tps"], ascending=False).iloc[0]["generation_tps"]
+            if (df["prompt_tps"] == 0).any():
+                maximum_generation_tps = df[(df["prompt_tps"] == 0)].sort_values(by = ["Running", "generation_tps"], ascending=False).iloc[0]["generation_tps"]
+            else:
+                maximum_generation_tps = df["generation_tps"].max()
         max_generation_tps_list.append(maximum_generation_tps)
         maximum_running_req = df["Running"].max()
         max_running_request_list.append(maximum_running_req)
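The benchmark_client.py guard can also be reproduced standalone. A minimal sketch with a made-up metrics DataFrame (all values hypothetical): when no sampled row reports prompt_tps == 0, the old code selected from an empty frame and .iloc[0] raised IndexError; the fix falls back to the overall generation_tps maximum.

# Minimal reproduction of the guarded fallback added to benchmark_client.py,
# using a hypothetical metrics DataFrame.
import pandas as pd

df = pd.DataFrame({
    "Running": [4, 8, 8],
    "prompt_tps": [120.0, 95.5, 88.2],  # no row has prompt_tps == 0
    "generation_tps": [310.2, 402.7, 398.1],
})

# Before the fix, df[df["prompt_tps"] == 0] was empty here and .iloc[0]
# raised IndexError; the guard falls back to the column maximum instead.
if (df["prompt_tps"] == 0).any():
    maximum_generation_tps = df[df["prompt_tps"] == 0].sort_values(
        by=["Running", "generation_tps"], ascending=False).iloc[0]["generation_tps"]
else:
    maximum_generation_tps = df["generation_tps"].max()

print(maximum_generation_tps)  # 402.7

Falling back to the column maximum keeps the benchmark summary populated instead of crashing a run where every sampled interval still reports nonzero prompt throughput.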