diff --git a/.idea/sagemaker-stablediffusion-quick-kit.iml b/.idea/sagemaker-stablediffusion-quick-kit.iml new file mode 100644 index 0000000..d6ebd48 --- /dev/null +++ b/.idea/sagemaker-stablediffusion-quick-kit.iml @@ -0,0 +1,9 @@ + + + + + + + + + \ No newline at end of file diff --git a/fine-tuning/dreambooth/stablediffusion_xl_dreambooth_finetuning.zh.ipynb b/fine-tuning/dreambooth/stablediffusion_xl_dreambooth_finetuning.zh.ipynb new file mode 100644 index 0000000..18b21c8 --- /dev/null +++ b/fine-tuning/dreambooth/stablediffusion_xl_dreambooth_finetuning.zh.ipynb @@ -0,0 +1,1274 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a0e92bd0", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "### DreamBooth 模型微调\n", + "DreamBooth 是一种深度学习生成模型,用于微调现有的文本到图像模型,由 Google Research 和波士顿大学的研究人员于 2022 年开发。最初使用 Google 自己的 Imagen 文本到图像模型开发,DreamBooth 的实现可以应用到其他文本到图像模型,它可以让模型仅通过某一主题的三到五张图像进行训练,即可生成更精细和个性化的输出。\n", + "\n", + "![](../../images/dreambooth.png)\n", + "\n", + "接下来我们将使用 DreamBooth 来微调我们的 Stable Diffusion XL 模型。\n", + "\n", + "#### Notebook 步骤\n", + "1. 导入 boto3, sagemaker python SDK\n", + "2. 构建 dreambooth fine-tuning 镜像\n", + "3. 实现模型微调\n", + " * 配置超参\n", + " * 创建训练任务\n", + "4. 测试" + ] + }, + { + "cell_type": "markdown", + "id": "eb9eb077", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "#### 1. 导入 boto3, sagemaker python SDK" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "8314fc9b-c468-497b-abcc-259ec792154c", + "metadata": { + "pycharm": { + "name": "#%%\n" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/ec2-user/anaconda3/envs/amazonei_pytorch_latest_p37/lib/python3.7/site-packages/boto3/compat.py:82: PythonDeprecationWarning: Boto3 will no longer support Python 3.7 starting December 13, 2023. To continue receiving service updates, bug fixes, and security updates please upgrade to Python 3.8 or later.
More information can be found here: https://aws.amazon.com/blogs/developer/python-support-policy-updates-for-aws-sdks-and-tools/\n", + " warnings.warn(warning, PythonDeprecationWarning)\n" + ] + } + ], + "source": [ + "import sagemaker\n", + "import boto3\n", + "from sagemaker.pytorch import PyTorch\n", + "sagemaker_session = sagemaker.Session()\n", + "bucket = sagemaker_session.default_bucket()\n", + "role = sagemaker.get_execution_role()\n", + "account_id = boto3.client('sts').get_caller_identity().get('Account')\n", + "region_name = boto3.session.Session().region_name\n", + "\n", + "images_s3uri = 's3://{0}/dreambooth-xl/images/'.format(bucket)\n", + "models_s3uri = 's3://{0}/stable-diffusion/models/'.format(bucket)\n", + "dreambooth_s3uri = 's3://{0}/stable-diffusion/dreambooth/'.format(bucket)" + ] + }, + { + "cell_type": "markdown", + "id": "bd2a3178", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "#### 2. 构建 dreambooth xl fine-tuning 镜像" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "15c49cae-3336-4e34-aefd-c53e396f7b04", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "!cd sd_xl_dreambooth && git clone https://github.com/huggingface/diffusers" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a7612e5a", + "metadata": { + "pycharm": { + "name": "#%%\n" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "%%writefile Dockerfile\n", + "## Change the region code below to the region you use; this sample uses us-west-2\n", + "#FROM 763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-training:1.13.1-transformers4.26.0-gpu-py39-cu117-ubuntu20.04\n", + "FROM 763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04\n", + "\n", + "RUN pip install wandb\n", + "RUN pip install xformers==0.0.18\n", + "RUN pip install bitsandbytes\n", + "#RUN export TORCH_CUDA_ARCH_LIST=\"7.5 8.0
8.6\" && export FORCE_CUDA=\"1\" && pip install ninja triton==2.0.0.dev20221120 && git clone https://github.com/xieyongliang/xformers.git /tmp/xformers && cd /tmp/xformers && git submodule update --init --recursive && pip install -r requirements.txt && pip install -e . \n", + "\n", + "\n", + "ENV LANG=C.UTF-8\n", + "ENV PYTHONUNBUFFERED=TRUE\n", + "ENV PYTHONDONTWRITEBYTECODE=TRUE" + ] + }, + { + "cell_type": "markdown", + "id": "9d70d510-caf7-4b48-95d4-f9bc2eaa0648", + "metadata": {}, + "source": [ + "* build & push docker镜像" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f573e3c1-5e49-43cd-b71b-c858547192c9", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "## Change the region code below to the region you use; this sample uses us-west-2\n", + "!aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "a69253dd-850f-41b7-b57a-437273648a46", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "## define repo name. NOTE: the name below does not contain *sagemaker*; if your role only has ECR access to *sagemaker* repositories, rename it or grant extra ECR permissions\n", + "repo_name = \"sd_xl_dreambooth_finetuning\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "20a370fa-bdf7-47a6-892d-f05adcf5904c", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "%%script env repo_name=$repo_name bash\n", + "\n", + "#!/usr/bin/env bash\n", + "\n", + "# This script shows how to build the Docker image and push it to ECR to be ready for use\n", + "# by SageMaker.\n", + "\n", + "# The argument to this script is the image name.
This will be used as the image on the local\n", + "# machine and combined with the account and region to form the repository name for ECR.\n", + "# The name of our algorithm\n", + "algorithm_name=${repo_name}\n", + "\n", + "account=$(aws sts get-caller-identity --query Account --output text)\n", + "\n", + "# Get the region defined in the current configuration (default to us-west-2 if none defined)\n", + "region=$(aws configure get region)\n", + "region=${region:-us-west-2}\n", + "\n", + "fullname=\"${account}.dkr.ecr.${region}.amazonaws.com/${algorithm_name}:latest\"\n", + "\n", + "# If the repository doesn't exist in ECR, create it.\n", + "aws ecr describe-repositories --repository-names \"${algorithm_name}\" > /dev/null 2>&1\n", + "\n", + "if [ $? -ne 0 ]\n", + "then\n", + " aws ecr create-repository --repository-name \"${algorithm_name}\" > /dev/null\n", + "fi\n", + "\n", + "# Log in to the ECR registry host (not the repo:tag image reference) before pushing\n", + "aws ecr get-login-password --region ${region}|docker login --username AWS --password-stdin ${account}.dkr.ecr.${region}.amazonaws.com\n", + "\n", + "# Build the docker image locally with the image name and then push it to ECR\n", + "# with the full name. Stop if the build fails so a stale local image is never pushed.\n", + "\n", + "docker build -t ${algorithm_name} . || exit 1\n", + "docker tag ${algorithm_name} ${fullname}\n", + "\n", + "docker push ${fullname}" + ] + }, + { + "cell_type": "markdown", + "id": "a8e01e5f-f53f-4443-a149-94d8c7126d8b", + "metadata": { + "tags": [] + }, + "source": [ + "* 准备训练图像" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "285d6076-0746-4afa-ba61-0d53f91bd67a", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from huggingface_hub import snapshot_download\n", + "\n", + "local_dir = \"./dog\"\n", + "snapshot_download(\n", + " \"diffusers/dog-example\",\n", + " local_dir=local_dir, repo_type=\"dataset\",\n", + " ignore_patterns=\".gitattributes\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id":
"c9f44953-ee8f-4b7a-bbfe-afad211b3224", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "!chmod -R 777 ./sd_xl_dreambooth\n", + "!./sd_xl_dreambooth/s5cmd sync ./dog/ $images_s3uri" + ] + }, + { + "cell_type": "markdown", + "id": "1d843895", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "#### 3. 模型微调\n", + "\n", + " * image_uri: ecr仓库中的 docker 镜像地址\n", + " * instance_type: 用于训练任务的实例大小 , 建议使用 ml.g4dn.xlarge, ml.g5.xlarge\n", + " * class_prompt: 提示词类别\n", + " * instance_prompt: 用于你的图片的关键词\n", + " * model_name: 预训练的模型名称\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "id": "720c66e8-8958-47f2-bfa7-c5252fde430e", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Overwriting ./sd_xl_dreambooth/train.sh\n" + ] + } + ], + "source": [ + "%%writefile ./sd_xl_dreambooth/train.sh\n", + "# Training entry point: installs diffusers from source, runs train_dreambooth_lora_sdxl.py, then uploads the output to S3.\n", + "# Set dreambooth_s3uri in the environment to override the default upload prefix defined below.\n", + "mkdir -p /tmp/dog\n", + "ls -lt ./\n", + "chmod 777 ./s5cmd\n", + "\n", + "\n", + "cd diffusers && pip install -e .\n", + "cd examples/dreambooth/ && pip install -r requirements_sdxl.txt\n", + "\n", + "cp -r /opt/ml/input/data/images/* /tmp/dog/\n", + "\n", + "export MODEL_NAME=\"stabilityai/stable-diffusion-xl-base-1.0\"\n", + "export INSTANCE_DIR=\"/tmp/dog/\"\n", + "export OUTPUT_DIR=\"/tmp/ouput\"\n", + "#export OUTPUT_DIR=\"/opt/ml/model/\"\n", + "export VAE_PATH=\"madebyollin/sdxl-vae-fp16-fix\"\n", + "export dreambooth_s3uri=\"${dreambooth_s3uri:-s3://sagemaker-us-west-2-687912291502/stable-diffusion/dreambooth/}\"\n", + "\n", + "accelerate launch /opt/ml/code/diffusers/examples/dreambooth/train_dreambooth_lora_sdxl.py \\\n", + " --gradient_checkpointing \\\n", + " --use_8bit_adam \\\n", + " --pretrained_model_name_or_path=$MODEL_NAME \\\n", + " --instance_data_dir=$INSTANCE_DIR \\\n", + " --pretrained_vae_model_name_or_path=$VAE_PATH \\\n", + " --output_dir=$OUTPUT_DIR \\\n", + " --mixed_precision=\"fp16\" \\\n", + " --instance_prompt=\"a photo of sks
dog\" \\\n", + " --resolution=1024 \\\n", + " --train_batch_size=1 \\\n", + " --gradient_accumulation_steps=4 \\\n", + " --learning_rate=1e-5 \\\n", + " --report_to=\"tensorboard\" \\\n", + " --lr_scheduler=\"constant\" \\\n", + " --lr_warmup_steps=0 \\\n", + " --max_train_steps=500 \\\n", + " --validation_prompt=\"A photo of sks dog in a bucket\" \\\n", + " --validation_epochs=25 \\\n", + " --seed=\"0\" \\\n", + " --enable_xformers_memory_efficient_attention\n", + "# Upload the training output; %/ drops the prefix's trailing slash so the S3 key contains no empty '//' segment\n", + "/opt/ml/code/s5cmd sync \"${OUTPUT_DIR}/\" \"${dreambooth_s3uri%/}/output/$(date +%Y-%m-%d-%H-%M-%S)/\"\n" + ] + }, + { + "cell_type": "markdown", + "id": "8c67611b-41a2-4977-9bfb-74342b2a5126", + "metadata": {}, + "source": [ + "* 本地跑测试" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "id": "641a23d6-0391-4c13-86e1-c81e226110c9", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "!pip list|grep -i xformers" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "264becae-cd2f-4b83-97c7-1e2469fa7f11", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "!./sd_xl_dreambooth/train.sh" + ] + }, + { + "cell_type": "markdown", + "id": "9c569c81", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + " * 创建训练任务" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6535d22c-ab12-48c4-9989-5fabe4f31f69", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Using provided s3_resource\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:sagemaker:Creating training-job with name: sd-xl-dreambooth-finetuning-high-2023-08-25-10-08-30-083\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2023-08-25 10:08:34 Starting - Starting the training job...\n", + "2023-08-25 10:08:48 Starting - Preparing the instances for training......\n", + "2023-08-25 10:09:57 Downloading - Downloading input data...\n", +
"2023-08-25 10:10:22 Training - Downloading the training image...........................\n", + "2023-08-25 10:14:48 Training - Training image download completed. Training in progress....\u001b[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device\u001b[0m\n", + "\u001b[34mbash: no job control in this shell\u001b[0m\n", + "\u001b[34m2023-08-25 10:15:27,344 sagemaker-training-toolkit INFO Imported framework sagemaker_pytorch_container.training\u001b[0m\n", + "\u001b[34m2023-08-25 10:15:27,358 sagemaker-training-toolkit INFO No Neurons detected (normal if no neurons installed)\u001b[0m\n", + "\u001b[34m2023-08-25 10:15:27,368 sagemaker_pytorch_container.training INFO Block until all host DNS lookups succeed.\u001b[0m\n", + "\u001b[34m2023-08-25 10:15:27,369 sagemaker_pytorch_container.training INFO Invoking user training script.\u001b[0m\n", + "\u001b[34m2023-08-25 10:15:29,659 sagemaker-training-toolkit INFO No Neurons detected (normal if no neurons installed)\u001b[0m\n", + "\u001b[34m2023-08-25 10:15:29,683 sagemaker-training-toolkit INFO No Neurons detected (normal if no neurons installed)\u001b[0m\n", + "\u001b[34m2023-08-25 10:15:29,706 sagemaker-training-toolkit INFO No Neurons detected (normal if no neurons installed)\u001b[0m\n", + "\u001b[34m2023-08-25 10:15:29,715 sagemaker-training-toolkit INFO Invoking user script\u001b[0m\n", + "\u001b[34mTraining Env:\u001b[0m\n", + "\u001b[34m{\n", + " \"additional_framework_parameters\": {},\n", + " \"channel_input_dirs\": {\n", + " \"images\": \"/opt/ml/input/data/images\"\n", + " },\n", + " \"current_host\": \"algo-1\",\n", + " \"current_instance_group\": \"homogeneousCluster\",\n", + " \"current_instance_group_hosts\": [\n", + " \"algo-1\"\n", + " ],\n", + " \"current_instance_type\": \"ml.g5.2xlarge\",\n", + " \"distribution_hosts\": [],\n", + " \"distribution_instance_groups\": [],\n", + " \"framework_module\": \"sagemaker_pytorch_container.training:main\",\n", + " \"hosts\": [\n", + " 
\"algo-1\"\n", + " ],\n", + " \"hyperparameters\": {},\n", + " \"input_config_dir\": \"/opt/ml/input/config\",\n", + " \"input_data_config\": {\n", + " \"images\": {\n", + " \"TrainingInputMode\": \"File\",\n", + " \"S3DistributionType\": \"FullyReplicated\",\n", + " \"RecordWrapperType\": \"None\"\n", + " }\n", + " },\n", + " \"input_dir\": \"/opt/ml/input\",\n", + " \"instance_groups\": [\n", + " \"homogeneousCluster\"\n", + " ],\n", + " \"instance_groups_dict\": {\n", + " \"homogeneousCluster\": {\n", + " \"instance_group_name\": \"homogeneousCluster\",\n", + " \"instance_type\": \"ml.g5.2xlarge\",\n", + " \"hosts\": [\n", + " \"algo-1\"\n", + " ]\n", + " }\n", + " },\n", + " \"is_hetero\": false,\n", + " \"is_master\": true,\n", + " \"is_modelparallel_enabled\": null,\n", + " \"is_smddpmprun_installed\": true,\n", + " \"job_name\": \"sd-xl-dreambooth-finetuning-high-2023-08-25-10-08-30-083\",\n", + " \"log_level\": 20,\n", + " \"master_hostname\": \"algo-1\",\n", + " \"model_dir\": \"/opt/ml/model\",\n", + " \"module_dir\": \"s3://sagemaker-us-west-2-687912291502/sd-xl-dreambooth-finetuning-high-2023-08-25-10-08-30-083/source/sourcedir.tar.gz\",\n", + " \"module_name\": \"train.sh\",\n", + " \"network_interface_name\": \"eth0\",\n", + " \"num_cpus\": 8,\n", + " \"num_gpus\": 1,\n", + " \"num_neurons\": 0,\n", + " \"output_data_dir\": \"/opt/ml/output/data\",\n", + " \"output_dir\": \"/opt/ml/output\",\n", + " \"output_intermediate_dir\": \"/opt/ml/output/intermediate\",\n", + " \"resource_config\": {\n", + " \"current_host\": \"algo-1\",\n", + " \"current_instance_type\": \"ml.g5.2xlarge\",\n", + " \"current_group_name\": \"homogeneousCluster\",\n", + " \"hosts\": [\n", + " \"algo-1\"\n", + " ],\n", + " \"instance_groups\": [\n", + " {\n", + " \"instance_group_name\": \"homogeneousCluster\",\n", + " \"instance_type\": \"ml.g5.2xlarge\",\n", + " \"hosts\": [\n", + " \"algo-1\"\n", + " ]\n", + " }\n", + " ],\n", + " \"network_interface_name\": \"eth0\"\n", + " 
},\n", + " \"user_entry_point\": \"train.sh\"\u001b[0m\n", + "\u001b[34m}\u001b[0m\n", + "\u001b[34mEnvironment variables:\u001b[0m\n", + "\u001b[34mSM_HOSTS=[\"algo-1\"]\u001b[0m\n", + "\u001b[34mSM_NETWORK_INTERFACE_NAME=eth0\u001b[0m\n", + "\u001b[34mSM_HPS={}\u001b[0m\n", + "\u001b[34mSM_USER_ENTRY_POINT=train.sh\u001b[0m\n", + "\u001b[34mSM_FRAMEWORK_PARAMS={}\u001b[0m\n", + "\u001b[34mSM_RESOURCE_CONFIG={\"current_group_name\":\"homogeneousCluster\",\"current_host\":\"algo-1\",\"current_instance_type\":\"ml.g5.2xlarge\",\"hosts\":[\"algo-1\"],\"instance_groups\":[{\"hosts\":[\"algo-1\"],\"instance_group_name\":\"homogeneousCluster\",\"instance_type\":\"ml.g5.2xlarge\"}],\"network_interface_name\":\"eth0\"}\u001b[0m\n", + "\u001b[34mSM_INPUT_DATA_CONFIG={\"images\":{\"RecordWrapperType\":\"None\",\"S3DistributionType\":\"FullyReplicated\",\"TrainingInputMode\":\"File\"}}\u001b[0m\n", + "\u001b[34mSM_OUTPUT_DATA_DIR=/opt/ml/output/data\u001b[0m\n", + "\u001b[34mSM_CHANNELS=[\"images\"]\u001b[0m\n", + "\u001b[34mSM_CURRENT_HOST=algo-1\u001b[0m\n", + "\u001b[34mSM_CURRENT_INSTANCE_TYPE=ml.g5.2xlarge\u001b[0m\n", + "\u001b[34mSM_CURRENT_INSTANCE_GROUP=homogeneousCluster\u001b[0m\n", + "\u001b[34mSM_CURRENT_INSTANCE_GROUP_HOSTS=[\"algo-1\"]\u001b[0m\n", + "\u001b[34mSM_INSTANCE_GROUPS=[\"homogeneousCluster\"]\u001b[0m\n", + "\u001b[34mSM_INSTANCE_GROUPS_DICT={\"homogeneousCluster\":{\"hosts\":[\"algo-1\"],\"instance_group_name\":\"homogeneousCluster\",\"instance_type\":\"ml.g5.2xlarge\"}}\u001b[0m\n", + "\u001b[34mSM_DISTRIBUTION_INSTANCE_GROUPS=[]\u001b[0m\n", + "\u001b[34mSM_IS_HETERO=false\u001b[0m\n", + "\u001b[34mSM_MODULE_NAME=train.sh\u001b[0m\n", + "\u001b[34mSM_LOG_LEVEL=20\u001b[0m\n", + "\u001b[34mSM_FRAMEWORK_MODULE=sagemaker_pytorch_container.training:main\u001b[0m\n", + "\u001b[34mSM_INPUT_DIR=/opt/ml/input\u001b[0m\n", + "\u001b[34mSM_INPUT_CONFIG_DIR=/opt/ml/input/config\u001b[0m\n", + "\u001b[34mSM_OUTPUT_DIR=/opt/ml/output\u001b[0m\n", + 
"\u001b[34mSM_NUM_CPUS=8\u001b[0m\n", + "\u001b[34mSM_NUM_GPUS=1\u001b[0m\n", + "\u001b[34mSM_NUM_NEURONS=0\u001b[0m\n", + "\u001b[34mSM_MODEL_DIR=/opt/ml/model\u001b[0m\n", + "\u001b[34mSM_MODULE_DIR=s3://sagemaker-us-west-2-687912291502/sd-xl-dreambooth-finetuning-high-2023-08-25-10-08-30-083/source/sourcedir.tar.gz\u001b[0m\n", + "\u001b[34mSM_TRAINING_ENV={\"additional_framework_parameters\":{},\"channel_input_dirs\":{\"images\":\"/opt/ml/input/data/images\"},\"current_host\":\"algo-1\",\"current_instance_group\":\"homogeneousCluster\",\"current_instance_group_hosts\":[\"algo-1\"],\"current_instance_type\":\"ml.g5.2xlarge\",\"distribution_hosts\":[],\"distribution_instance_groups\":[],\"framework_module\":\"sagemaker_pytorch_container.training:main\",\"hosts\":[\"algo-1\"],\"hyperparameters\":{},\"input_config_dir\":\"/opt/ml/input/config\",\"input_data_config\":{\"images\":{\"RecordWrapperType\":\"None\",\"S3DistributionType\":\"FullyReplicated\",\"TrainingInputMode\":\"File\"}},\"input_dir\":\"/opt/ml/input\",\"instance_groups\":[\"homogeneousCluster\"],\"instance_groups_dict\":{\"homogeneousCluster\":{\"hosts\":[\"algo-1\"],\"instance_group_name\":\"homogeneousCluster\",\"instance_type\":\"ml.g5.2xlarge\"}},\"is_hetero\":false,\"is_master\":true,\"is_modelparallel_enabled\":null,\"is_smddpmprun_installed\":true,\"job_name\":\"sd-xl-dreambooth-finetuning-high-2023-08-25-10-08-30-083\",\"log_level\":20,\"master_hostname\":\"algo-1\",\"model_dir\":\"/opt/ml/model\",\"module_dir\":\"s3://sagemaker-us-west-2-687912291502/sd-xl-dreambooth-finetuning-high-2023-08-25-10-08-30-083/source/sourcedir.tar.gz\",\"module_name\":\"train.sh\",\"network_interface_name\":\"eth0\",\"num_cpus\":8,\"num_gpus\":1,\"num_neurons\":0,\"output_data_dir\":\"/opt/ml/output/data\",\"output_dir\":\"/opt/ml/output\",\"output_intermediate_dir\":\"/opt/ml/output/intermediate\",\"resource_config\":{\"current_group_name\":\"homogeneousCluster\",\"current_host\":\"algo-1\",\"current_instance_typ
e\":\"ml.g5.2xlarge\",\"hosts\":[\"algo-1\"],\"instance_groups\":[{\"hosts\":[\"algo-1\"],\"instance_group_name\":\"homogeneousCluster\",\"instance_type\":\"ml.g5.2xlarge\"}],\"network_interface_name\":\"eth0\"},\"user_entry_point\":\"train.sh\"}\u001b[0m\n", + "\u001b[34mSM_USER_ARGS=[]\u001b[0m\n", + "\u001b[34mSM_OUTPUT_INTERMEDIATE_DIR=/opt/ml/output/intermediate\u001b[0m\n", + "\u001b[34mSM_CHANNEL_IMAGES=/opt/ml/input/data/images\u001b[0m\n", + "\u001b[34mPYTHONPATH=/opt/ml/code:/opt/conda/bin:/opt/conda/lib/python310.zip:/opt/conda/lib/python3.10:/opt/conda/lib/python3.10/lib-dynload:/opt/conda/lib/python3.10/site-packages\u001b[0m\n", + "\u001b[34mInvoking script with the following command:\u001b[0m\n", + "\u001b[34m/bin/sh -c \"./train.sh \"\u001b[0m\n", + "\u001b[34m2023-08-25 10:15:29,742 sagemaker-training-toolkit INFO Exceptions not imported for SageMaker TF as Tensorflow is not installed.\u001b[0m\n", + "\u001b[34mtotal 12136\u001b[0m\n", + "\u001b[34m-rwxrwxrwx 1 1000 1000 1329 Aug 25 10:08 train.sh\u001b[0m\n", + "\u001b[34mdrwxrwxrwx 11 1000 1000 4096 Aug 25 02:24 diffusers\u001b[0m\n", + "\u001b[34m-rwxrwxrwx 1 1000 1000 12419072 Aug 25 02:10 s5cmd\u001b[0m\n", + "\u001b[34mObtaining file:///opt/ml/code/diffusers\u001b[0m\n", + "\u001b[34mInstalling build dependencies: started\u001b[0m\n", + "\u001b[34mInstalling build dependencies: finished with status 'done'\u001b[0m\n", + "\u001b[34mChecking if build backend supports build_editable: started\u001b[0m\n", + "\u001b[34mChecking if build backend supports build_editable: finished with status 'done'\u001b[0m\n", + "\u001b[34mGetting requirements to build editable: started\u001b[0m\n", + "\u001b[34mGetting requirements to build editable: finished with status 'done'\u001b[0m\n", + "\u001b[34mPreparing editable metadata (pyproject.toml): started\u001b[0m\n", + "\u001b[34mPreparing editable metadata (pyproject.toml): finished with status 'done'\u001b[0m\n", + "\u001b[34mRequirement already satisfied: 
importlib-metadata in /opt/conda/lib/python3.10/site-packages (from diffusers==0.21.0.dev0) (4.13.0)\u001b[0m\n", + "\u001b[34mRequirement already satisfied: filelock in /opt/conda/lib/python3.10/site-packages (from diffusers==0.21.0.dev0) (3.12.0)\u001b[0m\n", + "\u001b[34mRequirement already satisfied: huggingface-hub>=0.13.2 in /opt/conda/lib/python3.10/site-packages (from diffusers==0.21.0.dev0) (0.14.1)\u001b[0m\n", + "\u001b[34mRequirement already satisfied: numpy in /opt/conda/lib/python3.10/site-packages (from diffusers==0.21.0.dev0) (1.23.5)\u001b[0m\n", + "\u001b[34mRequirement already satisfied: regex!=2019.12.17 in /opt/conda/lib/python3.10/site-packages (from diffusers==0.21.0.dev0) (2023.5.5)\u001b[0m\n", + "\u001b[34mRequirement already satisfied: requests in /opt/conda/lib/python3.10/site-packages (from diffusers==0.21.0.dev0) (2.28.2)\u001b[0m\n", + "\u001b[34mCollecting safetensors>=0.3.1 (from diffusers==0.21.0.dev0)\u001b[0m\n", + "\u001b[34mDownloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\u001b[0m\n", + "\u001b[34m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.3/1.3 MB 27.8 MB/s eta 0:00:00\u001b[0m\n", + "\u001b[34mRequirement already satisfied: Pillow in /opt/conda/lib/python3.10/site-packages (from diffusers==0.21.0.dev0) (9.4.0)\u001b[0m\n", + "\u001b[34mRequirement already satisfied: fsspec in /opt/conda/lib/python3.10/site-packages (from huggingface-hub>=0.13.2->diffusers==0.21.0.dev0) (2023.5.0)\u001b[0m\n", + "\u001b[34mRequirement already satisfied: tqdm>=4.42.1 in /opt/conda/lib/python3.10/site-packages (from huggingface-hub>=0.13.2->diffusers==0.21.0.dev0) (4.65.0)\u001b[0m\n", + "\u001b[34mRequirement already satisfied: pyyaml>=5.1 in /opt/conda/lib/python3.10/site-packages (from huggingface-hub>=0.13.2->diffusers==0.21.0.dev0) (5.4.1)\u001b[0m\n", + "\u001b[34mRequirement already satisfied: typing-extensions>=3.7.4.3 in /opt/conda/lib/python3.10/site-packages (from 
huggingface-hub>=0.13.2->diffusers==0.21.0.dev0) (4.5.0)\u001b[0m\n", + "\u001b[34mRequirement already satisfied: packaging>=20.9 in /opt/conda/lib/python3.10/site-packages (from huggingface-hub>=0.13.2->diffusers==0.21.0.dev0) (23.1)\u001b[0m\n", + "\u001b[34mRequirement already satisfied: zipp>=0.5 in /opt/conda/lib/python3.10/site-packages (from importlib-metadata->diffusers==0.21.0.dev0) (3.15.0)\u001b[0m\n", + "\u001b[34mRequirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests->diffusers==0.21.0.dev0) (3.1.0)\u001b[0m\n", + "\u001b[34mRequirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests->diffusers==0.21.0.dev0) (3.4)\u001b[0m\n", + "\u001b[34mRequirement already satisfied: urllib3<1.27,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests->diffusers==0.21.0.dev0) (1.26.15)\u001b[0m\n", + "\u001b[34mRequirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages (from requests->diffusers==0.21.0.dev0) (2023.5.7)\u001b[0m\n", + "\u001b[34mBuilding wheels for collected packages: diffusers\u001b[0m\n", + "\u001b[34mBuilding editable for diffusers (pyproject.toml): started\u001b[0m\n", + "\u001b[34mBuilding editable for diffusers (pyproject.toml): finished with status 'done'\u001b[0m\n", + "\u001b[34mCreated wheel for diffusers: filename=diffusers-0.21.0.dev0-0.editable-py3-none-any.whl size=10590 sha256=04174d74787363bfffd7777681648ee363934f287b5b9887e063e4bc9e6137b6\u001b[0m\n", + "\u001b[34mStored in directory: /tmp/pip-ephem-wheel-cache-94533sq4/wheels/e7/bd/4d/191eca0598c7d7deb715af4f393ac4f436cd03eb69d398c6e3\u001b[0m\n", + "\u001b[34mSuccessfully built diffusers\u001b[0m\n", + "\u001b[34mInstalling collected packages: safetensors, diffusers\u001b[0m\n", + "\u001b[34mAttempting uninstall: diffusers\u001b[0m\n", + "\u001b[34mFound existing installation: diffusers 0.16.1\u001b[0m\n", + "\u001b[34mUninstalling 
diffusers-0.16.1:\u001b[0m\n", + "\u001b[34mSuccessfully uninstalled diffusers-0.16.1\u001b[0m\n", + "\u001b[34mSuccessfully installed diffusers-0.21.0.dev0 safetensors-0.3.3\u001b[0m\n", + "\u001b[34mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\n", + "\u001b[34m[notice] A new release of pip is available: 23.1.2 -> 23.2.1\u001b[0m\n", + "\u001b[34m[notice] To update, run: pip install --upgrade pip\u001b[0m\n", + "\u001b[34mRequirement already satisfied: accelerate>=0.16.0 in /opt/conda/lib/python3.10/site-packages (from -r requirements_sdxl.txt (line 1)) (0.19.0)\u001b[0m\n", + "\u001b[34mRequirement already satisfied: torchvision in /opt/conda/lib/python3.10/site-packages (from -r requirements_sdxl.txt (line 2)) (0.15.1)\u001b[0m\n", + "\u001b[34mRequirement already satisfied: transformers>=4.25.1 in /opt/conda/lib/python3.10/site-packages (from -r requirements_sdxl.txt (line 3)) (4.28.1)\u001b[0m\n", + "\u001b[34mCollecting ftfy (from -r requirements_sdxl.txt (line 4))\u001b[0m\n", + "\u001b[34mDownloading ftfy-6.1.1-py3-none-any.whl (53 kB)\u001b[0m\n", + "\u001b[34m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 53.1/53.1 kB 3.8 MB/s eta 0:00:00\u001b[0m\n", + "\u001b[34mRequirement already satisfied: tensorboard in /opt/conda/lib/python3.10/site-packages (from -r requirements_sdxl.txt (line 5)) (2.13.0)\u001b[0m\n", + "\u001b[34mRequirement already satisfied: Jinja2 in /opt/conda/lib/python3.10/site-packages (from -r requirements_sdxl.txt (line 6)) (3.1.2)\u001b[0m\n", + "\u001b[34mRequirement already satisfied: numpy>=1.17 in /opt/conda/lib/python3.10/site-packages (from accelerate>=0.16.0->-r requirements_sdxl.txt (line 1)) (1.23.5)\u001b[0m\n", + "\u001b[34mRequirement already satisfied: packaging>=20.0 in /opt/conda/lib/python3.10/site-packages (from accelerate>=0.16.0->-r 
requirements_sdxl.txt (line 1)) (23.1)\u001b[0m\n", + "\u001b[34mRequirement already satisfied: psutil in /opt/conda/lib/python3.10/site-packages (from accelerate>=0.16.0->-r requirements_sdxl.txt (line 1)) (5.9.5)\u001b[0m\n", + "\u001b[34mRequirement already satisfied: pyyaml in /opt/conda/lib/python3.10/site-packages (from accelerate>=0.16.0->-r requirements_sdxl.txt (line 1)) (5.4.1)\u001b[0m\n", + "\u001b[34mRequirement already satisfied: torch>=1.6.0 in /opt/conda/lib/python3.10/site-packages (from accelerate>=0.16.0->-r requirements_sdxl.txt (line 1)) (2.0.0)\u001b[0m\n", + "\u001b[34mRequirement already satisfied: requests in /opt/conda/lib/python3.10/site-packages (from torchvision->-r requirements_sdxl.txt (line 2)) (2.28.2)\u001b[0m\n", + "\u001b[34mRequirement already satisfied: pillow!=8.3.*,>=5.3.0 in /opt/conda/lib/python3.10/site-packages (from torchvision->-r requirements_sdxl.txt (line 2)) (9.4.0)\u001b[0m\n", + "\u001b[34mRequirement already satisfied: filelock in /opt/conda/lib/python3.10/site-packages (from transformers>=4.25.1->-r requirements_sdxl.txt (line 3)) (3.12.0)\u001b[0m\n", + "\u001b[34mRequirement already satisfied: huggingface-hub<1.0,>=0.11.0 in /opt/conda/lib/python3.10/site-packages (from transformers>=4.25.1->-r requirements_sdxl.txt (line 3)) (0.14.1)\u001b[0m\n", + "\u001b[34mRequirement already satisfied: regex!=2019.12.17 in /opt/conda/lib/python3.10/site-packages (from transformers>=4.25.1->-r requirements_sdxl.txt (line 3)) (2023.5.5)\u001b[0m\n", + "\u001b[34mRequirement already satisfied: tokenizers!=0.11.3,<0.14,>=0.11.1 in /opt/conda/lib/python3.10/site-packages (from transformers>=4.25.1->-r requirements_sdxl.txt (line 3)) (0.13.3)\u001b[0m\n", + "\u001b[34mRequirement already satisfied: tqdm>=4.27 in /opt/conda/lib/python3.10/site-packages (from transformers>=4.25.1->-r requirements_sdxl.txt (line 3)) (4.65.0)\u001b[0m\n", + "\u001b[34mRequirement already satisfied: wcwidth>=0.2.5 in 
/opt/conda/lib/python3.10/site-packages (from ftfy->-r requirements_sdxl.txt (line 4)) (0.2.6)\u001b[0m\n", + "\u001b[34mRequirement already satisfied: absl-py>=0.4 in /opt/conda/lib/python3.10/site-packages (from tensorboard->-r requirements_sdxl.txt (line 5)) (1.4.0)\u001b[0m\n", + "\u001b[34mRequirement already satisfied: grpcio>=1.48.2 in /opt/conda/lib/python3.10/site-packages (from tensorboard->-r requirements_sdxl.txt (line 5)) (1.54.2)\u001b[0m\n", + "\u001b[34mRequirement already satisfied: google-auth<3,>=1.6.3 in /opt/conda/lib/python3.10/site-packages (from tensorboard->-r requirements_sdxl.txt (line 5)) (2.18.1)\u001b[0m\n", + "\u001b[34mRequirement already satisfied: google-auth-oauthlib<1.1,>=0.5 in /opt/conda/lib/python3.10/site-packages (from tensorboard->-r requirements_sdxl.txt (line 5)) (1.0.0)\u001b[0m\n", + "\u001b[34mRequirement already satisfied: markdown>=2.6.8 in /opt/conda/lib/python3.10/site-packages (from tensorboard->-r requirements_sdxl.txt (line 5)) (3.4.3)\u001b[0m\n", + "\u001b[34mRequirement already satisfied: protobuf>=3.19.6 in /opt/conda/lib/python3.10/site-packages (from tensorboard->-r requirements_sdxl.txt (line 5)) (3.20.2)\u001b[0m\n", + "\u001b[34mRequirement already satisfied: setuptools>=41.0.0 in /opt/conda/lib/python3.10/site-packages (from tensorboard->-r requirements_sdxl.txt (line 5)) (65.6.3)\u001b[0m\n", + "\u001b[34mRequirement already satisfied: tensorboard-data-server<0.8.0,>=0.7.0 in /opt/conda/lib/python3.10/site-packages (from tensorboard->-r requirements_sdxl.txt (line 5)) (0.7.0)\u001b[0m\n", + "\u001b[34mRequirement already satisfied: werkzeug>=1.0.1 in /opt/conda/lib/python3.10/site-packages (from tensorboard->-r requirements_sdxl.txt (line 5)) (2.3.4)\u001b[0m\n", + "\u001b[34mRequirement already satisfied: wheel>=0.26 in /opt/conda/lib/python3.10/site-packages (from tensorboard->-r requirements_sdxl.txt (line 5)) (0.40.0)\u001b[0m\n", + "\u001b[34mRequirement already satisfied: MarkupSafe>=2.0 in 
/opt/conda/lib/python3.10/site-packages (from Jinja2->-r requirements_sdxl.txt (line 6)) (2.1.2)\u001b[0m\n", + "\u001b[34mRequirement already satisfied: cachetools<6.0,>=2.0.0 in /opt/conda/lib/python3.10/site-packages (from google-auth<3,>=1.6.3->tensorboard->-r requirements_sdxl.txt (line 5)) (5.3.0)\u001b[0m\n", + "\u001b[34mRequirement already satisfied: pyasn1-modules>=0.2.1 in /opt/conda/lib/python3.10/site-packages (from google-auth<3,>=1.6.3->tensorboard->-r requirements_sdxl.txt (line 5)) (0.3.0)\u001b[0m\n", + "\u001b[34mRequirement already satisfied: six>=1.9.0 in /opt/conda/lib/python3.10/site-packages (from google-auth<3,>=1.6.3->tensorboard->-r requirements_sdxl.txt (line 5)) (1.16.0)\u001b[0m\n", + "\u001b[34mRequirement already satisfied: urllib3<2.0 in /opt/conda/lib/python3.10/site-packages (from google-auth<3,>=1.6.3->tensorboard->-r requirements_sdxl.txt (line 5)) (1.26.15)\u001b[0m\n", + "\u001b[34mRequirement already satisfied: rsa<5,>=3.1.4 in /opt/conda/lib/python3.10/site-packages (from google-auth<3,>=1.6.3->tensorboard->-r requirements_sdxl.txt (line 5)) (4.7.2)\u001b[0m\n", + "\u001b[34mRequirement already satisfied: requests-oauthlib>=0.7.0 in /opt/conda/lib/python3.10/site-packages (from google-auth-oauthlib<1.1,>=0.5->tensorboard->-r requirements_sdxl.txt (line 5)) (1.3.1)\u001b[0m\n", + "\u001b[34mRequirement already satisfied: fsspec in /opt/conda/lib/python3.10/site-packages (from huggingface-hub<1.0,>=0.11.0->transformers>=4.25.1->-r requirements_sdxl.txt (line 3)) (2023.5.0)\u001b[0m\n", + "\u001b[34mRequirement already satisfied: typing-extensions>=3.7.4.3 in /opt/conda/lib/python3.10/site-packages (from huggingface-hub<1.0,>=0.11.0->transformers>=4.25.1->-r requirements_sdxl.txt (line 3)) (4.5.0)\u001b[0m\n", + "\u001b[34mRequirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests->torchvision->-r requirements_sdxl.txt (line 2)) (3.1.0)\u001b[0m\n", + "\u001b[34mRequirement 
already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests->torchvision->-r requirements_sdxl.txt (line 2)) (3.4)\u001b[0m\n", + "\u001b[34mRequirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages (from requests->torchvision->-r requirements_sdxl.txt (line 2)) (2023.5.7)\u001b[0m\n", + "\u001b[34mRequirement already satisfied: sympy in /opt/conda/lib/python3.10/site-packages (from torch>=1.6.0->accelerate>=0.16.0->-r requirements_sdxl.txt (line 1)) (1.11.1)\u001b[0m\n", + "\u001b[34mRequirement already satisfied: networkx in /opt/conda/lib/python3.10/site-packages (from torch>=1.6.0->accelerate>=0.16.0->-r requirements_sdxl.txt (line 1)) (3.1)\u001b[0m\n", + "\u001b[34mRequirement already satisfied: pyasn1<0.6.0,>=0.4.6 in /opt/conda/lib/python3.10/site-packages (from pyasn1-modules>=0.2.1->google-auth<3,>=1.6.3->tensorboard->-r requirements_sdxl.txt (line 5)) (0.4.8)\u001b[0m\n", + "\u001b[34mRequirement already satisfied: oauthlib>=3.0.0 in /opt/conda/lib/python3.10/site-packages (from requests-oauthlib>=0.7.0->google-auth-oauthlib<1.1,>=0.5->tensorboard->-r requirements_sdxl.txt (line 5)) (3.2.2)\u001b[0m\n", + "\u001b[34mRequirement already satisfied: mpmath>=0.19 in /opt/conda/lib/python3.10/site-packages (from sympy->torch>=1.6.0->accelerate>=0.16.0->-r requirements_sdxl.txt (line 1)) (1.3.0)\u001b[0m\n", + "\u001b[34mInstalling collected packages: ftfy\u001b[0m\n", + "\u001b[34mSuccessfully installed ftfy-6.1.1\u001b[0m\n", + "\u001b[34mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. 
It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\n", + "\u001b[34m[notice] A new release of pip is available: 23.1.2 -> 23.2.1\u001b[0m\n", + "\u001b[34m[notice] To update, run: pip install --upgrade pip\u001b[0m\n", + "\u001b[34m[10:15:42] WARNING The following values were not passed to launch.py:890\n", + " `accelerate launch` and had defaults used \n", + " instead: \n", + " `--num_processes` was set to a value \n", + " of `1` \n", + " `--num_machines` was set to a value of \n", + " `1` \n", + " `--mixed_precision` was set to a value \n", + " of `'no'` \n", + " `--dynamo_backend` was set to a value \n", + " of `'no'` \n", + " To avoid this warning pass in values for each \n", + " of the problematic parameters or run \n", + " `accelerate config`.\u001b[0m\n", + "\u001b[34m08/25/2023 10:15:46 - INFO - __main__ - Distributed environment: NO\u001b[0m\n", + "\u001b[34mNum processes: 1\u001b[0m\n", + "\u001b[34mProcess index: 0\u001b[0m\n", + "\u001b[34mLocal process index: 0\u001b[0m\n", + "\u001b[34mDevice: cuda\u001b[0m\n", + "\u001b[34mMixed precision type: fp16\u001b[0m\n", + "\u001b[34mDownloading (…)okenizer_config.json: 0%| | 0.00/737 [00:00