diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..e361570 Binary files /dev/null and b/.DS_Store differ diff --git a/inference/.DS_Store b/inference/.DS_Store new file mode 100644 index 0000000..ab93afb Binary files /dev/null and b/inference/.DS_Store differ diff --git a/inference/stable-diffusion/.DS_Store b/inference/stable-diffusion/.DS_Store new file mode 100644 index 0000000..133c172 Binary files /dev/null and b/inference/stable-diffusion/.DS_Store differ diff --git a/inference/stable-diffusion/StableDiffusion2_1-inpaint.ipynb b/inference/stable-diffusion/StableDiffusion2_1-inpaint.ipynb new file mode 100644 index 0000000..359030a --- /dev/null +++ b/inference/stable-diffusion/StableDiffusion2_1-inpaint.ipynb @@ -0,0 +1,1031 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "76c7f449-c44e-4a5d-a344-e030018b5fc1", + "metadata": {}, + "source": [ + "# Deploy & Run Stable Diffusion Inpaint on SageMaker and Inferentia2\n", + "\n", + "**SageMaker Studio Kernel: Data Science 3.0 (ml.t3.medium)**\n", + "\n", + "Stable Diffusion is a transformer model that generates random images out of textual prompts (description of the scene). You can get more information of the model implementation at: https://github.com/Stability-AI/\n", + "\n", + "This sample shows how to compile & deploy a pre-trained [HF Stable Diffusion 2 Inpaint](https://huggingface.co/stabilityai/stable-diffusion-2-inpainting) to [Inferentia2](https://aws.amazon.com/ec2/instance-types/inf2/) using SageMaker. You need to run two steps to complete this task: 1) A SageMaker Training Job using a Trainium instance for compiling the model and; 2) create a SageMaker real-time endpoint, hosted on an Inferentia2 instance, to deploy and invoke your model.\n", + "\n", + "**Compilation:** First you'll kick-off a SageMaker training job on a **trn1.32xlarge** instance. It requires NeuronX Compiler v2.9 to compile the model. It takes ~22mins with a trn1.32xlarge. 
(Estimated compilation cost on 2023 Sep 30 - us-east-1 ml.trn1.32xlarge \\\\$24.73/h ::: 22mins=$9.02). You compile the model once and deploy & run as many times as you need.\n", + "\n", + "**Inference:** After compiling the model it is time to deploy. You'll create a SageMaker real-time Endpoint hosted on an **inf2** instance. SageMaker exposes your model as a webservice and allows you to invoke it with a simple API call.\n", + "\n", + "\n", + "The compilation mechanism supports the FP32 datatype, which is selected by default." + ] + }, + { + "cell_type": "markdown", + "id": "7e1f0d48-059a-4170-9f00-2b3ee76cee39", + "metadata": { + "tags": [] + }, + "source": [ + "## 1) Install some dependencies" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2084775d-958a-4c6d-9f23-9b946f630008", + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + }, + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "%pip install -U sagemaker" + ] + }, + { + "cell_type": "markdown", + "id": "04e3b84f-87ae-4d2b-9f5d-9aaea1cda18a", + "metadata": {}, + "source": [ + "## 2) Initialize variables\n", + "Not all regions have trn1 and inf2 instances available at the time this notebook was published. us-east-1 has trn1 instances and us-east-2 has inf2 instances. Therefore, we need to create two sagemaker sessions: 1/ for compiling the model (us-east-1); 2/ for deploying the model (us-east-2)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9d5d9d4d-b3ed-4e7c-968b-c0c95c41f028", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import boto3\n", + "import sagemaker\n", + "import numpy as np\n", + "\n", + "print(sagemaker.__version__)\n", + "if tuple(map(int, sagemaker.__version__.split('.')[:2])) < (2, 146): print(\"You need to upgrade or restart the kernel if you already upgraded\") # numeric check: as strings, '2.20' would rank above '2.146'\n", + "\n", + "region_trn1='us-east-1'\n", + "boto3_sess_trn1 = boto3.Session(region_name=region_trn1) # trn1 session\n", + "sess_trn1 = sagemaker.Session(boto3_sess_trn1)\n", + "\n", + "region_inf2='us-east-2'\n", + "boto3_sess_inf2 = boto3.Session(region_name=region_inf2) # inf2 session\n", + "sess_inf2 = sagemaker.Session(boto3_sess_inf2)\n", + "\n", + "bucket_trn1 = sess_trn1.default_bucket()\n", + "bucket_inf2 = sess_inf2.default_bucket()\n", + "role = sagemaker.get_execution_role()\n", + "\n", + "# https://github.com/aws/deep-learning-containers/blob/master/available_images.md#neuron-containers\n", + "train_image_name=\"pytorch-training-neuronx\"\n", + "inference_image_name=\"pytorch-inference-neuronx\"\n", + "# We need SDK2.13+ to deal with SD Inpaint\n", + "image_tag=\"1.13.1-neuronx-py310-sdk2.13.2-ubuntu20.04-v1.0\"\n", + "\n", + "print(f\"sagemaker role arn: {role}\")\n", + "print(f\"sagemaker bucket trn1: {bucket_trn1}\")\n", + "print(f\"sagemaker bucket inf2: {bucket_inf2}\")\n", + "print(f\"sagemaker session regions. trn1: {region_trn1} inf2: {region_inf2}\")" + ] + }, + { + "cell_type": "markdown", + "id": "c1bf0a88-f94a-4261-822b-25ede4f4d066", + "metadata": {}, + "source": [ + "## 3) Visualize scripts\n", + "We have 3 scripts that will do the job. \n", + " - src-inpaint/wrapper.py: Helper class created to wrap the model and expose the parts that we will compile to inf2. 
It is also a way to put everything back together to compose a pipeline.\n", + " - src-inpaint/compile.py: NeuronSDK compilation script that makes use of the wrapper, splits the model into 4 parts and compile each one individually.\n", + " - src-inpaint/inference.py: SageMaker inference script that also makes use of wrapper to reload the compiled parts and re-build the pipeline responsible for getting the predictions." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "73dd42ba-7ec5-4404-a144-c5b1ccb96bb1", + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + }, + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "!pygmentize src-inpaint/wrapper.py" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e063c730-5b62-4600-950d-51f1646c3686", + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + }, + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "!pygmentize src-inpaint/compile.py" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "85d0251e-4d73-4986-8b71-10faf2bef429", + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + }, + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "!pygmentize src-inpaint/inference.py" + ] + }, + { + "cell_type": "markdown", + "id": "ca09112a-bb10-4719-b8f6-d823fcb72ade", + "metadata": {}, + "source": [ + "## 4) Compile the model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "03717dd9-53de-4b0b-b892-53ccb89c0680", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from sagemaker.pytorch import PyTorch\n", + "\n", + "estimator = PyTorch(\n", + " entry_point=\"compile.py\", # Specify your train script\n", + " source_dir=\"src-inpaint\",\n", + " image_uri=f\"763104351884.dkr.ecr.{region_trn1}.amazonaws.com/{train_image_name}:{image_tag}\",\n", + " role=role,\n", + " sagemaker_session=sess_trn1,\n", + " 
instance_count=1,\n", + " instance_type='ml.trn1.32xlarge',\n", + " disable_profiler=True,\n", + " output_path=f\"s3://{bucket_trn1}/output\", \n", + " volume_size = 384,\n", + " \n", + " # Parameters required to enable checkpointing\n", + " # This is necessary for caching XLA HLO files and reduce training time next time \n", + " checkpoint_s3_uri=f\"s3://{bucket_trn1}/checkpoints\",\n", + " hyperparameters={\n", + " \"dtype\": \"fp32\"\n", + " }\n", + ")\n", + "estimator.framework_version = '1.13.1' # workround when using image_uri\n", + "estimator._is_compiled_model = True" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f401d597-ed80-4fbc-ab1a-8c63d263976d", + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# it takes around 1141 seconds to complete the job on a trn1.32xlarge\n", + "# You will run this just once to compile the model.\n", + "estimator.fit()" + ] + }, + { + "cell_type": "markdown", + "id": "1338cea2-a0ab-4a6f-b4da-c3307d976e13", + "metadata": {}, + "source": [ + "## 5) Deploy the model to inferentia2\n", + "We compiled the model in one region but we'll deploy to another region. 
So, we need to copy the model artifacts first and then create a PyTorchModel." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d523f809-ff03-4ca8-be29-a92bd4b25a7d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import boto3\n", + "s3 = boto3.resource('s3', region_name=region_trn1)\n", + "\n", + "model_name=\"stable-diffusion-neuron-inferentia\"\n", + "model_data=f\"s3://{sess_inf2.default_bucket()}/{model_name}/model.tar.gz\"\n", + "copy_source = {\n", + " 'Bucket': sess_trn1.default_bucket(),\n", + " 'Key': estimator.model_data.split('/', 3)[-1]\n", + "}\n", + "print(copy_source)\n", + "print(model_data)\n", + "s3.meta.client.copy(copy_source, sess_inf2.default_bucket(), f'{model_name}/model.tar.gz')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4a202e4f-30cf-4c72-8fd4-f3a6bf408d19", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import logging\n", + "from sagemaker.pytorch.model import PyTorchModel\n", + "from sagemaker.utils import name_from_base\n", + "\n", + "# depending on the inf2 instance you deploy the model you'll have more or less accelerators\n", + "# we'll ask SageMaker to launch 1 worker per accelerator\n", + "model_data=model_data\n", + "instance_type_idx=1 # default ml.inf2.8xlarge\n", + "instance_types=['ml.inf2.xlarge', 'ml.inf2.8xlarge', 'ml.inf2.24xlarge','ml.inf2.48xlarge']\n", + "num_workers=[1,1,6,12]\n", + "\n", + "print(f\"Instance type: {instance_types[instance_type_idx]}. 
Num SM workers: {num_workers[instance_type_idx]}\")\n", + "pytorch_model = PyTorchModel(\n", + " image_uri=f\"763104351884.dkr.ecr.{region_inf2}.amazonaws.com/{inference_image_name}:{image_tag}\",\n", + " model_data=model_data,\n", + " role=role, \n", + " name=name_from_base('sd-inf2'),\n", + " sagemaker_session=sess_inf2,\n", + " container_log_level=logging.NOTSET,\n", + " model_server_workers=num_workers[instance_type_idx], # 1 worker per inferentia chip\n", + " framework_version=\"1.13.1\",\n", + " env = {\n", + " 'SAGEMAKER_MODEL_SERVER_TIMEOUT' : '3600' \n", + " }\n", + " # for production it is important to define vpc_config and use a vpc_endpoint\n", + " #vpc_config={\n", + " # 'Subnets': ['', ''],\n", + " # 'SecurityGroupIds': ['', '']\n", + " #}\n", + ")\n", + "pytorch_model._is_compiled_model = True" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "733f2b8d-0d21-493d-8d35-b68e85eabc05", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "predictor = pytorch_model.deploy(\n", + " initial_instance_count=1,\n", + " instance_type=instance_types[instance_type_idx],\n", + " model_data_download_timeout=3600, # it takes some time to download all the artifacts and load the model\n", + " container_startup_health_check_timeout=1800\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "a9fac815-9715-4a17-a748-86f9d1e2dd32", + "metadata": {}, + "source": [ + "## 6) Run a simple test" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c9166516-8b68-409d-b877-a291d9525f83", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from sagemaker.serializers import JSONSerializer\n", + "from sagemaker.deserializers import BytesDeserializer\n", + "predictor.serializer = JSONSerializer()\n", + "predictor.deserializer = BytesDeserializer(accept='image/jpeg')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "769b7e0b-045d-442b-911a-5fd045d8e483", + "metadata": { + "tags": [] + }, 
+ "outputs": [], + "source": [ + "import io\n", + "import time\n", + "from PIL import Image\n", + "import requests\n", + "import base64" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "99228241-0e98-4d97-bfda-b62733722905", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def download_image(url): # returns the downloaded bytes as a base64-encoded string\n", + "    response = requests.get(url, timeout=30)\n", + "    response.raise_for_status() # fail here instead of sending an HTTP error page to the endpoint\n", + "    return base64.b64encode(response.content).decode('UTF-8')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1872a60c-b1c2-44b0-ae34-11599ef56f89", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "#prompt = \"a photo of an astronaut riding a horse on mars\"\n", + "prompt = \"Face of a yellow cat, high resolution, sitting on a park bench\"\n", + "img_url = \"https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png\"\n", + "mask_url = \"https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png\"\n", + "init_image = download_image(img_url)\n", + "mask_image = download_image(mask_url)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4dee2aff-3d12-4be8-8077-bd9feb161200", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "input_req={\n", + " \"prompt\": prompt,\n", + " \"init_image\": init_image,\n", + " \"mask_image\": mask_image\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "17953fca-c0b8-4d7f-a3c2-e6f3b6dead20", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "Image.open(io.BytesIO(predictor.predict(input_req)))" + ] + }, + { + "cell_type": "markdown", + "id": "95941fa0-2395-4393-ae79-65e2045c7923", + "metadata": {}, + "source": [ + "## 7) Cleanup\n", + "If you don't need the endpoint anymore, run the next cell to delete it." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "36aba54e-6b61-470f-ae8c-575c4ea58633", + "metadata": {}, + "outputs": [], + "source": [ + "predictor.delete_model()\n", + "predictor.delete_endpoint()" + ] + } + ], + "metadata": { + "availableInstances": [ + { + "_defaultOrder": 0, + "_isFastLaunch": true, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 4, + "name": "ml.t3.medium", + "vcpuNum": 2 + }, + { + "_defaultOrder": 1, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 8, + "name": "ml.t3.large", + "vcpuNum": 2 + }, + { + "_defaultOrder": 2, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 16, + "name": "ml.t3.xlarge", + "vcpuNum": 4 + }, + { + "_defaultOrder": 3, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 32, + "name": "ml.t3.2xlarge", + "vcpuNum": 8 + }, + { + "_defaultOrder": 4, + "_isFastLaunch": true, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 8, + "name": "ml.m5.large", + "vcpuNum": 2 + }, + { + "_defaultOrder": 5, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 16, + "name": "ml.m5.xlarge", + "vcpuNum": 4 + }, + { + "_defaultOrder": 6, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 32, + "name": "ml.m5.2xlarge", + "vcpuNum": 8 + }, + { + "_defaultOrder": 7, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 64, + "name": "ml.m5.4xlarge", + "vcpuNum": 16 + }, + { + "_defaultOrder": 8, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 128, + "name": 
"ml.m5.8xlarge", + "vcpuNum": 32 + }, + { + "_defaultOrder": 9, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 192, + "name": "ml.m5.12xlarge", + "vcpuNum": 48 + }, + { + "_defaultOrder": 10, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 256, + "name": "ml.m5.16xlarge", + "vcpuNum": 64 + }, + { + "_defaultOrder": 11, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 384, + "name": "ml.m5.24xlarge", + "vcpuNum": 96 + }, + { + "_defaultOrder": 12, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 8, + "name": "ml.m5d.large", + "vcpuNum": 2 + }, + { + "_defaultOrder": 13, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 16, + "name": "ml.m5d.xlarge", + "vcpuNum": 4 + }, + { + "_defaultOrder": 14, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 32, + "name": "ml.m5d.2xlarge", + "vcpuNum": 8 + }, + { + "_defaultOrder": 15, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 64, + "name": "ml.m5d.4xlarge", + "vcpuNum": 16 + }, + { + "_defaultOrder": 16, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 128, + "name": "ml.m5d.8xlarge", + "vcpuNum": 32 + }, + { + "_defaultOrder": 17, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 192, + "name": "ml.m5d.12xlarge", + "vcpuNum": 48 + }, + { + "_defaultOrder": 18, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 256, + "name": 
"ml.m5d.16xlarge", + "vcpuNum": 64 + }, + { + "_defaultOrder": 19, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 384, + "name": "ml.m5d.24xlarge", + "vcpuNum": 96 + }, + { + "_defaultOrder": 20, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": true, + "memoryGiB": 0, + "name": "ml.geospatial.interactive", + "supportedImageNames": [ + "sagemaker-geospatial-v1-0" + ], + "vcpuNum": 0 + }, + { + "_defaultOrder": 21, + "_isFastLaunch": true, + "category": "Compute optimized", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 4, + "name": "ml.c5.large", + "vcpuNum": 2 + }, + { + "_defaultOrder": 22, + "_isFastLaunch": false, + "category": "Compute optimized", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 8, + "name": "ml.c5.xlarge", + "vcpuNum": 4 + }, + { + "_defaultOrder": 23, + "_isFastLaunch": false, + "category": "Compute optimized", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 16, + "name": "ml.c5.2xlarge", + "vcpuNum": 8 + }, + { + "_defaultOrder": 24, + "_isFastLaunch": false, + "category": "Compute optimized", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 32, + "name": "ml.c5.4xlarge", + "vcpuNum": 16 + }, + { + "_defaultOrder": 25, + "_isFastLaunch": false, + "category": "Compute optimized", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 72, + "name": "ml.c5.9xlarge", + "vcpuNum": 36 + }, + { + "_defaultOrder": 26, + "_isFastLaunch": false, + "category": "Compute optimized", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 96, + "name": "ml.c5.12xlarge", + "vcpuNum": 48 + }, + { + "_defaultOrder": 27, + "_isFastLaunch": false, + "category": "Compute optimized", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 144, + "name": "ml.c5.18xlarge", + "vcpuNum": 72 + }, + { + "_defaultOrder": 28, + "_isFastLaunch": false, + "category": "Compute optimized", + 
"gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 192, + "name": "ml.c5.24xlarge", + "vcpuNum": 96 + }, + { + "_defaultOrder": 29, + "_isFastLaunch": true, + "category": "Accelerated computing", + "gpuNum": 1, + "hideHardwareSpecs": false, + "memoryGiB": 16, + "name": "ml.g4dn.xlarge", + "vcpuNum": 4 + }, + { + "_defaultOrder": 30, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 1, + "hideHardwareSpecs": false, + "memoryGiB": 32, + "name": "ml.g4dn.2xlarge", + "vcpuNum": 8 + }, + { + "_defaultOrder": 31, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 1, + "hideHardwareSpecs": false, + "memoryGiB": 64, + "name": "ml.g4dn.4xlarge", + "vcpuNum": 16 + }, + { + "_defaultOrder": 32, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 1, + "hideHardwareSpecs": false, + "memoryGiB": 128, + "name": "ml.g4dn.8xlarge", + "vcpuNum": 32 + }, + { + "_defaultOrder": 33, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 4, + "hideHardwareSpecs": false, + "memoryGiB": 192, + "name": "ml.g4dn.12xlarge", + "vcpuNum": 48 + }, + { + "_defaultOrder": 34, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 1, + "hideHardwareSpecs": false, + "memoryGiB": 256, + "name": "ml.g4dn.16xlarge", + "vcpuNum": 64 + }, + { + "_defaultOrder": 35, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 1, + "hideHardwareSpecs": false, + "memoryGiB": 61, + "name": "ml.p3.2xlarge", + "vcpuNum": 8 + }, + { + "_defaultOrder": 36, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 4, + "hideHardwareSpecs": false, + "memoryGiB": 244, + "name": "ml.p3.8xlarge", + "vcpuNum": 32 + }, + { + "_defaultOrder": 37, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 8, + "hideHardwareSpecs": false, + "memoryGiB": 488, + "name": "ml.p3.16xlarge", + "vcpuNum": 64 + }, + { + "_defaultOrder": 38, + 
"_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 8, + "hideHardwareSpecs": false, + "memoryGiB": 768, + "name": "ml.p3dn.24xlarge", + "vcpuNum": 96 + }, + { + "_defaultOrder": 39, + "_isFastLaunch": false, + "category": "Memory Optimized", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 16, + "name": "ml.r5.large", + "vcpuNum": 2 + }, + { + "_defaultOrder": 40, + "_isFastLaunch": false, + "category": "Memory Optimized", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 32, + "name": "ml.r5.xlarge", + "vcpuNum": 4 + }, + { + "_defaultOrder": 41, + "_isFastLaunch": false, + "category": "Memory Optimized", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 64, + "name": "ml.r5.2xlarge", + "vcpuNum": 8 + }, + { + "_defaultOrder": 42, + "_isFastLaunch": false, + "category": "Memory Optimized", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 128, + "name": "ml.r5.4xlarge", + "vcpuNum": 16 + }, + { + "_defaultOrder": 43, + "_isFastLaunch": false, + "category": "Memory Optimized", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 256, + "name": "ml.r5.8xlarge", + "vcpuNum": 32 + }, + { + "_defaultOrder": 44, + "_isFastLaunch": false, + "category": "Memory Optimized", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 384, + "name": "ml.r5.12xlarge", + "vcpuNum": 48 + }, + { + "_defaultOrder": 45, + "_isFastLaunch": false, + "category": "Memory Optimized", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 512, + "name": "ml.r5.16xlarge", + "vcpuNum": 64 + }, + { + "_defaultOrder": 46, + "_isFastLaunch": false, + "category": "Memory Optimized", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 768, + "name": "ml.r5.24xlarge", + "vcpuNum": 96 + }, + { + "_defaultOrder": 47, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 1, + "hideHardwareSpecs": false, + "memoryGiB": 16, + "name": "ml.g5.xlarge", + "vcpuNum": 4 + }, + { + "_defaultOrder": 
48, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 1, + "hideHardwareSpecs": false, + "memoryGiB": 32, + "name": "ml.g5.2xlarge", + "vcpuNum": 8 + }, + { + "_defaultOrder": 49, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 1, + "hideHardwareSpecs": false, + "memoryGiB": 64, + "name": "ml.g5.4xlarge", + "vcpuNum": 16 + }, + { + "_defaultOrder": 50, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 1, + "hideHardwareSpecs": false, + "memoryGiB": 128, + "name": "ml.g5.8xlarge", + "vcpuNum": 32 + }, + { + "_defaultOrder": 51, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 1, + "hideHardwareSpecs": false, + "memoryGiB": 256, + "name": "ml.g5.16xlarge", + "vcpuNum": 64 + }, + { + "_defaultOrder": 52, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 4, + "hideHardwareSpecs": false, + "memoryGiB": 192, + "name": "ml.g5.12xlarge", + "vcpuNum": 48 + }, + { + "_defaultOrder": 53, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 4, + "hideHardwareSpecs": false, + "memoryGiB": 384, + "name": "ml.g5.24xlarge", + "vcpuNum": 96 + }, + { + "_defaultOrder": 54, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 8, + "hideHardwareSpecs": false, + "memoryGiB": 768, + "name": "ml.g5.48xlarge", + "vcpuNum": 192 + }, + { + "_defaultOrder": 55, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 8, + "hideHardwareSpecs": false, + "memoryGiB": 1152, + "name": "ml.p4d.24xlarge", + "vcpuNum": 96 + }, + { + "_defaultOrder": 56, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 8, + "hideHardwareSpecs": false, + "memoryGiB": 1152, + "name": "ml.p4de.24xlarge", + "vcpuNum": 96 + } + ], + "instance_type": "ml.t3.medium", + "kernelspec": { + "display_name": "Python 3 (Data Science 3.0)", + "language": "python", + "name": 
"python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-east-1:081325390199:image/sagemaker-data-science-310-v1" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/inference/stable-diffusion/src-inpaint/compile.py b/inference/stable-diffusion/src-inpaint/compile.py new file mode 100644 index 0000000..d148252 --- /dev/null +++ b/inference/stable-diffusion/src-inpaint/compile.py @@ -0,0 +1,193 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 + +import os +os.environ["NEURON_FUSE_SOFTMAX"] = "1" +import time +import copy +import torch +import shutil +import argparse +import numpy as np +import torch_neuronx +import torch.nn as nn +from wrapper import NeuronTextEncoder, UNetWrap, NeuronUNet, get_attention_scores +from diffusers.models.unet_2d_condition import UNet2DConditionOutput +from diffusers import StableDiffusionPipeline, StableDiffusionInpaintPipeline, DPMSolverMultistepScheduler +from diffusers.models.attention_processor import Attention + +height = 512 // 8 +width = 512 // 8 + +def compile_text_encoder(text_encoder, args): + print("Compiling text encoder...") + base_dir='text_encoder' + os.makedirs(os.path.join(args.checkpoints_path, base_dir), exist_ok=True) + os.makedirs(os.path.join(args.model_path, base_dir), exist_ok=True) + t = time.time() + # Apply the wrapper to deal with custom return type + text_encoder = NeuronTextEncoder(text_encoder) + + # Compile text encoder + # This is used for indexing a lookup table in torch.nn.Embedding, + # so using random numbers may give errors (out of range). 
+ emb = torch.tensor([[49406, 18376, 525, 7496, 49407, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0]]) + text_encoder_neuron = torch_neuronx.trace( + text_encoder.neuron_text_encoder, emb, + #compiler_workdir=os.path.join(args.checkpoints_path, base_dir), + ) + + # Save the compiled text encoder + text_encoder_filename = os.path.join(args.model_path, base_dir, 'model.pt') + torch.jit.save(text_encoder_neuron, text_encoder_filename) + + # delete unused objects + del text_encoder + del text_encoder_neuron + print(f"Done. Elapsed time: {(time.time()-t)*1000}ms") + +def compile_vae(decoder, args, dtype): + print("Compiling VAE...") + base_dir='vae_decoder' + os.makedirs(os.path.join(args.checkpoints_path, base_dir), exist_ok=True) + os.makedirs(os.path.join(args.model_path, base_dir), exist_ok=True) + t = time.time() + # Compile vae decoder + decoder_in = torch.randn([1, 4, height, width]).type(dtype) + decoder_neuron = torch_neuronx.trace( + decoder, + decoder_in, + #compiler_workdir=os.path.join(args.checkpoints_path, base_dir), + compiler_args=["--verbose", "info"] + ) + + # Save the compiled vae decoder + decoder_filename = os.path.join(args.model_path, base_dir, 'model.pt') + torch.jit.save(decoder_neuron, decoder_filename) + + # delete unused objects + del decoder + del decoder_neuron + print(f"Done. 
Elapsed time: {(time.time()-t)*1000}ms") + +def compile_unet(unet, args, dtype): + print("Compiling U-Net...") + base_dir='unet' + os.makedirs(os.path.join(args.checkpoints_path, base_dir), exist_ok=True) + os.makedirs(os.path.join(args.model_path, base_dir), exist_ok=True) + t = time.time() + # Compile unet - BF16 + sample_1b = torch.randn([1, 9, height, width]).type(dtype) + timestep_1b = torch.tensor(999).type(dtype).expand((1,)) + encoder_hidden_states_1b = torch.randn([1, 77, 1024]).type(dtype) + example_inputs = sample_1b, timestep_1b, encoder_hidden_states_1b + + + unet_neuron = torch_neuronx.trace( + unet, + example_inputs, + #compiler_workdir=os.path.join(args.checkpoints_path, base_dir), + compiler_args=["--model-type=unet-inference", "--verbose=info"] + ) + + # save compiled unet + unet_filename = os.path.join(args.model_path, base_dir, 'model.pt') + torch.jit.save(unet_neuron, unet_filename) + + # delete unused objects + del unet + del unet_neuron + print(f"Done. Elapsed time: {(time.time()-t)*1000}ms") + +def compile_vae_post_quant_conv(post_quant_conv, args, dtype): + print("Compiling Post Quant Conv...") + base_dir='vae_post_quant_conv' + os.makedirs(os.path.join(args.checkpoints_path, base_dir), exist_ok=True) + os.makedirs(os.path.join(args.model_path, base_dir), exist_ok=True) + t = time.time() + + # # Compile vae post_quant_conv + post_quant_conv_in = torch.randn([1, 4, height, width]).type(dtype) + post_quant_conv_neuron = torch_neuronx.trace( + post_quant_conv, + post_quant_conv_in, + #compiler_workdir=os.path.join(args.checkpoints_path, base_dir), + compiler_args=["--verbose", "info"] + ) + + # # Save the compiled vae post_quant_conv + post_quant_conv_filename = os.path.join(args.model_path, base_dir, 'model.pt') + torch.jit.save(post_quant_conv_neuron, post_quant_conv_filename) + + # delete unused objects + del post_quant_conv + del post_quant_conv_neuron + print(f"Done. 
if __name__ == '__main__':
    # Entry point of the compilation job: compiles every part of the
    # Stable Diffusion 2 inpainting pipeline (text encoder, VAE decoder, U-Net,
    # VAE post_quant_conv) and stores the artifacts plus the inference code
    # under --model-path.
    parser = argparse.ArgumentParser(
        description='Compile the Stable Diffusion 2 inpainting pipeline components with torch_neuronx'
    )
    parser.add_argument(
        '--model-path', type=str, help="Path where we'll save the model",
        # .get() instead of [] so the script can start outside SageMaker
        # (e.g. with an explicit --model-path) instead of dying on a KeyError.
        default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model"),
    )
    parser.add_argument(
        '--checkpoints-path', type=str,
        help="Path where we'll save the best model and cache",
        default='/opt/ml/checkpoints',
    )
    parser.add_argument('--dtype', type=str, help="Datatype of the weights", default='fp32')

    args = parser.parse_args()

    # The weights are always compiled in fp32; tell the user instead of
    # silently ignoring a different request.
    if args.dtype != 'fp32':
        print(f"WARNING: --dtype={args.dtype} is ignored, the model is compiled in fp32")

    # make sure the checkpoint path exists
    os.makedirs(args.checkpoints_path, exist_ok=True)

    # Model ID for SD version pipeline
    model_id = "stabilityai/stable-diffusion-2-inpainting"
    dtype = torch.float32

    # For every component the pipeline is loaded again, the component is
    # deep-copied and the pipeline is dropped right away, so that only the
    # model being compiled is kept in RAM (minimizes memory pressure).

    # --- Compile CLIP text encoder and save ---
    pipe = StableDiffusionInpaintPipeline.from_pretrained(model_id, torch_dtype=dtype)
    text_encoder = copy.deepcopy(pipe.text_encoder)
    del pipe
    compile_text_encoder(text_encoder, args)

    # --- Compile VAE decoder and save ---
    pipe = StableDiffusionInpaintPipeline.from_pretrained(model_id, torch_dtype=dtype)
    decoder = copy.deepcopy(pipe.vae.decoder)
    del pipe
    compile_vae(decoder, args, dtype)

    # --- Compile UNet and save ---
    pipe = StableDiffusionInpaintPipeline.from_pretrained(model_id, torch_dtype=dtype)

    # Replace original cross-attention module with custom cross-attention module for better performance
    Attention.get_attention_scores = get_attention_scores

    # Apply double wrapper to deal with custom return type
    pipe.unet = NeuronUNet(UNetWrap(pipe.unet))

    # Only the inner UNetWrap is traced.
    unet = copy.deepcopy(pipe.unet.unetwrap)
    del pipe
    compile_unet(unet, args, dtype)

    # --- Compile VAE post_quant_conv and save ---
    pipe = StableDiffusionInpaintPipeline.from_pretrained(model_id, torch_dtype=dtype)
    post_quant_conv = copy.deepcopy(pipe.vae.post_quant_conv)
    del pipe
    compile_vae_post_quant_conv(post_quant_conv, args, dtype)

    # Ship the inference handler and its requirements inside the model
    # directory, under <model-path>/code. Source paths are relative to the
    # current working directory.
    code_path = os.path.join(args.model_path, 'code')
    os.makedirs(code_path, exist_ok=True)

    for filename in ('inference.py', 'requirements.txt'):
        shutil.copyfile(filename, os.path.join(code_path, filename))
# Optimized attention
def get_attention_scores(self, query, key, attn_mask=None):
    """Compute softmax attention probabilities for batched ``query``/``key``.

    Written to be bound as a method of an attention module: ``self`` must
    expose ``upcast_attention`` and ``upcast_softmax``. If ``self`` also has a
    ``scale`` attribute it is used as the score scaling factor; otherwise the
    historical constant ``0.125`` is used.

    Args:
        query: 3-D tensor, used as ``(batch, query_len, dim)`` by ``torch.bmm``.
        key: 3-D tensor, used as ``(batch, key_len, dim)`` by ``torch.bmm``.
        attn_mask: Accepted for signature compatibility; it is NOT applied.

    Returns:
        Tensor of shape ``(batch, query_len, key_len)`` in the dtype of the
        incoming ``query``, normalised over the last dimension.
    """
    out_dtype = query.dtype

    if self.upcast_attention:
        query = query.float()
        key = key.float()

    # Fall back to 0.125 when the owning module does not define a scale.
    scale = getattr(self, 'scale', 0.125)

    if query.size() == key.size():
        # Same-shaped query/key: compute the transposed score matrix
        # (key @ query^T), normalise over dim=1 and permute back. This is
        # mathematically the same as softmax(query @ key^T, dim=-1).
        scores = torch.bmm(key, query.transpose(-1, -2)) * scale
        if self.upcast_softmax:
            scores = scores.float()
        probs = torch.nn.functional.softmax(scores, dim=1).permute(0, 2, 1)
    else:
        scores = torch.bmm(query, key.transpose(-1, -2)) * scale
        if self.upcast_softmax:
            scores = scores.float()
        probs = torch.nn.functional.softmax(scores, dim=-1)

    return probs.to(out_dtype)
def input_fn(request_body, request_content_type, context=None):
    """Parse and validate an inpainting request.

    Args:
        request_body: JSON document (``str`` or ``bytes``) with the keys
            ``prompt``, ``init_image`` and ``mask_image``.
        request_content_type: MIME type of the request. Only
            ``application/json`` is supported; parameters such as
            ``; charset=utf-8`` are ignored.
        context: Unused.

    Returns:
        Tuple ``(prompt, init_image, mask_image, height, width)``.
        ``height`` and ``width`` are always 512.

    Raises:
        ValueError: If the body is not a JSON object, the prompt is not a
            string of at least 5 characters, or one of the images is missing.
        Exception: If the content type is not ``application/json``.
    """
    media_type = (request_content_type or '').split(';', 1)[0].strip().lower()
    if media_type != 'application/json':
        raise Exception(f"Unsupported mime type: {request_content_type}. Supported: application/json")

    req = json.loads(request_body)
    if not isinstance(req, dict):
        raise ValueError("Invalid request. The body needs to be a JSON object")

    prompt = req.get('prompt')
    init_image = req.get('init_image')
    mask_image = req.get('mask_image')
    # NOTE(review): fixed output size; presumably it has to match the
    # resolution the model was compiled for -- confirm before changing.
    height = 512
    width = 512

    if not isinstance(prompt, str) or len(prompt) < 5:
        raise ValueError("Invalid prompt. It needs to be a string with at least 5 characters")

    # Both images are decoded later by predict_fn; fail early with a clear
    # message instead of an obscure decoding error there.
    for field, value in (('init_image', init_image), ('mask_image', mask_image)):
        if not isinstance(value, str) or not value:
            raise ValueError(f"Invalid {field}. It needs to be a base64 encoded image")

    return prompt, init_image, mask_image, height, width
class NeuronUNet(nn.Module):
    """Adapter that lets a wrapped U-Net be called like the pipeline's U-Net.

    It exposes the ``config``, ``in_channels`` and ``device`` of the original
    U-Net and converts the tuple returned by ``unetwrap`` into a
    ``UNet2DConditionOutput``.
    """

    def __init__(self, unetwrap):
        """Store ``unetwrap`` and mirror the metadata of ``unetwrap.unet``."""
        super().__init__()
        self.unetwrap = unetwrap
        self.config = unetwrap.unet.config
        # Prefer the config entry: diffusers deprecates reading config values
        # as direct model attributes. Fall back to the attribute for U-Nets
        # whose config does not carry ``in_channels``.
        in_channels = getattr(self.config, 'in_channels', None)
        if in_channels is None:
            in_channels = unetwrap.unet.in_channels
        self.in_channels = in_channels
        self.device = unetwrap.unet.device

    def forward(self, sample, timestep, encoder_hidden_states, cross_attention_kwargs=None, return_dict=False):
        """Run the wrapped U-Net and return a ``UNet2DConditionOutput``.

        ``timestep`` is cast to float and expanded to one entry per batch
        element of ``sample`` before the call. ``cross_attention_kwargs`` and
        ``return_dict`` are accepted for signature compatibility but ignored:
        the result is always a ``UNet2DConditionOutput``.
        """
        batched_timestep = timestep.float().expand((sample.shape[0],))
        sample = self.unetwrap(sample, batched_timestep, encoder_hidden_states)[0]
        return UNet2DConditionOutput(sample=sample)
def custom_badbmm(a, b, scale=0.125):
    """Return the batched matrix product of ``a`` and ``b`` multiplied by ``scale``.

    Args:
        a: 3-D tensor accepted by ``torch.bmm`` as the left operand.
        b: 3-D tensor accepted by ``torch.bmm`` as the right operand.
        scale: Factor applied to the product. Defaults to ``0.125``, the
            value that used to be hard-coded here.

    Returns:
        ``torch.bmm(a, b) * scale``.
    """
    return torch.bmm(a, b) * scale