From b88c190221c17da40f1437c1fd32d17cd91bf6a1 Mon Sep 17 00:00:00 2001 From: Eddy Varela Date: Fri, 19 Dec 2025 23:01:52 -0800 Subject: [PATCH] Revert "Updated Dockerfiles (#172)" This reverts commit 9b5d2553d38a9a6c3b0db49006d6206e84567067. --- common/nxdt_requirements.txt | 8 +- jax/training/0.7/Dockerfile.neuronx | 224 ---------------- pytorch/inference/2.9.0/Dockerfile.neuronx | 240 ----------------- pytorch/training/2.9.0/Dockerfile.neuronx | 288 --------------------- vllm/inference/0.11.0/Dockerfile.neuronx | 242 ----------------- 5 files changed, 4 insertions(+), 998 deletions(-) delete mode 100644 jax/training/0.7/Dockerfile.neuronx delete mode 100644 pytorch/inference/2.9.0/Dockerfile.neuronx delete mode 100644 pytorch/training/2.9.0/Dockerfile.neuronx delete mode 100644 vllm/inference/0.11.0/Dockerfile.neuronx diff --git a/common/nxdt_requirements.txt b/common/nxdt_requirements.txt index d3bff55..4f40ab0 100644 --- a/common/nxdt_requirements.txt +++ b/common/nxdt_requirements.txt @@ -2,7 +2,7 @@ hydra-core>=1.3.0 omegaconf>=2.2,<2.3 pyyaml==6.0.1 torchmetrics>=0.4.1rc0,<=0.10.3 -transformers==4.56.* +transformers==4.52.4 wandb webdataset>=0.1.48,<=0.1.62 pandas @@ -22,7 +22,7 @@ ftfy gdown inflect jieba -opencc==1.1.9 +opencc==1.1.6 pangu rapidfuzz pybind11 @@ -39,6 +39,7 @@ python-daemon huggingface_hub>=0.27.1 multiprocess==0.70.16 numba<=0.60.0 +numpy>=1.24.3,<=1.25.2 rouge_score setuptools>=70.0 lightning==2.5.0 @@ -46,5 +47,4 @@ ml-dtypes==0.5.0 boto3==1.35.93 botocore==1.35.93 s3transfer==0.10.4 -s3fs -numpy==1.26.4 \ No newline at end of file +s3fs \ No newline at end of file diff --git a/jax/training/0.7/Dockerfile.neuronx b/jax/training/0.7/Dockerfile.neuronx deleted file mode 100644 index 7b2466e..0000000 --- a/jax/training/0.7/Dockerfile.neuronx +++ /dev/null @@ -1,224 +0,0 @@ -ARG BUILD_STAGE=prod - -FROM public.ecr.aws/docker/library/ubuntu:24.04 AS base - -LABEL dlc_major_version="1" -LABEL maintainer="Amazon AI" - -# This arg required to stop docker build waiting for region configuration while installing tz data from ubuntu 24 -ARG DEBIAN_FRONTEND=noninteractive -ARG PYTHON=python3.12 -ARG PYTHON_VERSION=3.12.11 -ARG PIP=pip3 -ARG OMPI_VERSION=4.1.5 -ARG PYPI_SIMPLE_URL="https://pypi.org/simple/" - -# Python won’t try to write .pyc or .pyo files on the import of source modules -# Force stdin, stdout and stderr to be totally unbuffered. Good for logging -ENV PYTHONDONTWRITEBYTECODE=1 -ENV PYTHONUNBUFFERED=1 -ENV PYTHONIOENCODING=UTF-8 -ENV LANG=C.UTF-8 -ENV LC_ALL=C.UTF-8 -ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/aws/neuron/lib" -ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/efa/lib" -ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/efa/lib64" -ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/openmpi/lib64" -ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/lib" -ENV PATH="/opt/aws/neuron/bin:${PATH}" - -RUN apt-get update \ - && apt-get upgrade -y \ - && apt-get install -y --no-install-recommends \ - build-essential \ - ca-certificates \ - cmake \ - curl \ - emacs \ - git \ - gnupg2 \ - gpg-agent \ - jq \ - libopencv-dev \ - libglib2.0-0 \ - libgl1-mesa-dri \ - libsm6 \ - libxext6 \ - libxrender-dev \ - libssl-dev \ - libsqlite3-dev \ - libgdbm-dev \ - libc6-dev \ - libbz2-dev \ - libncurses-dev \ - libffi-dev \ - libcap-dev \ - libhwloc-dev \ - openjdk-8-jdk-headless \ - openjdk-8-jdk \ - openjdk-8-jre \ - openjdk-11-jdk \ - openssl \ - software-properties-common \ - tk-dev \ - unzip \ - wget \ - vim \ - zlib1g-dev \ - && rm -rf /var/lib/apt/lists/* \ - && rm -rf /tmp/tmp* \ - && apt-get clean - -# Install Open MPI and configure SSH for MPI operator in k8s -RUN mkdir -p /tmp/openmpi \ - && cd /tmp/openmpi \ - && wget --quiet https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OMPI_VERSION}.tar.gz \ - && tar zxf openmpi-${OMPI_VERSION}.tar.gz \ - && cd openmpi-${OMPI_VERSION} \ - && ./configure --enable-orterun-prefix-by-default \ - && make -j $(nproc) all \ - && make install \ - && ldconfig \ - && rm -rf /tmp/openmpi - -# Install packages and configure SSH for MPI operator in k8s -RUN apt-get update \ - && apt-get install -y openmpi-bin openssh-server \ - && mkdir -p /var/run/sshd \ - && echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config \ - && echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config \ - && sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config \ - && rm -rf /var/lib/apt/lists/* \ - && rm -rf /tmp/tmp* \ - && apt-get clean - -# Install Python -RUN wget -q https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSION.tgz \ - && tar -xzf Python-$PYTHON_VERSION.tgz \ - && cd Python-$PYTHON_VERSION \ - && ./configure --enable-shared --prefix=/usr/local \ - && make -j $(nproc) && make install \ - && cd .. && rm -rf ../Python-$PYTHON_VERSION* \ - && ln -s /usr/local/bin/pip3 /usr/bin/pip \ - && ln -s /usr/local/bin/$PYTHON /usr/local/bin/python \ - && ${PIP} --no-cache-dir install --upgrade \ - "awscli<2" \ - pip \ - requests \ - setuptools \ - && rm -rf ~/.cache/pip/* - -# U24 will not allow installation of pip packages outside of venv without this flag -# This is because U24 ships with Python 3.12 by default and installation into the Python -# interpreter’s directory are disabled outside of a virtual environment. -# https://peps.python.org/pep-0668/ -RUN ${PIP} config set global.break-system-packages true - -# Install EFA -RUN apt-get update \ - && cd $HOME \ - && curl -O https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz \ - && wget https://efa-installer.amazonaws.com/aws-efa-installer.key && gpg --import aws-efa-installer.key \ - && cat aws-efa-installer.key | gpg --fingerprint \ - && wget https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz.sig && gpg --verify ./aws-efa-installer-latest.tar.gz.sig \ - && tar -xf aws-efa-installer-latest.tar.gz \ - && cd aws-efa-installer \ - && ./efa_installer.sh -y -g --skip-kmod --skip-limit-conf --no-verify \ - && cd $HOME \ - && rm -rf /var/lib/apt/lists/* \ - && rm -rf /tmp/tmp* \ - && apt-get clean - -WORKDIR / - -# The ENV variables declared below are changed in the previous section -# Grouping these ENV variables in the first section causes -# ompi_info to fail. This is only observed in CPU containers -ENV PATH="$PATH:/home/.openmpi/bin" -ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/.openmpi/lib/" -RUN ompi_info --parsable --all | grep mpi_built_with_cuda_support:value - -RUN mkdir -p /etc/pki/tls/certs && cp /etc/ssl/certs/ca-certificates.crt /etc/pki/tls/certs/ca-bundle.crt - -# Copy workaround script for incorrect hostname -COPY changehostname.c / -COPY --chmod=755 start_with_right_hostname.sh deep_learning_container.py /usr/local/bin/ - -RUN HOME_DIR=/root \ - && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \ - && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \ - && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \ - && chmod +x /usr/local/bin/testOSSCompliance \ - && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \ - && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \ - && rm -rf ${HOME_DIR}/oss_compliance* \ - && rm -rf /tmp/tmp* - -# Setting up APT and PIP repo for neuron artifacts -ARG NEURON_APT_REPO=apt.repos.neuron.amazonaws.com -ARG NEURON_APT_REPO_KEY -ARG NEURON_PIP_REPO=pip.repos.neuron.amazonaws.com -ARG NEURON_PIP_REPO_KEY -RUN mkdir -p /etc/apt/keyrings \ - && APT_REPO_PREFIX=$([ -n "${NEURON_APT_REPO_KEY}" ] && echo "${NEURON_APT_REPO_KEY}@" || echo "") \ - && echo "deb [signed-by=/etc/apt/keyrings/neuron.gpg] https://${APT_REPO_PREFIX}${NEURON_APT_REPO} jammy main" > /etc/apt/sources.list.d/neuron.list \ - && curl $([ -n "${NEURON_APT_REPO_KEY}" ] && echo "-u ${NEURON_APT_REPO_KEY}") --retry 3 --retry-delay 1 --retry-all-errors -fSL "https://${NEURON_APT_REPO}/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB" | gpg --dearmor > /etc/apt/keyrings/neuron.gpg - -# Neuron SDK components version numbers -ARG NEURONX_RUNTIME_LIB_VERSION=2.29.40.0-f954cd7a5 -ARG NEURONX_COLLECTIVES_LIB_VERSION=2.29.41.0-681fef5f5 -ARG NEURONX_TOOLS_VERSION=2.27.33.0-5d9c0b901 -ARG NEURONX_CC_VERSION=2.22.12471.0+b4a00d10 -ARG NEURONX_JAX_TRAINING_VERSION=0.7.0.1.0.7377+5e6a4049 - -FROM base AS repo - -# Install Neuron components from the apt and pip repos (latest versions) -RUN apt-get update \ - && apt-get install -y \ - aws-neuronx-tools \ - aws-neuronx-collectives \ - aws-neuronx-runtime-lib \ - && rm -rf /var/lib/apt/lists/* \ - && rm -rf /tmp/tmp* \ - && apt-get clean - -RUN PIP_REPO_URL=$([ -n "${NEURON_PIP_REPO_KEY}" ] && echo "https://${NEURON_PIP_REPO_KEY}@${NEURON_PIP_REPO}" || echo "https://${NEURON_PIP_REPO}") \ - && ${PIP} install --no-cache-dir --force-reinstall \ - --index-url ${PIP_REPO_URL} \ - --extra-index-url ${PYPI_SIMPLE_URL} \ - --trusted-host ${NEURON_PIP_REPO} \ - "neuronx-cc>=2.0" \ - jax-neuronx \ - && rm -rf ~/.cache/pip/* - -FROM base AS prod - -# Install Neuron components -# Install Neuron Driver, Runtime and Tools -RUN apt-get update \ - && apt-get install -y \ - aws-neuronx-tools=$NEURONX_TOOLS_VERSION \ - aws-neuronx-collectives=$NEURONX_COLLECTIVES_LIB_VERSION \ - aws-neuronx-runtime-lib=$NEURONX_RUNTIME_LIB_VERSION \ - && rm -rf /var/lib/apt/lists/* \ - && rm -rf /tmp/tmp* \ - && apt-get clean - -# Install JAX & Neuron CC -RUN PIP_REPO_URL=$([ -n "${NEURON_PIP_REPO_KEY}" ] && echo "https://${NEURON_PIP_REPO_KEY}@${NEURON_PIP_REPO}" || echo "https://${NEURON_PIP_REPO}") \ - && ${PIP} install --no-cache-dir --force-reinstall \ - --index-url ${PIP_REPO_URL} \ - --trusted-host ${NEURON_PIP_REPO} \ - --extra-index-url ${PYPI_SIMPLE_URL} \ - neuronx-cc==$NEURONX_CC_VERSION \ - jax-neuronx==$NEURONX_JAX_TRAINING_VERSION \ - && rm -rf ~/.cache/pip/* - -FROM ${BUILD_STAGE} AS final - -# Starts framework -ENTRYPOINT ["bash", "-m", "start_with_right_hostname.sh"] -CMD ["/bin/bash"] - -HEALTHCHECK CMD curl --fail http://localhost:8080/ping || exit 1 diff --git a/pytorch/inference/2.9.0/Dockerfile.neuronx b/pytorch/inference/2.9.0/Dockerfile.neuronx deleted file mode 100644 index 0da769c..0000000 --- a/pytorch/inference/2.9.0/Dockerfile.neuronx +++ /dev/null @@ -1,240 +0,0 @@ -ARG BUILD_STAGE=prod - -FROM public.ecr.aws/docker/library/ubuntu:24.04 AS base - -LABEL dlc_major_version="1" -LABEL maintainer="Amazon AI" -LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port=true - -ARG DEBIAN_FRONTEND=noninteractive -ARG PIP=pip3 -ARG PYTHON=python3.12 -ARG PYTHON_VERSION=3.12.11 -ARG TORCHSERVE_VERSION=0.11.0 -ARG SM_TOOLKIT_VERSION=2.0.25 -ARG PYPI_SIMPLE_URL="https://pypi.org/simple/" - -# See http://bugs.python.org/issue19846 -ENV LANG=C.UTF-8 -ENV LD_LIBRARY_PATH=/opt/aws/neuron/lib:/lib/x86_64-linux-gnu:/opt/conda/lib/:$LD_LIBRARY_PATH -ENV PATH=/opt/conda/bin:/opt/aws/neuron/bin:$PATH - -RUN apt-get update \ - && apt-get upgrade -y \ - && apt-get install -y --no-install-recommends \ - apt-transport-https \ - build-essential \ - ca-certificates \ - cmake \ - curl \ - dmidecode \ - emacs \ - environment-modules \ - ethtool \ - git \ - gnupg2 \ - gpg-agent \ - iproute2 \ - jq \ - libevent-core-2.1-7t64 \ - libevent-pthreads-2.1-7t64 \ - libgl1-mesa-dri \ - libglib2.0-0 \ - libnl-3-200 \ - libnl-3-dev \ - libnl-route-3-200 \ - libnl-route-3-dev \ - libsm6 \ - libxext6 \ - libxrender-dev \ - libcap-dev \ - libhwloc-dev \ - openjdk-11-jdk \ - openssh-client \ - pciutils \ - tcl \ - udev \ - unzip \ - vim \ - wget \ - zlib1g-dev \ - && rm -rf /var/lib/apt/lists/* \ - && rm -rf /tmp/tmp* \ - && apt-get clean - -# https://github.com/docker-library/openjdk/issues/261 https://github.com/docker-library/openjdk/pull/263/files -RUN keytool -importkeystore -srckeystore /etc/ssl/certs/java/cacerts -destkeystore /etc/ssl/certs/java/cacerts.jks -deststoretype JKS -srcstorepass changeit -deststorepass changeit -noprompt; \ - mv /etc/ssl/certs/java/cacerts.jks /etc/ssl/certs/java/cacerts; \ - /var/lib/dpkg/info/ca-certificates-java.postinst configure; - -RUN curl -L -o ~/miniforge.sh https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-x86_64.sh \ - && chmod +x ~/miniforge.sh \ - && ~/miniforge.sh -b -p /opt/conda \ - && rm ~/miniforge.sh \ - && /opt/conda/bin/conda update -y conda \ - && /opt/conda/bin/mamba install -c conda-forge -y \ - python=$PYTHON_VERSION \ - pyopenssl \ - cython \ - mkl-include \ - mkl \ - parso \ - typing \ - # Below 2 are included in miniconda base, but not mamba so need to install - conda-content-trust \ - charset-normalizer \ - && /opt/conda/bin/conda clean -ya - -RUN ${PIP} config set global.break-system-packages true - -RUN /opt/conda/bin/mamba install -c conda-forge \ - python=$PYTHON_VERSION \ - scikit-learn \ - h5py \ - requests \ - && conda clean -ya \ - && ${PIP} install --upgrade pip --no-cache-dir -U \ - --trusted-host pypi.org --trusted-host files.pythonhosted.org \ - && ln -s /opt/conda/bin/pip /usr/local/bin/pip3 \ - && ${PIP} install --no-cache-dir -U \ - packaging \ - enum-compat \ - ipython \ - && rm -rf ~/.cache/pip/* - -# Install EFA -RUN apt-get update \ - && cd $HOME \ - && curl -O https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz \ - && wget https://efa-installer.amazonaws.com/aws-efa-installer.key && gpg --import aws-efa-installer.key \ - && cat aws-efa-installer.key | gpg --fingerprint \ - && wget https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz.sig && gpg --verify ./aws-efa-installer-latest.tar.gz.sig \ - && tar -xf aws-efa-installer-latest.tar.gz \ - && cd aws-efa-installer \ - && ./efa_installer.sh -y -g --skip-kmod --skip-limit-conf --no-verify \ - && cd $HOME \ - && rm -rf /var/lib/apt/lists/* \ - && rm -rf /tmp/tmp* \ - && apt-get clean - -RUN ${PIP} install --upgrade pip --trusted-host pypi.org --trusted-host files.pythonhosted.org \ - && ${PIP} install --no-cache-dir -U \ - "opencv-python>=4.8.1.78" \ - "scipy>=1.8.0" \ - six \ - "awscli<2" \ - pandas \ - boto3 \ - cryptography \ - "protobuf>=3.18.3,<4" \ - torchserve==${TORCHSERVE_VERSION} \ - torch-model-archiver==${TORCHSERVE_VERSION} \ - && rm -rf ~/.cache/pip/* - -ENV SAGEMAKER_SERVING_MODULE=sagemaker_pytorch_serving_container.serving:main -ENV TEMP=/home/model-server/tmp - -RUN useradd -m model-server \ - && mkdir -p /home/model-server/tmp /opt/ml/model \ - && chown -R model-server /home/model-server /opt/ml/model - -COPY --chmod=755 neuron-entrypoint.py /usr/local/bin/dockerd-entrypoint.py -COPY --chmod=755 neuron-monitor.sh deep_learning_container.py /usr/local/bin/ -COPY --chmod=755 torchserve-neuron.sh /usr/local/bin/entrypoint.sh -COPY config.properties /home/model-server - -RUN ${PIP} install --no-cache-dir "sagemaker-pytorch-inference==${SM_TOOLKIT_VERSION}" \ - # patch default_pytorch_inference_handler.py to import torch_neuronx - && DEST_DIR=$(python -c "import os.path, sagemaker_pytorch_serving_container; print(os.path.dirname(sagemaker_pytorch_serving_container.__file__))") \ - && DEST_FILE=${DEST_DIR}/default_pytorch_inference_handler.py \ - && sed -i "s/import torch/import torch, torch_neuronx/" ${DEST_FILE} \ - && rm -rf ~/.cache/pip/* - -# Compliance -RUN HOME_DIR=/root \ - && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \ - && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \ - && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \ - && chmod +x /usr/local/bin/testOSSCompliance \ - && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \ - && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \ - && rm -rf ${HOME_DIR}/oss_compliance* \ - # conda leaves an empty /root/.cache/conda/notices.cache file which is not removed by conda clean -ya - && rm -rf ${HOME_DIR}/.cache/conda - -RUN curl -o /license.txt https://aws-dlc-licenses.s3.amazonaws.com/pytorch-2.9/license.txt - -# Setting up APT and PIP repo for neuron artifacts -ARG NEURON_APT_REPO=apt.repos.neuron.amazonaws.com -ARG NEURON_APT_REPO_KEY -ARG NEURON_PIP_REPO=pip.repos.neuron.amazonaws.com -ARG NEURON_PIP_REPO_KEY -RUN mkdir -p /etc/apt/keyrings \ - && APT_REPO_PREFIX=$([ -n "${NEURON_APT_REPO_KEY}" ] && echo "${NEURON_APT_REPO_KEY}@" || echo "") \ - && echo "deb [signed-by=/etc/apt/keyrings/neuron.gpg] https://${APT_REPO_PREFIX}${NEURON_APT_REPO} jammy main" > /etc/apt/sources.list.d/neuron.list \ - && curl $([ -n "${NEURON_APT_REPO_KEY}" ] && echo "-u ${NEURON_APT_REPO_KEY}") --retry 3 --retry-delay 1 --retry-all-errors -fSL "https://${NEURON_APT_REPO}/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB" | gpg --dearmor > /etc/apt/keyrings/neuron.gpg - -# Neuron SDK components version numbers -ARG NEURONX_COLLECTIVES_LIB_VERSION=2.29.41.0-681fef5f5 -ARG NEURONX_RUNTIME_LIB_VERSION=2.29.40.0-f954cd7a5 -ARG NEURONX_TOOLS_VERSION=2.27.33.0-5d9c0b901 - -ARG NEURONX_CC_VERSION=2.22.12471.0+b4a00d10 -ARG NEURONX_FRAMEWORK_VERSION=2.9.0.2.11.19912+e48cd891 -ARG NEURONX_DISTRIBUTED_VERSION=0.16.25997+f431c02e -ARG NEURONX_DISTRIBUTED_INFERENCE_VERSION=0.7.14366+9adb71b8 - -FROM base AS repo - -# Install Neuron components from the apt and pip repos (latest versions) -RUN apt-get update \ - && apt-get install -y \ - aws-neuronx-tools \ - aws-neuronx-collectives \ - aws-neuronx-runtime-lib \ - && rm -rf /var/lib/apt/lists/* \ - && rm -rf /tmp/tmp* \ - && apt-get clean - -RUN PIP_REPO_URL=$([ -n "${NEURON_PIP_REPO_KEY}" ] && echo "https://${NEURON_PIP_REPO_KEY}@${NEURON_PIP_REPO}" || echo "https://${NEURON_PIP_REPO}") \ - && ${PIP} install --no-cache-dir \ - --index-url ${PIP_REPO_URL} \ - --trusted-host ${NEURON_PIP_REPO} \ - --extra-index-url ${PYPI_SIMPLE_URL} \ - "neuronx-cc>=2.0" \ - "torch-neuronx==2.9.*" \ - neuronx_distributed \ - neuronx_distributed_inference \ - && rm -rf ~/.cache/pip/* - -FROM base AS prod - -# Install Neuron components with specific versions -RUN apt-get update \ - && apt-get install -y \ - aws-neuronx-tools=$NEURONX_TOOLS_VERSION \ - aws-neuronx-collectives=$NEURONX_COLLECTIVES_LIB_VERSION \ - aws-neuronx-runtime-lib=$NEURONX_RUNTIME_LIB_VERSION \ - && rm -rf /var/lib/apt/lists/* \ - && rm -rf /tmp/tmp* \ - && apt-get clean - -RUN PIP_REPO_URL=$([ -n "${NEURON_PIP_REPO_KEY}" ] && echo "https://${NEURON_PIP_REPO_KEY}@${NEURON_PIP_REPO}" || echo "https://${NEURON_PIP_REPO}") \ - && ${PIP} install --no-cache-dir \ - --index-url ${PIP_REPO_URL} \ - --trusted-host ${NEURON_PIP_REPO} \ - --extra-index-url ${PYPI_SIMPLE_URL} \ - neuronx-cc==$NEURONX_CC_VERSION \ - torch-neuronx==$NEURONX_FRAMEWORK_VERSION \ - neuronx_distributed==$NEURONX_DISTRIBUTED_VERSION \ - neuronx_distributed_inference==$NEURONX_DISTRIBUTED_INFERENCE_VERSION \ - && rm -rf ~/.cache/pip/* - -FROM ${BUILD_STAGE} AS final - -EXPOSE 8080 8081 - -ENTRYPOINT ["python", "/usr/local/bin/dockerd-entrypoint.py"] -CMD ["/usr/local/bin/entrypoint.sh"] - -HEALTHCHECK CMD curl --fail http://localhost:8080/ping || exit 1 \ No newline at end of file diff --git a/pytorch/training/2.9.0/Dockerfile.neuronx b/pytorch/training/2.9.0/Dockerfile.neuronx deleted file mode 100644 index a69dc6b..0000000 --- a/pytorch/training/2.9.0/Dockerfile.neuronx +++ /dev/null @@ -1,288 +0,0 @@ -ARG BUILD_STAGE=prod - -FROM public.ecr.aws/docker/library/ubuntu:24.04 AS base - -LABEL maintainer="Amazon AI" -LABEL dlc_major_version="1" - -ARG PYTHON=python3.12 -ARG PYTHON_VERSION=3.12.11 -ARG PIP=pip3 -ARG OMPI_VERSION=4.1.5 -ARG PYPI_SIMPLE_URL="https://pypi.org/simple/" - -# This arg required to stop docker build waiting for region configuration while installing tz data from ubuntu 22 -ARG DEBIAN_FRONTEND=noninteractive - -# Python won’t try to write .pyc or .pyo files on the import of source modules -# Force stdin, stdout and stderr to be totally unbuffered. Good for logging -ENV PYTHONDONTWRITEBYTECODE=1 -ENV PYTHONUNBUFFERED=1 -ENV PYTHONIOENCODING=UTF-8 -ENV LANG=C.UTF-8 -ENV LC_ALL=C.UTF-8 -ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/aws/neuron/lib" -ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/efa/lib" -ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/efa/lib64" -ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/openmpi/lib64" -ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/lib" -ENV PATH="/opt/aws/neuron/bin:${PATH}" -ENV SAGEMAKER_TRAINING_MODULE=sagemaker_pytorch_container.training:main -ENV DGLBACKEND=pytorch - -RUN apt-get update \ - && apt-get upgrade -y \ - && apt-get install -y --no-install-recommends \ - build-essential \ - ca-certificates \ - cmake \ - curl \ - emacs \ - git \ - gnupg2 \ - gpg-agent \ - jq \ - libopencv-dev \ - libglib2.0-0 \ - libgl1-mesa-dri \ - libsm6 \ - libxext6 \ - libxrender-dev \ - libssl-dev \ - libsqlite3-dev \ - libgdbm-dev \ - libc6-dev \ - libbz2-dev \ - libncurses-dev \ - libffi-dev \ - libcap-dev \ - libhwloc-dev \ - openjdk-8-jdk-headless \ - openjdk-8-jdk \ - openjdk-8-jre \ - openjdk-11-jdk \ - openssl \ - software-properties-common \ - tk-dev \ - unzip \ - wget \ - vim \ - zlib1g-dev \ - && rm -rf /tmp/tmp* \ - && rm -rf /var/lib/apt/lists/* \ - && apt-get clean - - -# Install Open MPI -RUN mkdir -p /tmp/openmpi \ - && cd /tmp/openmpi \ - && wget --quiet https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OMPI_VERSION}.tar.gz \ - && tar zxf openmpi-${OMPI_VERSION}.tar.gz \ - && cd openmpi-${OMPI_VERSION} \ - && ./configure --enable-orterun-prefix-by-default \ - && make -j $(nproc) all \ - && make install \ - && ldconfig \ - && rm -rf /tmp/openmpi - - # Install packages an configure SSH for MPI in K8s -RUN apt-get update && apt-get install -y openmpi-bin openssh-server \ - && mkdir -p /var/run/sshd \ - && echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config \ - && echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config \ - && sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config \ - && rm -rf /var/lib/apt/lists/* \ - && rm -rf /tmp/tmp* \ - && apt-get clean - - - # Install Python -RUN wget -q https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSION.tgz \ - && tar -xzf Python-$PYTHON_VERSION.tgz \ - && cd Python-$PYTHON_VERSION \ - && ./configure --enable-shared --prefix=/usr/local \ - && make -j $(nproc) && make install \ - && cd .. && rm -rf ../Python-$PYTHON_VERSION* \ - && ln -s /usr/local/bin/pip3 /usr/bin/pip \ - && ln -s /usr/local/bin/$PYTHON /usr/local/bin/python \ - && ${PIP} --no-cache-dir install --upgrade pip \ - && rm -rf ~/.cache/pip/* - -WORKDIR / - -# The ENV variables declared below are changed in the previous section -# Grouping these ENV variables in the first section causes -# ompi_info to fail. This is only observed in CPU containers -ENV PATH="$PATH:/home/.openmpi/bin" -ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/.openmpi/lib/" -RUN ompi_info --parsable --all | grep mpi_built_with_cuda_support:value - -RUN ${PIP} install --no-cache-dir -U \ - "bokeh>=2.3,<3" \ - "awscli<2" \ - scipy \ - click \ - "cryptography" \ - "sagemaker>=2,<3" \ - "sagemaker-pytorch-training" \ - psutil==5.6.7 \ - dataset \ - Pillow \ - && rm -rf ~/.cache/pip/* - -RUN mkdir -p /etc/pki/tls/certs && cp /etc/ssl/certs/ca-certificates.crt /etc/pki/tls/certs/ca-bundle.crt - -# Copy the NxDT Installation files -COPY --chmod=755 apex_setup.py nxdt_install_setup.sh nxdt_requirements.txt /root/ - -# attrs, neuronx-cc required: >=19.2.0, sagemaker <24,>=23.1.0 -# protobuf neuronx-cc<4, sagemaker-training >=3.9.2,<=3.20.3 -# awscli 1.25.47 has requirement docutils<0.17,>=0.10 -# etcd for kubernetes installation -# awscli 1.27.127 has requirement rsa<4.8,>=3.1.2, but you have rsa 4.9. -# awscli 1.27.127 requires urllib3 < 1.27, python-etcd requires urllib3 >= 1.7, latest urllib3 release is 2.0.2 -RUN ${PIP} install --no-cache-dir -U \ - "attrs<24,>=23.1.0" \ - "docutils>=0.10,<0.17" \ - "rsa<4.8,>=3.1.2" \ - "python-etcd" \ - "urllib3>=1.26.0,<1.27" \ - # Install extra packages needed by sagemaker (for passing test_utility_packages_using_import) - && ${PIP} install --no-cache-dir -U \ - "bokeh>=3.0.1,<4" \ - "imageio>=2.22,<3" \ - "opencv-python>=4.8.1.78" \ - "plotly>=5.11,<6" \ - "seaborn>=0.12,<1" \ - "shap>=0.41,<1" \ - && rm -rf ~/.cache/pip/* - -# EFA Installer does apt get. Make sure to run apt update before that -RUN apt-get update \ - && cd $HOME \ - && curl -O https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz \ - && wget https://efa-installer.amazonaws.com/aws-efa-installer.key && gpg --import aws-efa-installer.key \ - && cat aws-efa-installer.key | gpg --fingerprint \ - && wget https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz.sig && gpg --verify ./aws-efa-installer-latest.tar.gz.sig \ - && tar -xf aws-efa-installer-latest.tar.gz \ - && cd aws-efa-installer \ - && ./efa_installer.sh -y -g --skip-kmod --skip-limit-conf --no-verify \ - && cd $HOME \ - && rm -rf /var/lib/apt/lists/* \ - && rm -rf /tmp/tmp* \ - && apt-get clean - -# Install some common packages used by training scripts -# Needed for running bert training scripts -RUN pip3 install --no-cache-dir -U \ - graphviz \ - tensorboard==2.6 \ - accelerate \ - # Install NxDT dependencies - && ${PIP} install --no-cache-dir \ - Cython \ - wheel \ - && rm -rf ~/.cache/pip/* - -# Copy workaround script for incorrect hostname -COPY changehostname.c / -COPY --chmod=755 start_with_right_hostname.sh deep_learning_container.py /usr/local/bin/ - -RUN HOME_DIR=/root \ - && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \ - && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \ - && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \ - && chmod +x /usr/local/bin/testOSSCompliance \ - && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \ - && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \ - && rm -rf ${HOME_DIR}/oss_compliance* \ - && rm -rf /tmp/tmp* - -RUN curl -o /license.txt https://aws-dlc-licenses.s3.amazonaws.com/pytorch-2.9/license.txt - -# Setting up APT and PIP repo for neuron artifacts -ARG NEURON_APT_REPO=apt.repos.neuron.amazonaws.com -ARG NEURON_APT_REPO_KEY -ARG NEURON_PIP_REPO=pip.repos.neuron.amazonaws.com -ARG NEURON_PIP_REPO_KEY -RUN mkdir -p /etc/apt/keyrings \ - && APT_REPO_PREFIX=$([ -n "${NEURON_APT_REPO_KEY}" ] && echo "${NEURON_APT_REPO_KEY}@" || echo "") \ - && echo "deb [signed-by=/etc/apt/keyrings/neuron.gpg] https://${APT_REPO_PREFIX}${NEURON_APT_REPO} jammy main" > /etc/apt/sources.list.d/neuron.list \ - && curl $([ -n "${NEURON_APT_REPO_KEY}" ] && echo "-u ${NEURON_APT_REPO_KEY}") --retry 3 --retry-delay 1 --retry-all-errors -fSL "https://${NEURON_APT_REPO}/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB" | gpg --dearmor > /etc/apt/keyrings/neuron.gpg - -# Neuron SDK components -ARG NEURONX_COLLECTIVES_LIB_VERSION=2.29.41.0-681fef5f5 -ARG NEURONX_RUNTIME_LIB_VERSION=2.29.40.0-f954cd7a5 -ARG NEURONX_TOOLS_VERSION=2.27.33.0-5d9c0b901 -ARG NEURONX_FRAMEWORK_VERSION=2.9.0.2.11.19912+e48cd891 -ARG NEURONX_CC_VERSION=2.22.12471.0+b4a00d10 -ARG NEURONX_DISTRIBUTED_VERSION=0.16.25997+f431c02e -ARG NEURONX_DISTRIBUTED_TRAINING_VERSION=1.7.0 - -FROM base AS repo - -# Install Neuron components from the apt and pip repos (latest versions) -RUN apt-get update \ - && apt-get install -y \ - aws-neuronx-tools \ - aws-neuronx-collectives \ - aws-neuronx-runtime-lib \ - && rm -rf /var/lib/apt/lists/* \ - && rm -rf /tmp/tmp* \ - && apt-get clean - -RUN PIP_REPO_URL=$([ -n "${NEURON_PIP_REPO_KEY}" ] && echo "https://${NEURON_PIP_REPO_KEY}@${NEURON_PIP_REPO}" || echo "https://${NEURON_PIP_REPO}") \ - && ${PIP} install --no-cache-dir --force-reinstall \ - --index-url ${PIP_REPO_URL} \ - --trusted-host ${NEURON_PIP_REPO} \ - --extra-index-url ${PYPI_SIMPLE_URL} \ - "torch-neuronx==2.9.*" \ - "neuronx-cc>=2.0" \ - neuronx_distributed \ - neuronx_distributed_training \ - && rm -rf ~/.cache/pip/* - -FROM base AS prod - -# Install Neuron components with specific versions -RUN apt-get update \ - && apt-get install -y \ - aws-neuronx-tools=$NEURONX_TOOLS_VERSION \ - aws-neuronx-collectives=$NEURONX_COLLECTIVES_LIB_VERSION \ - aws-neuronx-runtime-lib=$NEURONX_RUNTIME_LIB_VERSION \ - && rm -rf /var/lib/apt/lists/* \ - && rm -rf /tmp/tmp* \ - && apt-get clean - -RUN PIP_REPO_URL=$([ -n "${NEURON_PIP_REPO_KEY}" ] && echo "https://${NEURON_PIP_REPO_KEY}@${NEURON_PIP_REPO}" || echo "https://${NEURON_PIP_REPO}") \ - && ${PIP} install --no-cache-dir --force-reinstall \ - --index-url ${PIP_REPO_URL} \ - --trusted-host ${NEURON_PIP_REPO} \ - --extra-index-url ${PYPI_SIMPLE_URL} \ - torch-neuronx==$NEURONX_FRAMEWORK_VERSION \ - neuronx-cc==$NEURONX_CC_VERSION \ - neuronx_distributed==$NEURONX_DISTRIBUTED_VERSION \ - neuronx_distributed_training==$NEURONX_DISTRIBUTED_TRAINING_VERSION \ - && rm -rf ~/.cache/pip/* - -FROM ${BUILD_STAGE} AS final - -## Installation for Neuronx Distributed Training framework -# Clone and build Apex -RUN git clone https://github.com/NVIDIA/apex.git /root/apex \ - && cd /root/apex \ - && git checkout 25.07 \ - && cp /root/apex_setup.py setup.py \ - # Install dependencies from requirements and extras for SageMaker usecase - && ${PIP} install --no-cache-dir --no-build-isolation -r /root/nxdt_requirements.txt /root/apex \ - && /root/nxdt_install_setup.sh \ - && ${PIP} install --force-reinstall \ - "torch==2.9.0" \ - torchvision \ - && rm -rf ~/.cache/pip/* - -# Starts framework -ENTRYPOINT ["bash", "-m", "start_with_right_hostname.sh"] -CMD ["/bin/bash"] - -HEALTHCHECK CMD curl --fail http://localhost:8080/ping || exit 1 \ No newline at end of file diff --git a/vllm/inference/0.11.0/Dockerfile.neuronx b/vllm/inference/0.11.0/Dockerfile.neuronx deleted file mode 100644 index 3ae8f08..0000000 --- a/vllm/inference/0.11.0/Dockerfile.neuronx +++ /dev/null @@ -1,242 +0,0 @@ -ARG BUILD_STAGE=prod - -FROM public.ecr.aws/docker/library/ubuntu:24.04 AS base - -LABEL dlc_major_version="1" -LABEL maintainer="Amazon AI" - -ARG DEBIAN_FRONTEND=noninteractive -ARG PIP=pip3 -ARG PYTHON=python3.12 -ARG PYTHON_VERSION=3.12.11 -ARG TORCHSERVE_VERSION=0.11.0 -ARG PYPI_SIMPLE_URL="https://pypi.org/simple/" - - -# See http://bugs.python.org/issue19846 -ENV LANG=C.UTF-8 -ENV LD_LIBRARY_PATH=/opt/aws/neuron/lib:/lib/x86_64-linux-gnu:/opt/conda/lib/:$LD_LIBRARY_PATH -ENV PATH=/opt/conda/bin:/opt/aws/neuron/bin:$PATH - -RUN apt-get update \ - && apt-get upgrade -y \ - && apt-get install -y --no-install-recommends \ - apt-transport-https \ - build-essential \ - ca-certificates \ - cmake \ - curl \ - emacs \ - ffmpeg \ - gcc \ - git \ - gnupg2 \ - gpg-agent \ - jq \ - libgl1 \ - libgl1-mesa-dri \ - libglib2.0-0 \ - libsm6 \ - libxext6 \ - libxrender-dev \ - libcap-dev \ - libhwloc-dev \ - openssh-client \ - openjdk-11-jdk \ - unzip \ - vim \ - wget \ - zlib1g-dev \ - && rm -rf /var/lib/apt/lists/* \ - && rm -rf /tmp/tmp* \ - && apt-get clean - - -# https://github.com/docker-library/openjdk/issues/261 https://github.com/docker-library/openjdk/pull/263/files -RUN keytool -importkeystore -srckeystore /etc/ssl/certs/java/cacerts -destkeystore /etc/ssl/certs/java/cacerts.jks -deststoretype JKS -srcstorepass changeit -deststorepass changeit -noprompt; \ - mv /etc/ssl/certs/java/cacerts.jks /etc/ssl/certs/java/cacerts; \ - /var/lib/dpkg/info/ca-certificates-java.postinst configure; - -RUN curl -L -o ~/miniforge.sh https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-x86_64.sh \ - && chmod +x ~/miniforge.sh \ - && ~/miniforge.sh -b -p /opt/conda \ - && rm ~/miniforge.sh \ - && /opt/conda/bin/conda update -y conda \ - && /opt/conda/bin/mamba install -c conda-forge -y \ - python=$PYTHON_VERSION \ - pyopenssl \ - cython \ - mkl-include \ - mkl \ - parso \ - typing \ - # Below 2 are included in miniconda base, but not mamba so need to install - conda-content-trust \ - charset-normalizer \ - && /opt/conda/bin/conda clean -ya - -RUN /opt/conda/bin/mamba install -c conda-forge \ - python=$PYTHON_VERSION \ - scikit-learn \ - h5py \ - requests \ - && conda clean -ya \ - && pip install --upgrade pip \ - --trusted-host pypi.org --trusted-host files.pythonhosted.org \ - && ln -s /opt/conda/bin/pip /usr/local/bin/pip3 \ - && pip install \ - enum-compat \ - ipython \ - && rm -rf ~/.cache/pip/* - -# Install EFA -RUN apt-get update \ - && cd $HOME \ - && curl -O https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz \ - && wget https://efa-installer.amazonaws.com/aws-efa-installer.key && gpg --import aws-efa-installer.key \ - && cat aws-efa-installer.key | gpg --fingerprint \ - && wget https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz.sig && gpg --verify ./aws-efa-installer-latest.tar.gz.sig \ - && tar -xf aws-efa-installer-latest.tar.gz \ - && cd aws-efa-installer \ - && ./efa_installer.sh -y -g --skip-kmod --skip-limit-conf --no-verify \ - && cd $HOME \ - && rm -rf /var/lib/apt/lists/* \ - && rm -rf /tmp/tmp* \ - && apt-get clean - -COPY --chmod=755 vllm_entrypoint.py neuron-monitor.sh deep_learning_container.py /usr/local/bin/ - -### Mount Point ### -# When launching the container, mount the code directory to /workspace -ARG APP_MOUNT=/workspace -VOLUME [ ${APP_MOUNT} ] -WORKDIR ${APP_MOUNT}/vllm - -RUN ${PIP} install --no-cache-dir -U \ - "opencv-python" \ - "awscli" \ - "pandas" \ - "boto3" \ - "cryptography" \ - "pytest" \ - "wheel" \ - "cmake>=3.26" \ - "setuptools-scm>=8" \ - "jinja2" \ - torchserve==${TORCHSERVE_VERSION} \ - torch-model-archiver==${TORCHSERVE_VERSION} \ - && rm -rf ~/.cache/pip/* - -RUN useradd -m model-server \ - && mkdir -p /home/model-server/tmp /opt/ml/model \ - && chown -R model-server /home/model-server /opt/ml/model -COPY config.properties /home/model-server - -# Compliance -RUN HOME_DIR=/root \ - && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \ - && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \ - && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \ - && chmod +x /usr/local/bin/testOSSCompliance \ - && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \ - && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \ - && rm -rf ${HOME_DIR}/oss_compliance* \ - # conda leaves an empty /root/.cache/conda/notices.cache file which is not removed by conda clean -ya - && rm -rf ${HOME_DIR}/.cache/conda - -# Setting up APT and PIP repo for neuron artifacts -ARG NEURON_APT_REPO=apt.repos.neuron.amazonaws.com -ARG NEURON_APT_REPO_KEY -ARG NEURON_PIP_REPO=pip.repos.neuron.amazonaws.com -ARG NEURON_PIP_REPO_KEY -RUN mkdir -p /etc/apt/keyrings \ - && APT_REPO_PREFIX=$([ -n "${NEURON_APT_REPO_KEY}" ] && echo "${NEURON_APT_REPO_KEY}@" || echo "") \ - && echo "deb [signed-by=/etc/apt/keyrings/neuron.gpg] https://${APT_REPO_PREFIX}${NEURON_APT_REPO} jammy main" > /etc/apt/sources.list.d/neuron.list \ - && curl $([ -n "${NEURON_APT_REPO_KEY}" ] && echo "-u ${NEURON_APT_REPO_KEY}") --retry 3 --retry-delay 1 --retry-all-errors -fSL "https://${NEURON_APT_REPO}/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB" | gpg --dearmor > /etc/apt/keyrings/neuron.gpg - -# Neuron SDK components version numbers -ARG NEURONX_COLLECTIVES_LIB_VERSION=2.29.41.0-681fef5f5 -ARG NEURONX_RUNTIME_LIB_VERSION=2.29.40.0-f954cd7a5 -ARG NEURONX_TOOLS_VERSION=2.27.33.0-5d9c0b901 -ARG NEURONX_CC_VERSION=2.22.12471.0+b4a00d10 -ARG NEURONX_FRAMEWORK_VERSION=2.8.0.2.10.16998+e9bf8a50 -ARG NEURONX_DISTRIBUTED_VERSION=0.16.25997+f431c02e -ARG NEURONX_DISTRIBUTED_INFERENCE_VERSION=0.7.14366+9adb71b8 - -# vLLM branch names -ARG VLLM_PRIVATE_BRANCH=neuron-release-2.27 -ARG VLLM_PUBLIC_BRANCH=0.2.1-lts - -FROM base AS vllm-clone - -RUN mkdir -p /root/.ssh && \ - echo "StrictHostKeyChecking no" >> /root/.ssh/config && \ - ssh-keyscan -t rsa github.com >> /root/.ssh/known_hosts - -WORKDIR /vllm - -RUN --mount=type=secret,id=ssh_key,target=/root/.ssh/id_ed25519,mode=0600 \ - git clone -b ${VLLM_PRIVATE_BRANCH} git@github.com:aws-neuron/private-vllm-neuron.git . - -FROM base AS repo - - -# Install Neuron components from the apt and pip repos (latest versions) -RUN apt-get update \ - && apt-get install -y \ - aws-neuronx-tools \ - aws-neuronx-collectives \ - aws-neuronx-runtime-lib \ - && rm -rf /var/lib/apt/lists/* \ - && rm -rf /tmp/tmp* \ - && apt-get clean - -# Install VLLM from source -COPY --from=vllm-clone /vllm /opt/vllm - -RUN PIP_REPO_URL=$([ -n "${NEURON_PIP_REPO_KEY}" ] && echo "https://${NEURON_PIP_REPO_KEY}@${NEURON_PIP_REPO}" || echo "https://${NEURON_PIP_REPO}") \ - && ${PIP} install --no-cache-dir \ - --index-url ${PIP_REPO_URL} \ - --trusted-host ${NEURON_PIP_REPO} \ - --extra-index-url ${PYPI_SIMPLE_URL} \ - "neuronx-cc>=2.0" \ - "torch-neuronx==2.8.*" \ - neuronx_distributed \ - neuronx_distributed_inference \ - -e /opt/vllm \ - && rm -rf ~/.cache/pip/* - -FROM base AS prod - -# Install Neuron components with specific versions -RUN apt-get update \ - && apt-get install -y \ - aws-neuronx-tools=$NEURONX_TOOLS_VERSION \ - aws-neuronx-collectives=$NEURONX_COLLECTIVES_LIB_VERSION \ - aws-neuronx-runtime-lib=$NEURONX_RUNTIME_LIB_VERSION \ - && rm -rf /var/lib/apt/lists/* \ - && rm -rf /tmp/tmp* \ - && apt-get clean - -# Clone VLLM source before pip installations -RUN git clone -b ${VLLM_PUBLIC_BRANCH} https://github.com/vllm-project/vllm-neuron.git /opt/vllm - -RUN PIP_REPO_URL=$([ -n "${NEURON_PIP_REPO_KEY}" ] && echo "https://${NEURON_PIP_REPO_KEY}@${NEURON_PIP_REPO}" || echo "https://${NEURON_PIP_REPO}") \ - && ${PIP} install --no-cache-dir \ - --index-url ${PIP_REPO_URL} \ - --trusted-host ${NEURON_PIP_REPO} \ - --extra-index-url ${PYPI_SIMPLE_URL} \ - neuronx-cc==$NEURONX_CC_VERSION \ - torch-neuronx==$NEURONX_FRAMEWORK_VERSION \ - neuronx_distributed==$NEURONX_DISTRIBUTED_VERSION \ - neuronx_distributed_inference==$NEURONX_DISTRIBUTED_INFERENCE_VERSION \ - -e /opt/vllm \ - && rm -rf ~/.cache/pip/* - -FROM ${BUILD_STAGE} AS final - -EXPOSE 8080 8081 - -ENTRYPOINT ["python", "/usr/local/bin/vllm_entrypoint.py"] -CMD ["/bin/bash"] -HEALTHCHECK CMD curl --fail http://localhost:8080/ping || exit 1 \ No newline at end of file