Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions common/nxdt_requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ hydra-core>=1.3.0
omegaconf>=2.2,<2.3
pyyaml==6.0.1
torchmetrics>=0.4.1rc0,<=0.10.3
transformers==4.52.4
transformers==4.56.*
wandb
webdataset>=0.1.48,<=0.1.62
pandas
Expand All @@ -22,7 +22,7 @@ ftfy
gdown
inflect
jieba
opencc==1.1.6
opencc==1.1.9
pangu
rapidfuzz
pybind11
Expand All @@ -39,12 +39,12 @@ python-daemon
huggingface_hub>=0.27.1
multiprocess==0.70.16
numba<=0.60.0
numpy>=1.24.3,<=1.25.2
rouge_score
setuptools>=70.0
lightning==2.5.0
ml-dtypes==0.5.0
boto3==1.35.93
botocore==1.35.93
s3transfer==0.10.4
s3fs
s3fs
numpy==1.26.4
224 changes: 224 additions & 0 deletions jax/training/0.7/Dockerfile.neuronx
Original file line number Diff line number Diff line change
@@ -0,0 +1,224 @@
ARG BUILD_STAGE=prod

FROM public.ecr.aws/docker/library/ubuntu:24.04 AS base

LABEL dlc_major_version="1"
LABEL maintainer="Amazon AI"

# This arg required to stop docker build waiting for region configuration while installing tz data from ubuntu 24
ARG DEBIAN_FRONTEND=noninteractive
ARG PYTHON=python3.12
ARG PYTHON_VERSION=3.12.11
ARG PIP=pip3
ARG OMPI_VERSION=4.1.5
ARG PYPI_SIMPLE_URL="https://pypi.org/simple/"

# Python won’t try to write .pyc or .pyo files on the import of source modules
# Force stdin, stdout and stderr to be totally unbuffered. Good for logging
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1
ENV PYTHONIOENCODING=UTF-8
ENV LANG=C.UTF-8
ENV LC_ALL=C.UTF-8
ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/aws/neuron/lib"

Check warning on line 23 in jax/training/0.7/Dockerfile.neuronx

View workflow job for this annotation

GitHub Actions / build (0.7, jax/training/0.7)

Variables should be defined before their use

UndefinedVar: Usage of undefined variable '$LD_LIBRARY_PATH' More info: https://docs.docker.com/go/dockerfile/rule/undefined-var/
ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/efa/lib"
ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/efa/lib64"
ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/openmpi/lib64"
ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/lib"
ENV PATH="/opt/aws/neuron/bin:${PATH}"

RUN apt-get update \
&& apt-get upgrade -y \
&& apt-get install -y --no-install-recommends \
build-essential \
ca-certificates \
cmake \
curl \
emacs \
git \
gnupg2 \
gpg-agent \
jq \
libopencv-dev \
libglib2.0-0 \
libgl1-mesa-dri \
libsm6 \
libxext6 \
libxrender-dev \
libssl-dev \
libsqlite3-dev \
libgdbm-dev \
libc6-dev \
libbz2-dev \
libncurses-dev \
libffi-dev \
libcap-dev \
libhwloc-dev \
openjdk-8-jdk-headless \
openjdk-8-jdk \
openjdk-8-jre \
openjdk-11-jdk \
openssl \
software-properties-common \
tk-dev \
unzip \
wget \
vim \
zlib1g-dev \
&& rm -rf /var/lib/apt/lists/* \
&& rm -rf /tmp/tmp* \
&& apt-get clean

# Install Open MPI and configure SSH for MPI operator in k8s
RUN mkdir -p /tmp/openmpi \
&& cd /tmp/openmpi \
&& wget --quiet https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OMPI_VERSION}.tar.gz \
&& tar zxf openmpi-${OMPI_VERSION}.tar.gz \
&& cd openmpi-${OMPI_VERSION} \
&& ./configure --enable-orterun-prefix-by-default \
&& make -j $(nproc) all \
&& make install \
&& ldconfig \
&& rm -rf /tmp/openmpi

# Install packages and configure SSH for MPI operator in k8s
RUN apt-get update \
&& apt-get install -y openmpi-bin openssh-server \
&& mkdir -p /var/run/sshd \
&& echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config \
&& echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config \
&& sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config \
&& rm -rf /var/lib/apt/lists/* \
&& rm -rf /tmp/tmp* \
&& apt-get clean

# Install Python
RUN wget -q https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSION.tgz \
&& tar -xzf Python-$PYTHON_VERSION.tgz \
&& cd Python-$PYTHON_VERSION \
&& ./configure --enable-shared --prefix=/usr/local \
&& make -j $(nproc) && make install \
&& cd .. && rm -rf ../Python-$PYTHON_VERSION* \
&& ln -s /usr/local/bin/pip3 /usr/bin/pip \
&& ln -s /usr/local/bin/$PYTHON /usr/local/bin/python \
&& ${PIP} --no-cache-dir install --upgrade \
"awscli<2" \
pip \
requests \
setuptools \
&& rm -rf ~/.cache/pip/*

# U24 will not allow installation of pip packages outside of venv without this flag
# This is because U24 ships with Python 3.12 by default and installation into the Python
# interpreter’s directory are disabled outside of a virtual environment.
# https://peps.python.org/pep-0668/
RUN ${PIP} config set global.break-system-packages true

# Install EFA
RUN apt-get update \
&& cd $HOME \
&& curl -O https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz \
&& wget https://efa-installer.amazonaws.com/aws-efa-installer.key && gpg --import aws-efa-installer.key \
&& cat aws-efa-installer.key | gpg --fingerprint \
&& wget https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz.sig && gpg --verify ./aws-efa-installer-latest.tar.gz.sig \
&& tar -xf aws-efa-installer-latest.tar.gz \
&& cd aws-efa-installer \
&& ./efa_installer.sh -y -g --skip-kmod --skip-limit-conf --no-verify \
&& cd $HOME \
&& rm -rf /var/lib/apt/lists/* \
&& rm -rf /tmp/tmp* \
&& apt-get clean

WORKDIR /

# The ENV variables declared below are changed in the previous section
# Grouping these ENV variables in the first section causes
# ompi_info to fail. This is only observed in CPU containers
ENV PATH="$PATH:/home/.openmpi/bin"
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/.openmpi/lib/"
RUN ompi_info --parsable --all | grep mpi_built_with_cuda_support:value

RUN mkdir -p /etc/pki/tls/certs && cp /etc/ssl/certs/ca-certificates.crt /etc/pki/tls/certs/ca-bundle.crt

# Copy workaround script for incorrect hostname
COPY changehostname.c /
COPY --chmod=755 start_with_right_hostname.sh deep_learning_container.py /usr/local/bin/

RUN HOME_DIR=/root \
&& curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
&& unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
&& cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
&& chmod +x /usr/local/bin/testOSSCompliance \
&& chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
&& ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
&& rm -rf ${HOME_DIR}/oss_compliance* \
&& rm -rf /tmp/tmp*

# Setting up APT and PIP repo for neuron artifacts
ARG NEURON_APT_REPO=apt.repos.neuron.amazonaws.com
ARG NEURON_APT_REPO_KEY

Check warning on line 159 in jax/training/0.7/Dockerfile.neuronx

View workflow job for this annotation

GitHub Actions / build (0.7, jax/training/0.7)

Sensitive data should not be used in the ARG or ENV commands

SecretsUsedInArgOrEnv: Do not use ARG or ENV instructions for sensitive data (ARG "NEURON_APT_REPO_KEY") More info: https://docs.docker.com/go/dockerfile/rule/secrets-used-in-arg-or-env/
ARG NEURON_PIP_REPO=pip.repos.neuron.amazonaws.com
ARG NEURON_PIP_REPO_KEY

Check warning on line 161 in jax/training/0.7/Dockerfile.neuronx

View workflow job for this annotation

GitHub Actions / build (0.7, jax/training/0.7)

Sensitive data should not be used in the ARG or ENV commands

SecretsUsedInArgOrEnv: Do not use ARG or ENV instructions for sensitive data (ARG "NEURON_PIP_REPO_KEY") More info: https://docs.docker.com/go/dockerfile/rule/secrets-used-in-arg-or-env/
RUN mkdir -p /etc/apt/keyrings \
&& APT_REPO_PREFIX=$([ -n "${NEURON_APT_REPO_KEY}" ] && echo "${NEURON_APT_REPO_KEY}@" || echo "") \
&& echo "deb [signed-by=/etc/apt/keyrings/neuron.gpg] https://${APT_REPO_PREFIX}${NEURON_APT_REPO} jammy main" > /etc/apt/sources.list.d/neuron.list \
&& curl $([ -n "${NEURON_APT_REPO_KEY}" ] && echo "-u ${NEURON_APT_REPO_KEY}") --retry 3 --retry-delay 1 --retry-all-errors -fSL "https://${NEURON_APT_REPO}/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB" | gpg --dearmor > /etc/apt/keyrings/neuron.gpg

# Neuron SDK components version numbers
ARG NEURONX_RUNTIME_LIB_VERSION=2.29.40.0-f954cd7a5
ARG NEURONX_COLLECTIVES_LIB_VERSION=2.29.41.0-681fef5f5
ARG NEURONX_TOOLS_VERSION=2.27.33.0-5d9c0b901
ARG NEURONX_CC_VERSION=2.22.12471.0+b4a00d10
ARG NEURONX_JAX_TRAINING_VERSION=0.7.0.1.0.7377+5e6a4049

FROM base AS repo

# Install Neuron components from the apt and pip repos (latest versions)
RUN apt-get update \
&& apt-get install -y \
aws-neuronx-tools \
aws-neuronx-collectives \
aws-neuronx-runtime-lib \
&& rm -rf /var/lib/apt/lists/* \
&& rm -rf /tmp/tmp* \
&& apt-get clean

RUN PIP_REPO_URL=$([ -n "${NEURON_PIP_REPO_KEY}" ] && echo "https://${NEURON_PIP_REPO_KEY}@${NEURON_PIP_REPO}" || echo "https://${NEURON_PIP_REPO}") \
&& ${PIP} install --no-cache-dir --force-reinstall \
--index-url ${PIP_REPO_URL} \
--extra-index-url ${PYPI_SIMPLE_URL} \
--trusted-host ${NEURON_PIP_REPO} \
"neuronx-cc>=2.0" \
jax-neuronx \
&& rm -rf ~/.cache/pip/*

FROM base AS prod

# Install Neuron components
# Install Neuron Driver, Runtime and Tools
RUN apt-get update \
&& apt-get install -y \
aws-neuronx-tools=$NEURONX_TOOLS_VERSION \
aws-neuronx-collectives=$NEURONX_COLLECTIVES_LIB_VERSION \
aws-neuronx-runtime-lib=$NEURONX_RUNTIME_LIB_VERSION \
&& rm -rf /var/lib/apt/lists/* \
&& rm -rf /tmp/tmp* \
&& apt-get clean

# Install JAX & Neuron CC
RUN PIP_REPO_URL=$([ -n "${NEURON_PIP_REPO_KEY}" ] && echo "https://${NEURON_PIP_REPO_KEY}@${NEURON_PIP_REPO}" || echo "https://${NEURON_PIP_REPO}") \
&& ${PIP} install --no-cache-dir --force-reinstall \
--index-url ${PIP_REPO_URL} \
--trusted-host ${NEURON_PIP_REPO} \
--extra-index-url ${PYPI_SIMPLE_URL} \
neuronx-cc==$NEURONX_CC_VERSION \
jax-neuronx==$NEURONX_JAX_TRAINING_VERSION \
&& rm -rf ~/.cache/pip/*

FROM ${BUILD_STAGE} AS final

# Starts framework
ENTRYPOINT ["bash", "-m", "start_with_right_hostname.sh"]
CMD ["/bin/bash"]

HEALTHCHECK CMD curl --fail http://localhost:8080/ping || exit 1
Loading
Loading