From bc8f4b846a51fe4ec5a2ebd8423fd6028557de04 Mon Sep 17 00:00:00 2001 From: Mosout Date: Wed, 18 Dec 2024 10:59:43 +0800 Subject: [PATCH 01/18] refine --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 39c7266..88673df 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -43,7 +43,7 @@ jobs: build_test: name: Build oneflow and backend - runs-on: [self-hosted, linux, provision] + runs-on: [self-hosted, linux, gpu] needs: [cancel_previous] if: github.event_name == 'release' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || (github.event.pull_request.draft == false && github.base_ref == 'main' && contains(github.event.pull_request.requested_reviewers.*.login, 'oneflow-ci-bot')) outputs: From f9724a39369af96af0ad6e38ca0dca51ad3f02ce Mon Sep 17 00:00:00 2001 From: Mosout Date: Wed, 18 Dec 2024 14:56:42 +0800 Subject: [PATCH 02/18] fix --- ci/build/backend.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ci/build/backend.sh b/ci/build/backend.sh index 23125bb..f38527b 100644 --- a/ci/build/backend.sh +++ b/ci/build/backend.sh @@ -2,8 +2,8 @@ set -euxo pipefail # build oneflow-backend -git config --global http.proxy ${HTTP_PROXY} -git config --global https.proxy ${HTTP_PROXY} +# git config --global http.proxy ${HTTP_PROXY} +# git config --global https.proxy ${HTTP_PROXY} mkdir -p build cd build @@ -11,5 +11,7 @@ cmake -DCMAKE_PREFIX_PATH=$ONEFLOW_CI_BUILD_DIR/liboneflow_cpp/share \ -DTRITON_RELATED_REPO_TAG=r$TRITON_VERSION \ -DTRITON_ENABLE_GPU=ON \ -DTHIRD_PARTY_MIRROR=aliyun \ + -DBUILD_ONEFLOW_BACKEND=ON \ + -DBUILD_ONEFLOW_LITE_BACKEND=OFF \ -G Ninja .. ninja -j8 From d2a5e797caede043cb4e2724cefdd99ba3e1311b Mon Sep 17 00:00:00 2001 From: Mosout Date: Wed, 18 Dec 2024 16:54:12 +0800 Subject: [PATCH 03/18] revert --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 88673df..39c7266 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -43,7 +43,7 @@ jobs: build_test: name: Build oneflow and backend - runs-on: [self-hosted, linux, gpu] + runs-on: [self-hosted, linux, provision] needs: [cancel_previous] if: github.event_name == 'release' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || (github.event.pull_request.draft == false && github.base_ref == 'main' && contains(github.event.pull_request.requested_reviewers.*.login, 'oneflow-ci-bot')) outputs: From fcf4a503092c04054e55a0072a514bc6afd906f3 Mon Sep 17 00:00:00 2001 From: Mosout Date: Wed, 18 Dec 2024 17:00:18 +0800 Subject: [PATCH 04/18] refine --- .github/workflows/test.yml | 102 ++++++++++++++++++------------------- 1 file changed, 51 insertions(+), 51 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 39c7266..b2bda73 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -144,17 +144,17 @@ jobs: docker run $extra_docker_args ${{ env.image_tag }} sleep 3600 docker exec -w $(pwd) ${{ env.container_name }} pip3 install -r ./ci/test/requirement.txt -i https://pypi.tuna.tsinghua.edu.cn/simple docker exec -w $(pwd) ${{ env.container_name }} bash ./ci/test/run_tests.sh - - name: Login to ACR with the AccessKey pair - uses: aliyun/acr-login@v1 - with: - login-server: https://registry.${{ env.REGION_ID }}.aliyuncs.com - username: "${{ secrets.ACR_USERNAME }}" - password: "${{ secrets.ACR_PASSWORD }}" - - name: Docker push to ACR - if: github.event_name == 'release' || github.event_name == 'workflow_dispatch' || github.event_name == 'schedule' - run: | - docker tag ${{ env.image_tag }} ${{ env.acr_image_tag }} - docker push ${{ env.acr_image_tag }} + # - name: Login to ACR with the AccessKey pair + # uses: aliyun/acr-login@v1 + # with: + # login-server: https://registry.${{ env.REGION_ID }}.aliyuncs.com + # username: "${{ secrets.ACR_USERNAME }}" + # password: "${{ secrets.ACR_PASSWORD }}" + # - name: Docker push to ACR + # if: github.event_name == 'release' || github.event_name == 'workflow_dispatch' || github.event_name == 'schedule' + # run: | + # docker tag ${{ env.image_tag }} ${{ env.acr_image_tag }} + # docker push ${{ env.acr_image_tag }} - name: Remove container run: docker container rm -f ${{ env.container_name }} - name: Remove image @@ -174,43 +174,43 @@ jobs: echo "::set-output name=image_tag::${{ env.image_tag }}" echo "::set-output name=acr_image_tag::${{ env.acr_image_tag }}" - docker_push: - name: Push to docker hub - runs-on: ubuntu-latest - needs: [build_test] - if: github.event_name == 'release'|| github.event_name == 'workflow_dispatch' || github.event_name == 'schedule' - steps: - - name: Set environment variables - run: | - echo "image_tag=oneflowinc/${{ needs.build_test.outputs.image_tag }}" >> $GITHUB_ENV - echo "acr_image_tag=${{ needs.build_test.outputs.acr_image_tag }}" >> $GITHUB_ENV - - name: Output environment variables - run: | - echo ${{ env.acr_image_tag }} - echo ${{ env.image_tag }} - - name: Login to ACR with the AccessKey pair - uses: aliyun/acr-login@v1 - with: - login-server: https://registry.${{ env.REGION_ID }}.aliyuncs.com - username: "${{ secrets.ACR_USERNAME }}" - password: "${{ secrets.ACR_PASSWORD }}" - - name: Login to docker hub - uses: docker/login-action@v1 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_TOKEN }} - - name: Pull, tag and push - run: | - docker pull ${{ env.acr_image_tag }} - docker tag ${{ env.acr_image_tag }} ${{ env.image_tag }} - docker push ${{ env.image_tag }} - - name: Pull, tag and push - run: | - docker pull ${{ env.acr_image_tag }} - docker tag ${{ env.acr_image_tag }} ${{ env.image_tag }} - docker push ${{ env.image_tag }} - - name: Push latest - if: github.event_name == 'release' || github.event_name == 'workflow_dispatch' - run: | - docker tag ${{ env.acr_image_tag }} oneflowinc/${{ env.SERVING_IMAGE }}:latest - docker push oneflowinc/${{ env.SERVING_IMAGE }}:latest + # docker_push: + # name: Push to docker hub + # runs-on: ubuntu-latest + # needs: [build_test] + # if: github.event_name == 'release'|| github.event_name == 'workflow_dispatch' || github.event_name == 'schedule' + # steps: + # - name: Set environment variables + # run: | + # echo "image_tag=oneflowinc/${{ needs.build_test.outputs.image_tag }}" >> $GITHUB_ENV + # echo "acr_image_tag=${{ needs.build_test.outputs.acr_image_tag }}" >> $GITHUB_ENV + # - name: Output environment variables + # run: | + # echo ${{ env.acr_image_tag }} + # echo ${{ env.image_tag }} + # - name: Login to ACR with the AccessKey pair + # uses: aliyun/acr-login@v1 + # with: + # login-server: https://registry.${{ env.REGION_ID }}.aliyuncs.com + # username: "${{ secrets.ACR_USERNAME }}" + # password: "${{ secrets.ACR_PASSWORD }}" + # - name: Login to docker hub + # uses: docker/login-action@v1 + # with: + # username: ${{ secrets.DOCKERHUB_USERNAME }} + # password: ${{ secrets.DOCKERHUB_TOKEN }} + # - name: Pull, tag and push + # run: | + # docker pull ${{ env.acr_image_tag }} + # docker tag ${{ env.acr_image_tag }} ${{ env.image_tag }} + # docker push ${{ env.image_tag }} + # - name: Pull, tag and push + # run: | + # docker pull ${{ env.acr_image_tag }} + # docker tag ${{ env.acr_image_tag }} ${{ env.image_tag }} + # docker push ${{ env.image_tag }} + # - name: Push latest + # if: github.event_name == 'release' || github.event_name == 'workflow_dispatch' + # run: | + # docker tag ${{ env.acr_image_tag }} oneflowinc/${{ env.SERVING_IMAGE }}:latest + # docker push oneflowinc/${{ env.SERVING_IMAGE }}:latest From 9e7c4f0461c94e6c2ddc32e0972f2c871a1efbaf Mon Sep 17 00:00:00 2001 From: Mosout Date: Thu, 19 Dec 2024 11:01:08 +0800 Subject: [PATCH 05/18] fix --- docker/Dockerfile | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 8bf3b9d..dd16cbc 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -42,10 +42,15 @@ RUN sed -i 's/archive.ubuntu.com/mirrors.ustc.edu.cn/g' /etc/apt/sources.list && pip3 install cmake ENV DCGM_VERSION 2.2.9 +# # Install DCGM. Steps from https://developer.nvidia.com/dcgm#Downloads +# RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin && \ +# mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600 && \ +# apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub && \ +# add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/ /" && \ +# apt-get update && apt-get install -y datacenter-gpu-manager=1:2.2.9 # Install DCGM. Steps from https://developer.nvidia.com/dcgm#Downloads -RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin && \ - mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600 && \ - apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub && \ +RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb && \ + dpkg -i cuda-keyring_1.0-1_all.deb && \ add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/ /" && \ apt-get update && apt-get install -y datacenter-gpu-manager=1:2.2.9 From aae38c78210c310b06255603baa909867b6c6bcc Mon Sep 17 00:00:00 2001 From: Mosout Date: Thu, 19 Dec 2024 14:45:32 +0800 Subject: [PATCH 06/18] try for_cuda12 --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index b2bda73..061e4c6 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -56,7 +56,7 @@ jobs: - uses: actions/checkout@v2 - name: Clone oneflow run: | - git clone https://github.com/Oneflow-Inc/oneflow --depth=1 + git clone https://github.com/Oneflow-Inc/oneflow --depth=1 --branch for_cuda12 - name: Set environment variables run: | set -x From 6cb802a5945eaf03d77c829ce39c01c613f9445a Mon Sep 17 00:00:00 2001 From: Mosout Date: Thu, 19 Dec 2024 16:55:17 +0800 Subject: [PATCH 07/18] refine --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 061e4c6..b2bda73 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -56,7 +56,7 @@ jobs: - uses: actions/checkout@v2 - name: Clone oneflow run: | - git clone https://github.com/Oneflow-Inc/oneflow --depth=1 --branch for_cuda12 + git clone https://github.com/Oneflow-Inc/oneflow --depth=1 - name: Set environment variables run: | set -x From cbc9f9be4be5f95e665f86e98114a5247eb8e83f Mon Sep 17 00:00:00 2001 From: Mosout Date: Fri, 20 Dec 2024 09:58:56 +0800 Subject: [PATCH 08/18] try --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index b2bda73..7e91b17 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -56,7 +56,7 @@ jobs: - uses: actions/checkout@v2 - name: Clone oneflow run: | - git clone https://github.com/Oneflow-Inc/oneflow --depth=1 + git clone https://github.com/Oneflow-Inc/oneflow --depth=1 --branch for_serving_test - name: Set environment variables run: | set -x From 953e211d9c09e8a9eedb55e5b30a66150e5064b3 Mon Sep 17 00:00:00 2001 From: Mosout Date: Fri, 20 Dec 2024 11:40:27 +0800 Subject: [PATCH 09/18] fix --- ci/test/requirement.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/test/requirement.txt b/ci/test/requirement.txt index 0e915d9..d3b245e 100644 --- a/ci/test/requirement.txt +++ b/ci/test/requirement.txt @@ -1,2 +1,2 @@ -tritonclient[all] +tritonclient Jinja2 \ No newline at end of file From 4713102e6ea55efdf0aec78e831f6d69a089322c Mon Sep 17 00:00:00 2001 From: Mosout Date: Mon, 23 Dec 2024 11:46:42 +0800 Subject: [PATCH 10/18] fix --- ci/test/requirement.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/test/requirement.txt b/ci/test/requirement.txt index d3b245e..465abad 100644 --- a/ci/test/requirement.txt +++ b/ci/test/requirement.txt @@ -1,2 +1,2 @@ -tritonclient +tritonclient[all]==2.26.0 Jinja2 \ No newline at end of file From 22be79c0462515e9248ab09f1b82e52d7251bd03 Mon Sep 17 00:00:00 2001 From: mosout Date: Tue, 24 Dec 2024 11:09:34 +0800 Subject: [PATCH 11/18] refine --- ci/build/oneflow-serving.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ci/build/oneflow-serving.py b/ci/build/oneflow-serving.py index 3a54dcb..59cfca6 100755 --- a/ci/build/oneflow-serving.py +++ b/ci/build/oneflow-serving.py @@ -102,7 +102,8 @@ def __init__(self) -> None: def prepare(self): self._parse() - self._unknown.extend(['--disable-auto-complete-config']) + # self._unknown.extend(['--disable-auto-complete-config']) + self._unknown.extend(['--strict-model-config', 'false']) self._unknown_split = [] for argument in self._unknown: self._unknown_split.extend(argument.split('=')) From cf39384de4bad7a5abd368362b5a1ba689c5bad8 Mon Sep 17 00:00:00 2001 From: Mosout Date: Wed, 25 Dec 2024 11:45:41 +0800 Subject: [PATCH 12/18] refine --- test/test_resnet50/test.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_resnet50/test.sh b/test/test_resnet50/test.sh index 0c2f759..d15f308 100755 --- a/test/test_resnet50/test.sh +++ b/test/test_resnet50/test.sh @@ -28,8 +28,8 @@ fi echo "running resnet50 basic test" python3 ../common/test_model.py --model resnet50 --target-output ../common/resnet50_output.npy -echo "running resnet50 batching test" -python3 ../common/test_model.py --model resnet50_batching --target-output ../common/resnet50_output.npy +# echo "running resnet50 batching test" +# python3 ../common/test_model.py --model resnet50_batching --target-output ../common/resnet50_output.npy kill $SERVER_PID From 1b2abc2cf50f5f33fd0698af6fd291c589f45fed Mon Sep 17 00:00:00 2001 From: Mosout Date: Thu, 10 Apr 2025 10:36:02 +0800 Subject: [PATCH 13/18] refine --- test/test_resnet50/test.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/test/test_resnet50/test.sh b/test/test_resnet50/test.sh index d15f308..92dfcdd 100755 --- a/test/test_resnet50/test.sh +++ b/test/test_resnet50/test.sh @@ -31,7 +31,6 @@ python3 ../common/test_model.py --model resnet50 --target-output ../common/resne # echo "running resnet50 batching test" # python3 ../common/test_model.py --model resnet50_batching --target-output ../common/resnet50_output.npy - kill $SERVER_PID wait $SERVER_PID From a452db7a0304325883cf35b11d5d78dda222b25e Mon Sep 17 00:00:00 2001 From: Mosout Date: Thu, 10 Apr 2025 15:18:41 +0800 Subject: [PATCH 14/18] add lite test --- test/test_lite/test.sh | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 test/test_lite/test.sh diff --git a/test/test_lite/test.sh b/test/test_lite/test.sh new file mode 100644 index 0000000..8b82c72 --- /dev/null +++ b/test/test_lite/test.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash +set -euxo pipefail + +export CUDA_VISIBLE_DEVICES=0 + +rm -rf ./models +mkdir -p models/resnet50/1 +cp -r ../common/model models/resnet50/1/ + +# generate minimal config.pbtxt +echo "name: \"resnet50\"" >> models/resnet50/config.pbtxt +echo "backend: \"oneflow_lite\"" >> models/resnet50/config.pbtxt + +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=1 --strict-model-config false" +SERVER_LOG="./inference_server.log" +source ../common/util.sh + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +echo "running resnet50 basic test with oneflow lite backend" +python3 ../common/test_model.py --model resnet50 --target-output ../common/resnet50_output.npy + +kill $SERVER_PID +wait $SERVER_PID + +exit 0 From 8e632f6058b7ff70f3e101242ffa41fb747230b4 Mon Sep 17 00:00:00 2001 From: Mosout Date: Thu, 10 Apr 2025 18:58:05 +0800 Subject: [PATCH 15/18] not use cache --- ci/build/oneflow.sh | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/ci/build/oneflow.sh b/ci/build/oneflow.sh index 87a5513..1baaa56 100644 --- a/ci/build/oneflow.sh +++ b/ci/build/oneflow.sh @@ -30,9 +30,10 @@ else if [ "$oneflow_head_built" != "$oneflow_head" ]; then build_oneflow else - cached_whl=$(ls $WHEELHOUSE_DIR) - python3 -m pip install $WHEELHOUSE_DIR/$cached_whl - > $export_pythonpath_script - echo "Use build cache for oneflow." + # cached_whl=$(ls $WHEELHOUSE_DIR) + # python3 -m pip install $WHEELHOUSE_DIR/$cached_whl + # > $export_pythonpath_script + # echo "Use build cache for oneflow." + build_oneflow fi fi From 82ce333a6794213bcd0f3d0034353a6b454f1000 Mon Sep 17 00:00:00 2001 From: Mosout Date: Thu, 10 Apr 2025 20:04:27 +0800 Subject: [PATCH 16/18] chmod --- test/test_lite/test.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 test/test_lite/test.sh diff --git a/test/test_lite/test.sh b/test/test_lite/test.sh old mode 100644 new mode 100755 From ec8ca338a384137c76609b43f6c76770b30f478a Mon Sep 17 00:00:00 2001 From: Mosout Date: Thu, 10 Apr 2025 21:42:10 +0800 Subject: [PATCH 17/18] try --- ci/build/backend.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/build/backend.sh b/ci/build/backend.sh index f38527b..77ef000 100644 --- a/ci/build/backend.sh +++ b/ci/build/backend.sh @@ -12,6 +12,6 @@ cmake -DCMAKE_PREFIX_PATH=$ONEFLOW_CI_BUILD_DIR/liboneflow_cpp/share \ -DTRITON_ENABLE_GPU=ON \ -DTHIRD_PARTY_MIRROR=aliyun \ -DBUILD_ONEFLOW_BACKEND=ON \ - -DBUILD_ONEFLOW_LITE_BACKEND=OFF \ + -DBUILD_ONEFLOW_LITE_BACKEND=ON \ -G Ninja .. ninja -j8 From a6dd0b9d0b3f7fd5a55f7d3857e9e325bbdeaae5 Mon Sep 17 00:00:00 2001 From: Mosout Date: Thu, 10 Apr 2025 22:20:45 +0800 Subject: [PATCH 18/18] revert --- ci/build/backend.sh | 2 +- test/test_lite/test.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/build/backend.sh b/ci/build/backend.sh index 77ef000..f38527b 100644 --- a/ci/build/backend.sh +++ b/ci/build/backend.sh @@ -12,6 +12,6 @@ cmake -DCMAKE_PREFIX_PATH=$ONEFLOW_CI_BUILD_DIR/liboneflow_cpp/share \ -DTRITON_ENABLE_GPU=ON \ -DTHIRD_PARTY_MIRROR=aliyun \ -DBUILD_ONEFLOW_BACKEND=ON \ - -DBUILD_ONEFLOW_LITE_BACKEND=ON \ + -DBUILD_ONEFLOW_LITE_BACKEND=OFF \ -G Ninja .. ninja -j8 diff --git a/test/test_lite/test.sh b/test/test_lite/test.sh index 8b82c72..9d9355c 100755 --- a/test/test_lite/test.sh +++ b/test/test_lite/test.sh @@ -9,7 +9,7 @@ cp -r ../common/model models/resnet50/1/ # generate minimal config.pbtxt echo "name: \"resnet50\"" >> models/resnet50/config.pbtxt -echo "backend: \"oneflow_lite\"" >> models/resnet50/config.pbtxt +echo "backend: \"oneflow\"" >> models/resnet50/config.pbtxt SERVER=/opt/tritonserver/bin/tritonserver SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=1 --strict-model-config false"