From 443838994fd6a69cfb12b2732085e46ccfa314e5 Mon Sep 17 00:00:00 2001 From: itono-yuichiro Date: Thu, 5 Mar 2026 19:03:19 +0900 Subject: [PATCH] Pytorch v2.9.1 --- README.md | 190 ++++- llvm19.1.4/patch/numpy.patch | 102 +++ llvm19.1.4/patch/pytorch.patch | 1133 +++++++++++++++++++++++++++++ llvm19.1.4/patch/tensorpipe.patch | 13 + llvm21.1.0/patch/numpy.patch | 102 +++ llvm21.1.0/patch/pytorch.patch | 997 +++++++++++++++++++++++++ llvm21.1.0/patch/tensorpipe.patch | 13 + run/mnist.py | 145 ++++ run/run1proc_mnist.sh | 36 + 9 files changed, 2730 insertions(+), 1 deletion(-) create mode 100644 llvm19.1.4/patch/numpy.patch create mode 100644 llvm19.1.4/patch/pytorch.patch create mode 100644 llvm19.1.4/patch/tensorpipe.patch create mode 100644 llvm21.1.0/patch/numpy.patch create mode 100644 llvm21.1.0/patch/pytorch.patch create mode 100644 llvm21.1.0/patch/tensorpipe.patch create mode 100644 run/mnist.py create mode 100644 run/run1proc_mnist.sh diff --git a/README.md b/README.md index 11d0608..a30ba12 100644 --- a/README.md +++ b/README.md @@ -1 +1,189 @@ -# rccs-pytorch \ No newline at end of file +# rccs-pytorch + +## はじめに + +本書では、「富岳」におけるAIフレームワークPyTorch v2.9系のビルド手順および標準的なテストデータ(mnist)を用いた動作確認の手順について述べる。 + +## AIプレームワークPyTorchのバージョンアップ + +### PyTorchおよび主要モジュールの版数 + +ビルド対象であるPyTorchおよび主要モジュールの版数を示す。本作業では、Python v3.10、PyTorch v2.9.1、Numpy v1.22.4、Scipy v1.10.1、OneDNN v3.7.1、Horovod v0.26.1を採用することとした。 + +| モジュール名 | 版数 | +| --- | --- | +| Python | v3.10 | +| PyTorch | v2.9.1 | +| Numpy | v1.22.4 | +| Scipy | v1.10.3 | +| oneDNN | v3.7.1 | +|Horovod | v0.26.1 | + +### ビルド環境の整備 + +Pytorch v2.9.1の「富岳」向けビルドでは、富士通Githubで公開されている” 富士通 Supercomputer PRIMEHPC FX1000/FX700 上の PyTorch 構築手順”から入手可能なPytorch v1.13.1向けのビルド用スクリプトを利用する。言語環境としては、「富岳」にインストールされているllvm-v21.1.0を用いた。なお、現行の富士通製コンパイラはPytorch v2.9.1をビルドするために必要なC++言語規格要件を満たさない。 + +#### (1) 富士通GithubからPyTorchをクローンする。 + +``` +$ git clone https://github.com/fujitsu/pytorch.git +``` + +#### (2) 
pytorch/ディレクトリへ移動し、公式PyTorchのリポジトリをリモートとして登録する。

```
$ PYTORCH_TOP=$(cd $(dirname ${BASH_SOURCE:-$0})/pytorch && pwd)
$ PATCH_DIR=$(cd $(dirname ${BASH_SOURCE:-$0})/patch && pwd)
$ cd ${PYTORCH_TOP}
$ git remote add upstream https://github.com/pytorch/pytorch.git
$ git fetch upstream v2.9.1
```

#### (3) 公式v2.9.1をベースに新しいブランチを作成する。

```
$ git checkout -b r2.9.1_for_a64fx FETCH_HEAD
```

#### (4) 富士通PyTorch v1.13.1から、ビルド用スクリプト一式を取り込む。

```
$ git cherry-pick 17afed104f0a2ac47bab78aebf584fb3c578e707
$ git reset --mixed HEAD^
$ git add scripts/fujitsu --all
$ git commit -m "add scripts/fujitsu"
```

#### (5) pytorchに対するパッチを適用し、numpyおよびtensorpipeに対するパッチを所定のディレクトリに置く。
```
$ cd ${PYTORCH_TOP} && patch -p 1 < ${PATCH_DIR}/pytorch.patch
$ cp ${PATCH_DIR}/numpy.patch ${PYTORCH_TOP}/scripts/fujitsu
$ cp ${PATCH_DIR}/tensorpipe.patch ${PYTORCH_TOP}/scripts/fujitsu
```


### ビルド手順
ビルド環境の整備後、計算ノード上にて以下のように実行する。なお、すべてのscriptを実行するのには15時間程度を要する。
```
$ cd ${PYTORCH_TOP}/scripts/fujitsu
$ . ./env.src
$ bash 1_python.sh
$ bash 3_venv.sh
$ bash 4_numpy_scipy.sh
$ bash 5_pytorch.sh
$ bash 6_vision.sh
$ bash 7_horovod.sh
$ bash 8_libtcmalloc.sh
```

ビルド用のスクリプトの実行後に出力されるpip3 list(pip3_list.txt)の内容を示す。
```
Package Version
------------------ ------------------
beniget 0.4.2.post1
build 1.4.0
certifi 2026.1.4
cffi 2.0.0
charset-normalizer 3.4.4
cloudpickle 3.1.2
cmake 4.2.1
Cython 0.29.37
exceptiongroup 1.3.1
expecttest 0.3.0
filelock 3.20.3
fsspec 2026.1.0
gast 0.6.0
horovod 0.26.1
hypothesis 6.151.4
idna 3.11
iniconfig 2.3.0
Jinja2 3.1.6
lintrunner 0.13.0
MarkupSafe 3.0.3
mpmath 1.3.0
networkx 3.4.2
ninja 1.13.0
numpy 1.22.4
optree 0.18.0
packaging 26.0
Pillow 8.4.0
pip 25.3
pluggy 1.6.0
ply 3.11
psutil 7.2.2
pybind11 3.0.1
pycparser 3.0
Pygments 2.19.2
pyproject_hooks 1.2.0
pytest 9.0.2
pythran 0.18.1
PyYAML 6.0.3
requests 2.32.5
SciPy 1.10.1
setuptools 73.0.1
six 1.17.0
sortedcontainers 2.4.0
sympy 1.14.0
tomli 2.4.0
torch 2.9.1a0+gitcdd1b45
torchvision 0.24.1+d801a34
typing_extensions 4.15.0
urllib3 2.6.3
uv 0.9.28
wheel 0.46.3
```

### 標準的なテストデータ(mnist)を用いた動作確認

ビルドしたPyTorch v2.9.1の動作確認では、機械学習の画像認識の学習においてサンプルデータ
としてよく利用される「mnist」を用いた。
mnistを実行するコードは公式PyTorchのgithubのexamplesから入手した。
(https://github.com/pytorch/examples/blob/main/mnist/main.py)
また、mnistのコードを実行するスクリプトにはscripts/fujitsu/run1proc.shを流用した。

#### mnistの実行環境の構築

run/ディレクトリに格納されている以下の2つのファイルをscripts/fujitsu/配下にコピーする。
- mnist.py
- run1proc_mnist.sh

#### mnistの実行
mnistをジョブ実行する。
```
$ cd ${PYTORCH_TOP}/scripts/fujitsu
$ pjsub ./run1proc_mnist.sh
```

以下の出力によりmnistがPyTorch v2.9.1で正常に動作していることを確認した。

```
Train Epoch: 1 [0/60000 (0%)] Loss: 2.329474
Train Epoch: 1 [640/60000 (1%)] Loss: 1.425025
Train Epoch: 1 [1280/60000 (2%)] Loss: 0.797880
Train Epoch: 1 [1920/60000 (3%)] Loss: 0.536055
Train Epoch: 1 [2560/60000 (4%)] Loss: 0.444745
Train Epoch: 1 [3200/60000 (5%)] Loss: 0.262757
 :
Train Epoch: 
1 [56960/60000 (95%)] Loss: 0.050381 +Train Epoch: 1 [57600/60000 (96%)] Loss: 0.137881 +Train Epoch: 1 [58240/60000 (97%)] Loss: 0.006410 +Train Epoch: 1 [58880/60000 (98%)] Loss: 0.003386 +Train Epoch: 1 [59520/60000 (99%)] Loss: 0.002083 + +Test set: Average loss: 0.0497, Accuracy: 9830/10000 (98%) + +Train Epoch: 2 [0/60000 (0%)] Loss: 0.026067 +Train Epoch: 2 [640/60000 (1%)] Loss: 0.045588 +Train Epoch: 2 [1280/60000 (2%)] Loss: 0.069181 +Train Epoch: 2 [1920/60000 (3%)] Loss: 0.178524 +Train Epoch: 2 [2560/60000 (4%)] Loss: 0.084490 +Train Epoch: 2 [3200/60000 (5%)] Loss: 0.047848 + : +Train Epoch: 2 [56960/60000 (95%)] Loss: 0.038513 +Train Epoch: 2 [57600/60000 (96%)] Loss: 0.112719 +Train Epoch: 2 [58240/60000 (97%)] Loss: 0.022632 +Train Epoch: 2 [58880/60000 (98%)] Loss: 0.009396 +Train Epoch: 2 [59520/60000 (99%)] Loss: 0.002736 + +Test set: Average loss: 0.0375, Accuracy: 9877/10000 (99%) +``` diff --git a/llvm19.1.4/patch/numpy.patch b/llvm19.1.4/patch/numpy.patch new file mode 100644 index 0000000..7a5ce4e --- /dev/null +++ b/llvm19.1.4/patch/numpy.patch @@ -0,0 +1,102 @@ +diff --git a/numpy/distutils/fcompiler/__init__.py b/numpy/distutils/fcompiler/__init__.py +index d8dcfa8..ebe0647 100644 +--- a/numpy/distutils/fcompiler/__init__.py ++++ b/numpy/distutils/fcompiler/__init__.py +@@ -745,7 +745,7 @@ def wrap_unlinkable_objects(self, objects, output_dir, extra_dll_dir): + ('cygwin.*', ('gnu', 'intelv', 'absoft', 'compaqv', 'intelev', 'gnu95', 'g95')), + ('linux.*', ('arm', 'gnu95', 'intel', 'lahey', 'pg', 'nv', 'absoft', 'nag', + 'vast', 'compaq', 'intele', 'intelem', 'gnu', 'g95', +- 'pathf95', 'nagfor', 'fujitsu')), ++ 'pathf95', 'nagfor', 'fujitsu', 'llvm')), + ('darwin.*', ('gnu95', 'nag', 'nagfor', 'absoft', 'ibm', 'intel', 'gnu', + 'g95', 'pg')), + ('sunos.*', ('sun', 'gnu', 'gnu95', 'g95')), +diff --git a/numpy/distutils/fcompiler/llvm.py b/numpy/distutils/fcompiler/llvm.py +new file mode 100644 +index 0000000..f3db492 +--- /dev/null ++++ 
b/numpy/distutils/fcompiler/llvm.py +@@ -0,0 +1,71 @@ ++from __future__ import division, absolute_import, print_function ++ ++import sys ++ ++from numpy.distutils.fcompiler import FCompiler, dummy_fortran_file ++from sys import platform ++from os.path import join, dirname, normpath ++ ++compilers = ['LlvmFlangFCompiler'] ++ ++import functools ++ ++class LlvmFlangFCompiler(FCompiler): ++ compiler_type = 'llvm' ++ description = 'LLVM Fortran Compiler' ++ version_pattern = r'\s*flang.*version (?P[\d.-]+).*' ++ ++ possible_executables = ['flang'] ++ ++ executables = { ++ 'version_cmd': ["", "--version"], ++ 'compiler_f77': ["flang", "-fPIC"], ++ 'compiler_fix': ["flang", "-fPIC", "-ffixed-form"], ++ 'compiler_f90': ["flang", "-fPIC"], ++ 'linker_so': ["flang", "-fPIC", "-shared"], ++ 'archiver': ["ar", "-cr"], ++ 'ranlib': None ++ } ++ ++ pic_flags = ["-fPIC", "-DPIC"] ++ c_compiler = 'clang' ++ module_dir_switch = '-module ' # Don't remove ending space! ++ ++ def get_libraries(self): ++ opt = FCompiler.get_libraries(self) ++ return opt ++ ++ @functools.lru_cache(maxsize=128) ++ def get_library_dirs(self): ++ """List of compiler library directories.""" ++ opt = FCompiler.get_library_dirs(self) ++ flang_dir = dirname(self.executables['compiler_f77'][0]) ++ opt.append(normpath(join(flang_dir, '..', 'lib'))) ++ ++ return opt ++ ++ def get_flags(self): ++ return [] ++ ++ def get_flags_free(self): ++ return [] ++ ++ def get_flags_debug(self): ++ return ['-g'] ++ ++ def get_flags_opt(self): ++ return ['-O3'] ++ ++ def get_flags_arch(self): ++ return [] ++ ++ def runtime_library_dir_option(self, dir): ++ return '-Wl,-rpath=%s' % dir ++ ++ ++if __name__ == '__main__': ++ from distutils import log ++ log.set_verbosity(2) ++ from numpy.distutils import customized_fcompiler ++ print(customized_fcompiler(compiler='llvm').get_version()) ++ +diff --git a/numpy/tests/test_public_api.py b/numpy/tests/test_public_api.py +index bb15e10..9369424 100644 +--- 
a/numpy/tests/test_public_api.py ++++ b/numpy/tests/test_public_api.py +@@ -233,6 +233,7 @@ def test_NPY_NO_EXPORT(): + "distutils.fcompiler.sun", + "distutils.fcompiler.vast", + "distutils.fcompiler.fujitsu", ++ "distutils.fcompiler.llvm", + "distutils.from_template", + "distutils.intelccompiler", + "distutils.lib2def", diff --git a/llvm19.1.4/patch/pytorch.patch b/llvm19.1.4/patch/pytorch.patch new file mode 100644 index 0000000..7d9a172 --- /dev/null +++ b/llvm19.1.4/patch/pytorch.patch @@ -0,0 +1,1133 @@ +diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt +index 6ab41b6c84..61a0cd2f9e 100644 +--- a/caffe2/CMakeLists.txt ++++ b/caffe2/CMakeLists.txt +@@ -1549,7 +1549,8 @@ target_link_libraries(torch_cpu PUBLIC ${Caffe2_PUBLIC_DEPENDENCY_LIBS}) + target_link_libraries(torch_cpu PRIVATE ${Caffe2_DEPENDENCY_LIBS}) + target_link_libraries(torch_cpu PRIVATE ${Caffe2_DEPENDENCY_WHOLE_LINK_LIBS}) + if(USE_MPI) +- target_link_libraries(torch_cpu PRIVATE MPI::MPI_CXX) ++ target_link_libraries(torch_cpu PRIVATE ${MPI_CXX_LIBRARIES}) ++ target_include_directories(torch_cpu PRIVATE ${MPI_CXX_INCLUDE_PATH}) + endif() + target_include_directories(torch_cpu INTERFACE $) + target_include_directories(torch_cpu PRIVATE ${Caffe2_CPU_INCLUDE}) +@@ -1727,7 +1728,8 @@ if(BUILD_SHARED_LIBS) + endif() + set_target_properties(torch_global_deps PROPERTIES LINKER_LANGUAGE C) + if(USE_MPI) +- target_link_libraries(torch_global_deps MPI::MPI_CXX) ++ target_link_libraries(torch_global_deps ${MPI_CXX_LIBRARIES}) ++ target_include_directories(torch_global_deps PUBLIC ${MPI_CXX_INCLUDE_PATH}) + endif() + if(CAFFE2_USE_MKL) + target_link_libraries(torch_global_deps caffe2::mkl) +diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake +index ef5c2fd4e9..a266e956aa 100644 +--- a/cmake/Dependencies.cmake ++++ b/cmake/Dependencies.cmake +@@ -162,7 +162,7 @@ else() + set(AT_MKLDNN_ENABLED 0) + set(AT_MKL_ENABLED 0) + endif() +-set_property(CACHE BLAS PROPERTY STRINGS 
"ATLAS;BLIS;Eigen;FLAME;Generic;MKL;OpenBLAS;vecLib;APL") ++set_property(CACHE BLAS PROPERTY STRINGS "ATLAS;BLIS;Eigen;FLAME;Generic;MKL;OpenBLAS;vecLib;APL;SSL2") + message(STATUS "Trying to find preferred BLAS backend of choice: " ${BLAS}) + set(BLAS_CHECK_F2C 0) + +@@ -233,6 +233,20 @@ elseif(BLAS STREQUAL "FlexiBLAS") + include_directories(SYSTEM ${FlexiBLAS_INCLUDE_DIR}) + list(APPEND Caffe2_DEPENDENCY_LIBS ${FlexiBLAS_LIB}) + set(BLAS_CHECK_F2C 1) ++elseif(BLAS STREQUAL "SSL2") ++ if(CMAKE_CXX_COMPILER MATCHES ".*/clang\\+\\+$" ++ AND CMAKE_C_COMPILER MATCHES ".*/clang$") ++ message(STATUS "SSL2 Selected BLAS library") ++ list(APPEND Caffe2_PUBLIC_DEPENDENCY_LIBS "fjlapackexsve.so") ++ set(SSL2_FOUND ON) ++ message(STATUS "set CMAKE_SHARED_LINKER_FLAGS: -SSL2") ++ set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -SSL2") ++ set(WITH_BLAS "ssl2") ++ else() ++ message(STATUS "Not built using clang and clang++.") ++ message(STATUS "CMAKE_C_COMPILER: ${CMAKE_C_COMPILER}") ++ message(STATUS "CMAKE_CXX_COMPILER: ${CMAKE_CXX_COMPILER}") ++ endif() + elseif(BLAS STREQUAL "APL") + find_package(APL REQUIRED) + include_directories(SYSTEM ${APL_INCLUDE_DIR}) +diff --git a/cmake/Modules/FindARM.cmake b/cmake/Modules/FindARM.cmake +index 903025c5c2..a419c1aeed 100644 +--- a/cmake/Modules/FindARM.cmake ++++ b/cmake/Modules/FindARM.cmake +@@ -153,7 +153,7 @@ IF(CMAKE_SYSTEM_NAME MATCHES "Linux") + + # Check for SVE256 vector length + CHECK_COMPILES(CXX "SVE256" "-march=armv8.2-a+sve -msve-vector-bits=256" "${SVE_CODE}") +- CHECK_COMPILES(CXX "ARM_BF16" "-march=armv8.2-a+sve+bf16 -msve-vector-bits=256" "${ARM_BF16_CODE}") ++ #CHECK_COMPILES(CXX "ARM_BF16" "-march=armv8.2-a+sve+bf16 -msve-vector-bits=256" "${ARM_BF16_CODE}") + + # If SVE256 support is not found, set CXX_SVE_FOUND to FALSE and notify the user + if(NOT CXX_SVE256_FOUND) +diff --git a/cmake/Modules/FindBLAS.cmake b/cmake/Modules/FindBLAS.cmake +index b4b158fc49..948f7e99ad 100644 +--- 
a/cmake/Modules/FindBLAS.cmake ++++ b/cmake/Modules/FindBLAS.cmake +@@ -290,6 +290,28 @@ if((NOT BLAS_LIBRARIES) + endif() + endif() + ++# BLAS in SSL2 library? ++if((NOT BLAS_LIBRARIES) ++ AND ((NOT WITH_BLAS) OR (WITH_BLAS STREQUAL "ssl2"))) ++ if(CMAKE_CXX_COMPILER MATCHES ".*/clang\\+\\+$" ++ AND CMAKE_C_COMPILER MATCHES ".*/clang$") ++ check_fortran_libraries( ++ BLAS_LIBRARIES ++ BLAS ++ sgemm ++ "-SSL2" ++ "fjlapackexsve") ++ if (BLAS_LIBRARIES) ++ set(BLAS_INFO "ssl2") ++ set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -SSL2") ++ endif (BLAS_LIBRARIES) ++ else() ++ message(STATUS "Not built using clang and clang++.") ++ message(STATUS "CMAKE_C_COMPILER: ${CMAKE_C_COMPILER}") ++ message(STATUS "CMAKE_CXX_COMPILER: ${CMAKE_CXX_COMPILER}") ++ endif() ++endif() ++ + # Generic BLAS library? + if((NOT BLAS_LIBRARIES) + AND ((NOT WITH_BLAS) OR (WITH_BLAS STREQUAL "generic"))) +diff --git a/cmake/Modules/FindLAPACK.cmake b/cmake/Modules/FindLAPACK.cmake +index 500bec8cef..062610259e 100644 +--- a/cmake/Modules/FindLAPACK.cmake ++++ b/cmake/Modules/FindLAPACK.cmake +@@ -208,6 +208,18 @@ if(BLAS_FOUND) + endif() + endif() + ++ # SSL2 ++ if((NOT LAPACK_INFO) AND (BLAS_INFO STREQUAL "ssl2")) ++ set(CMAKE_REQUIRED_LIBRARIES ${BLAS_LIBRARIES}) ++ check_function_exists("cheev_" SSL2_LAPACK_WORKS) ++ set(CMAKE_REQUIRED_LIBRARIES) ++ if(SSL2_LAPACK_WORKS) ++ SET(LAPACK_INFO "ssl2") ++ else() ++ message(STATUS "Strangely, this SSL2 library does not support Lapack?!") ++ endif() ++ endif() ++ + # Generic LAPACK library? 
+ IF((NOT LAPACK_INFO) AND (BLAS_INFO STREQUAL "generic")) + check_lapack_libraries( +diff --git a/cmake/Modules/FindMPI.cmake b/cmake/Modules/FindMPI.cmake +new file mode 100644 +index 0000000000..8c5274199b +--- /dev/null ++++ b/cmake/Modules/FindMPI.cmake +@@ -0,0 +1,55 @@ ++if(CMAKE_C_COMPILER MATCHES ".*/clang$" AND ++ CMAKE_CXX_COMPILER MATCHES ".*/clang\\+\\+$") ++ if(DEFINED ENV{MPI_HOME}) ++ set(TCSMPI_EXEC_PATH "$ENV{MPI_HOME}/bin") ++ else() ++ string(REGEX REPLACE "/clang\\+\\+$" "" CMAKE_CXX_COMPILER_DIR "${CMAKE_CXX_COMPILER}") ++ set(TCSMPI_EXEC_PATH "${CMAKE_CXX_COMPILER_DIR}") ++ endif() ++ execute_process(COMMAND "${TCSMPI_EXEC_PATH}/mpiclang++" "--show" ++ RESULT_VARIABLE MPIFCC_EXEC_RESULT ++ OUTPUT_QUIET ++ ERROR_QUIET) ++ if(MPIFCC_EXEC_RESULT EQUAL 0) ++ message(STATUS "TCS-MPI ENABLED") ++ set(MPI_CXX_FOUND ON) ++ execute_process(COMMAND "${TCSMPI_EXEC_PATH}/mpiclang++" "--showme:compile" ++ OUTPUT_VARIABLE MPI_CXX_COMPILE_FLAGS) ++ string(REPLACE "\n" "" MPI_CXX_COMPILE_FLAGS "${MPI_CXX_COMPILE_FLAGS}") ++ execute_process(COMMAND "${TCSMPI_EXEC_PATH}/mpiclang++" "--showme:incdirs" ++ OUTPUT_VARIABLE MPI_CXX_INCLUDE_PATH) ++ string(REPLACE "\n" ";" MPI_CXX_INCLUDE_PATH "${MPI_CXX_INCLUDE_PATH}") ++ execute_process(COMMAND "${TCSMPI_EXEC_PATH}/mpiclang++" "--showme:link" ++ OUTPUT_VARIABLE MPI_CXX_LINK_FLAGS) ++ string(REPLACE "\n" "" MPI_CXX_LINK_FLAGS "${MPI_CXX_LINK_FLAGS}") ++ execute_process(COMMAND "${TCSMPI_EXEC_PATH}/mpiclang++" "--showme:libdirs" ++ OUTPUT_VARIABLE MPI_CXX_LIBRARY_DIRS) ++ string(REPLACE "\n" "" MPI_CXX_LIBRARY_DIRS "${MPI_CXX_LIBRARY_DIRS}") ++ string(REPLACE " " ";" MPI_CXX_LIBRARY_DIRS "${MPI_CXX_LIBRARY_DIRS}") ++ foreach(dir IN LISTS MPI_CXX_LIBRARY_DIRS) ++ if(dir MATCHES "llvm-v19\\.1\\.4" OR dir MATCHES "llvm-v19\\.1\\.0" OR dir MATCHES "llvm-v17\\.0\\.2") ++ set(MPI_CXX_LIBRARIES "${dir}/libmpi.so") ++ endif() ++ endforeach() ++ set(MPI_FOUND ON) ++ set(MPI_C_FOUND ON) ++ set(MPIEXEC 
"${TCSMPI_EXEC_PATH}/mpiexec") ++ set(MPI_COMPILE_FLAGS ${MPI_CXX_COMPILE_FLAGS}) ++ set(MPI_C_COMPILE_FLAGS ${MPI_CXX_COMPILE_FLAGS}) ++ set(MPI_INCLUDE_PATH ${MPI_CXX_INCLUDE_PATH}) ++ set(MPI_C_INCLUDE_PATH ${MPI_CXX_INCLUDE_PATH}) ++ set(MPI_LINK_FLAGS ${MPI_CXX_LINK_FLAGS}) ++ set(MPI_C_LINK_FLAGS ${MPI_CXX_LINK_FLAGS}) ++ set(MPI_LIBRARIES ${MPI_CXX_LIBRARIES}) ++ set(MPI_C_LIBRARIES ${MPI_CXX_LIBRARIES}) ++ else() ++ message(STATUS "TCS-MPI DISABLED") ++ endif() ++endif() ++if(NOT MPI_FOUND) ++ set(CMAKE_MODULE_PATH_TMP "${CMAKE_MODULE_PATH}") ++ unset(CMAKE_MODULE_PATH) ++ find_package(MPI) ++ set(CMAKE_MODULE_PATH "${CMAKE_MODULE_PATH_TMP}") ++ unset(CMAKE_MODULE_PATH_TMP) ++endif() +diff --git a/cmake/Modules/FindOpenMP.cmake b/cmake/Modules/FindOpenMP.cmake +index 8a9abff398..6e82b2964c 100644 +--- a/cmake/Modules/FindOpenMP.cmake ++++ b/cmake/Modules/FindOpenMP.cmake +@@ -98,6 +98,9 @@ function(_OPENMP_FLAG_CANDIDATES LANG) + # regular clang flags + set(OMP_FLAG_Clang "-fopenmp=libomp" "-fopenmp=libiomp5" "-fopenmp") + endif() ++ if(BLAS STREQUAL "SSL2") ++ set(OMP_FLAG_Clang "-Kopenmp") ++ endif() + + if(WIN32) + # Prefer Intel OpenMP header which can be provided by CMAKE_INCLUDE_PATH. +@@ -259,6 +262,29 @@ function(_OPENMP_GET_FLAGS LANG FLAG_MODE OPENMP_FLAG_VAR OPENMP_LIB_NAMES_VAR) + find_package(MKL QUIET) + unset(IN_FIND_OMP CACHE) + endif() ++ if(MKL_FOUND AND (NOT "${MKL_OPENMP_LIBRARY}" STREQUAL "")) ++ # If we already link OpenMP via MKL, use that. Otherwise at run-time ++ # OpenMP will complain about being initialized twice (OMP: Error #15), ++ # can may cause incorrect behavior. 
++ set(OpenMP_libomp_LIBRARY "${MKL_OPENMP_LIBRARY}" CACHE STRING "libomp location for OpenMP") ++ elseif(BLAS STREQUAL "SSL2") ++ try_compile( OpenMP_COMPILE_RESULT_${FLAG_MODE}_${OPENMP_PLAIN_FLAG} ${CMAKE_BINARY_DIR} ${_OPENMP_TEST_SRC} ++ CMAKE_FLAGS "-DCOMPILE_DEFINITIONS:STRING=${OPENMP_FLAGS_TEST}" ++ OUTPUT_VARIABLE OpenMP_TRY_COMPILE_OUTPUT ++ ) ++ if(OpenMP_COMPILE_RESULT_${FLAG_MODE}_${OPENMP_PLAIN_FLAG}) ++ set("${OPENMP_FLAG_VAR}" "${OPENMP_FLAG}" PARENT_SCOPE) ++ set("${OPENMP_LIB_NAMES_VAR}" "" PARENT_SCOPE) ++ break() ++ endif() ++ else() ++ find_library(OpenMP_libomp_LIBRARY ++ NAMES omp gomp iomp5 ++ HINTS ${CMAKE_${LANG}_IMPLICIT_LINK_DIRECTORIES} ++ DOC "libomp location for OpenMP" ++ ) ++ endif() ++ mark_as_advanced(OpenMP_libomp_LIBRARY) + + if(MKL_OPENMP_LIBRARY) + # If we already link OpenMP via MKL, use that. Otherwise at run-time +diff --git a/scripts/fujitsu/1_python.sh b/scripts/fujitsu/1_python.sh +index 90790698f9..0c9f828549 100755 +--- a/scripts/fujitsu/1_python.sh ++++ b/scripts/fujitsu/1_python.sh +@@ -39,7 +39,7 @@ source $script_basedir/env.src + + if [ -v fjenv_debug ]; then set -x; fi + +-PYTHON_VER=3.9 ++PYTHON_VER=3.10 + PYTHON_DIR=cpython + + # +@@ -83,7 +83,7 @@ if [ "$fjenv_use_fcc" = "true" ]; then + # TODO: $ORIGIN sometimes parsed as 'RIGIN'. + # perhaps more backslashs are needed to protect $ORIGIN from parsing in shell. + # export LDFLAGS="-Wl,-rpath,\$ORIGIN/../lib" +- export LDFLAGS="-Wl,-rpath,${PREFIX}/lib -Wl,-rpath,${TCSDS_PATH}/lib64 -lpthread" ++ export LDFLAGS="-Wl,-rpath,${PREFIX}/lib -L/usr/lib64" + else + # Ditto. + #export LDFLAGS="-Wl,-rpath,\$ORIGIN/../lib" +@@ -105,7 +105,7 @@ if [ "${fjenv_use_fcc}" = "true" ]; then + # We used to link with '--linkfortran', which turned out to be unnecessary. + # It was used to link with the module solery written by Fortran. + # ${CXX} --linkfortran -SSL2 -Kopenmp -Nlibomp -o python Programs/python.o -L. 
-lpython$PYTHON_VER $LDFLAGS +- ${CXX} -Kopenmp -Nlibomp -SSL2BLAMP -lfjlapackexsve -o python Programs/python.o -L. -lpython$PYTHON_VER $LDFLAGS ++ ${CXX} -Kopenmp -SSL2BLAMP -lfjlapackexsve -o python Programs/python.o -L. -lpython$PYTHON_VER $LDFLAGS + fi + + make install +@@ -119,7 +119,7 @@ hash -r + # Note that python 3.9 buildles setuptools 58.1. + #pip3 uninstall -y setuptools + +-pip3 install --upgrade ${PIP3_OPTIONS} 'setuptools>60.6.0' # or setuptools<59.6.0 ++pip3 install --upgrade ${PIP3_OPTIONS} 'setuptools==73.0.1' # or setuptools<59.6.0 + + # Show configuration + +diff --git a/scripts/fujitsu/3_venv.sh b/scripts/fujitsu/3_venv.sh +index b271fdc535..4169a4930d 100755 +--- a/scripts/fujitsu/3_venv.sh ++++ b/scripts/fujitsu/3_venv.sh +@@ -81,9 +81,9 @@ fi + # Workaround is found in: + # See https://stackoverflow.com/questions/70520120/attributeerror-module-setuptools-distutils-has-no-attribute-version + +-pip3 install --upgrade ${PIP3_OPTIONS} 'setuptools>60.6.0' # or setuptools<59.6.0 ++pip3 install --upgrade ${PIP3_OPTIONS} 'setuptools==73.0.1' # or setuptools<59.6.0 + +-pip3 install --upgrade ${PIP3_OPTIONS} pip future six wheel ++pip3 install --upgrade ${PIP3_OPTIONS} pip==25.3 + + pip3 list | tee $script_basedir/pip3_list.txt + +diff --git a/scripts/fujitsu/4_numpy_scipy.sh b/scripts/fujitsu/4_numpy_scipy.sh +index fe07872ab3..6a50b7f880 100755 +--- a/scripts/fujitsu/4_numpy_scipy.sh ++++ b/scripts/fujitsu/4_numpy_scipy.sh +@@ -41,8 +41,12 @@ if [ -v fjenv_debug ]; then set -x; fi + + NUMPY_VER=v1.22.4 + NUMPY_DIR=numpy +-SCIPY_VER=v1.7.3 ++SCIPY_VER=v1.10.1 + SCIPY_DIR=scipy ++SCIPY_CHERRY_PICK=ab7d08c6148286059f6498ab5c3070268d13cbd9 ++export NPY_BLAS_ORDER=openblas ++export NPY_LAPACK_ORDER=openblas ++ + + # + # Clean up +@@ -61,17 +65,20 @@ fi + [ -d ${DOWNLOAD_PATH} ] || mkdir -p ${DOWNLOAD_PATH} + cd ${DOWNLOAD_PATH} + +-[ -d $NUMPY_DIR ] || ++if [ ! 
-d $NUMPY_DIR ]; then + git clone ${GIT_OPTIONS} \ + -b $NUMPY_VER \ + --depth 1 \ + https://github.com/numpy/numpy.git $NUMPY_DIR ++ (cd $NUMPY_DIR && patch -p 1 < $script_basedir/numpy.patch) ++fi + +-[ -d $SCIPY_DIR ] || ++if [ ! -d $SCIPY_DIR ]; then + git clone ${GIT_OPTIONS} --recursive \ + -b $SCIPY_VER \ +- --depth 1 \ + https://github.com/scipy/scipy.git $SCIPY_DIR ++ (cd $SCIPY_DIR && git cherry-pick ${SCIPY_CHERRY_PICK}) ++fi + + [ -v fjenv_download ] && fjenv_safe_exit 0 + +@@ -94,7 +101,7 @@ fi + + # NumPy maintenance/1.22.x requires Cythone >= 0.29.21 + # NumPy maintenance/1.22.2 requires Cythone >= 0.29.30 +-pip3 install ${PIP3_OPTIONS} 'Cython>=0.29.30' || ++pip3 install ${PIP3_OPTIONS} 'Cython>=0.29.33,<3.0' || + pip3 install ${PIP3_OPTIONS} $PIP_PACKAGE_PATH/Cython*.whl + + cd $DOWNLOAD_PATH/$NUMPY_DIR +@@ -108,19 +115,19 @@ if [ "$fjenv_use_fcc" = "true" -a ! -f site.cfg ]; then + cat <site.cfg + [openblas] + libraries = fjlapackexsve +-library_dirs = $TCSDS_PATH/lib64 +-include_dirs = $TCSDS_PATH/include ++library_dirs = ${SSL2_ROOT}/lib64 ++include_dirs = ${SSL2_ROOT}/include + extra_link_args = -SSL2BLAMP + + [lapack] + lapack_libs = fjlapackexsve +-library_dirs = $TCSDS_PATH/lib64 ++library_dirs = ${SSL2_ROOT}/lib64 + extra_link_args = -SSL2BLAMP + EOF + fi + + NPY_NUM_BUILD_JOBS=$MAX_JOBS \ +- python3 setup.py build -j $MAX_JOBS install ++ python3 setup.py build -j $MAX_JOBS config_fc --fcompiler=llvm install + + # + # Build SciPy +@@ -130,7 +137,7 @@ NPY_NUM_BUILD_JOBS=$MAX_JOBS \ + # older than what NumPy is requiring, but running for reference purpose, + # such as in case of using older NumPy. 
+ +-pip3 install ${PIP3_OPTIONS} 'Cython>=0.29.18' ++pip3 install ${PIP3_OPTIONS} 'Cython>=0.29.33,<3.0' + pip3 install ${PIP3_OPTIONS} pybind11 pythran + + cd $DOWNLOAD_PATH/$SCIPY_DIR +@@ -140,7 +147,7 @@ if [ -v fjenv_rebuild ]; then + fi + + SCIPY_NUM_CYTHONIZE_JOBS=$MAX_JOBS \ +- python3 setup.py build -j $MAX_JOBS --fcompiler=fujitsu install ++ python3 setup.py build -j $MAX_JOBS config_fc --fcompiler=llvm install + + pip3 list | tee $script_basedir/pip3_list.txt + +diff --git a/scripts/fujitsu/5_pytorch.sh b/scripts/fujitsu/5_pytorch.sh +index 8128e584c8..ce820f847d 100755 +--- a/scripts/fujitsu/5_pytorch.sh ++++ b/scripts/fujitsu/5_pytorch.sh +@@ -40,7 +40,7 @@ source $script_basedir/env.src + + if [ -v fjenv_debug ]; then set -x; fi + +-ONEDNN_VER=v2.7 ++ONEDNN_VER=v3.7.1 + + # + # Clean up +@@ -59,8 +59,9 @@ fi + if [ ! -d $PYTORCH_TOP/third_party/ideep/mkl-dnn ]; then + cd $PYTORCH_TOP + git submodule update --init --recursive $GIT_OPTIONS ++ cd $PYTORCH_TOP/third_party/tensorpipe && patch -p 1 < $script_basedir/tensorpipe.patch + fi +-cd $PYTORCH_TOP/third_party/ideep/mkl-dnn/third_party/oneDNN ++cd $PYTORCH_TOP/third_party/ideep/mkl-dnn + git checkout $GIT_OPTIONS $ONEDNN_VER + + [ -v fjenv_download ] && fjenv_safe_exit 0 +@@ -96,7 +97,7 @@ fi + + # 'setup.py' in PyTorch ensures that CFLAGS is used for both C and C++ compiler, + # but just in case... 
+-CFLAGS=-O3 CXXFLAGS=-O3 python3 setup.py build -j $MAX_JOBS install ++BLAS=SSL2 CFLAGS='-O3 -Kopenmp' CXXFLAGS="-O3 -Kopenmp" python3 setup.py build -j $MAX_JOBS install + + pip3 list | tee $script_basedir/pip3_list.txt + +diff --git a/scripts/fujitsu/6_vision.sh b/scripts/fujitsu/6_vision.sh +index 68034079e4..19378d157f 100755 +--- a/scripts/fujitsu/6_vision.sh ++++ b/scripts/fujitsu/6_vision.sh +@@ -42,9 +42,9 @@ if [ -v fjenv_debug ]; then set -x; fi + + JPEG_ARCHIVE_NAME=jpegsrc.v9d + JPEG_DIR=jpeg-9d +-PILLOW_VER=7.2.0 ++PILLOW_VER=8.4.0 + PILLOW_DIR=Pillow +-TORCHVISION_VER=v0.14.1 ++TORCHVISION_VER=v0.24.1 + TORCHVISION_DIR=vision + + # +@@ -132,7 +132,7 @@ export LDFLAGS="-Wl,-rpath,${PREFIX}/lib" + if [ -v fjenv_rebuild ]; then + python3 setup.py clean + fi +-python3 setup.py install ++pip3 install . --verbose --no-build-isolation + + # + # Install torchvision +@@ -145,7 +145,7 @@ fi + + export TORCHVISION_INCLUDE=$PREFIX/include + export TORCHVISION_LIBRARY=$PREFIX/lib +-CFLAGS="-Kfast" python3 setup.py build -j $MAX_JOBS install ++pip3 install . --verbose --no-build-isolation + + pip3 list | tee $script_basedir/pip3_list.txt + +diff --git a/scripts/fujitsu/7_horovod.sh b/scripts/fujitsu/7_horovod.sh +index 7aa6f302b9..4e0138da0d 100755 +--- a/scripts/fujitsu/7_horovod.sh ++++ b/scripts/fujitsu/7_horovod.sh +@@ -41,6 +41,7 @@ if [ -v fjenv_debug ]; then set -x; fi + + HOROVOD_VER=v0.26.1 + HOROVOD_DIR=horovod ++FLATBUFFERS_CHERRY_PICK=20aad0c41e1252b04c72111c3eb221280a9c2009 + + # + # Clean up +@@ -62,6 +63,7 @@ if [ ! 
-d horovod ]; then + -b $HOROVOD_VER \ + --depth 1 \ + https://github.com/horovod/horovod.git ++ (cd horovod/third_party/flatbuffers && git cherry-pick ${FLATBUFFERS_CHERRY_PICK}) + (cd horovod; patch -p 1 < $script_basedir/horovod.patch) + cp -p horovod/examples/pytorch/pytorch_synthetic_benchmark.py $script_basedir + fi +@@ -81,7 +83,7 @@ fi + # + + if [ "${fjenv_use_fcc}" != "true" ]; then +- echo "$0 works for FCC only for now" ++ echo "$0 works for clang++ only for now" + exit 1 + fi + +diff --git a/scripts/fujitsu/env.src b/scripts/fujitsu/env.src +index 7dd81f6d6c..47b2add059 100644 +--- a/scripts/fujitsu/env.src ++++ b/scripts/fujitsu/env.src +@@ -43,10 +43,19 @@ fjenv_src_sourced="Y" + ######################################################################## + ######################################################################## + +-#TCSDS_PATH=/opt/FJSVxtclanga/tcsds-1.2.34 # TCS (FX1000) +-TCSDS_PATH=/opt/FJSVstclanga/cp-1.0.21.01 # CP (FX700) +-VENV_PATH=~/venv +-PREFIX=~/prefix ++module purge ++module load lang/tcsds-1.2.42 ++. /vol0004/apps/oss/llvm-v19.1.4/init.sh ++export SSL2_ROOT=/vol0004/apps/oss/llvm-v19.1.4/compute_node/ssl2 ++export MPI_HOME=/vol0004/apps/oss/llvm-v19.1.4/compute_node ++ ++TCSDS_PATH=/opt/FJSVxtclanga/tcsds-1.2.42 # TCS (FX1000) ++#TCSDS_PATH=/opt/FJSVstclanga/cp-1.0.21.01 # CP (FX700) ++#VENV_PATH=~/venv ++#PREFIX=~/prefix ++ROOT_DIR=$(cd $(dirname ${BASH_SOURCE:-$0})/../../..; pwd) ++VENV_PATH=$ROOT_DIR/venv ++PREFIX=$ROOT_DIR/prefix + + ######################################################################## + ######################################################################## +@@ -63,14 +72,18 @@ PIP_PACKAGE_PATH=${DOWNLOAD_PATH}/pip_packages + # MAX_JOBS should be 40 or less. 
(Note: TCS set this to 50 or 52) + : ${MAX_JOBS:=40} + if [ $MAX_JOBS -gt 40 ]; then MAX_JOBS=40; fi ++export MAX_JOBS=${MAX_JOBS} + + # + # Env for Compilers + # + + if [ "$fjenv_use_fcc" = "true" ]; then +- export CC="fcc -Nclang -Knolargepage" +- export CXX="FCC -Nclang -Knolargepage" ++ export CC="clang" ++ export CXX="clang++" ++ export FC="flang" ++ export F77="flang" ++ export F90="flang" + export LC_ALL=C + fi + +diff --git a/scripts/fujitsu/horovod.patch b/scripts/fujitsu/horovod.patch +index ace3ba5866..c0b1f5b63b 100644 +--- a/scripts/fujitsu/horovod.patch ++++ b/scripts/fujitsu/horovod.patch +@@ -1,6 +1,3 @@ +-# +-# patch for v0.26.1 (Oct-14 2022, 34604870eabd9dc670c222deb1da9acc6b9d7c03) +-# + diff --git a/examples/pytorch/pytorch_synthetic_benchmark.py b/examples/pytorch/pytorch_synthetic_benchmark.py + index d645a20..a3c838f 100644 + --- a/examples/pytorch/pytorch_synthetic_benchmark.py +@@ -60,6 +57,24 @@ index d645a20..a3c838f 100644 + loss.backward() + optimizer.step() + ++diff --git a/horovod/torch/CMakeLists.txt b/horovod/torch/CMakeLists.txt ++index eecd198..04816b5 100644 ++--- a/horovod/torch/CMakeLists.txt +++++ b/horovod/torch/CMakeLists.txt ++@@ -63,9 +63,12 @@ endif() ++ parse_version(${Pytorch_VERSION} VERSION_DEC) ++ add_definitions(-DPYTORCH_VERSION=${VERSION_DEC} -DTORCH_API_INCLUDE_EXTENSION_H=1) ++ set(Pytorch_CXX11 ${Pytorch_CXX11} PARENT_SCOPE) ++-if(NOT Pytorch_VERSION VERSION_LESS "1.5.0") +++if(Pytorch_VERSION VERSION_GREATER_EQUAL "1.5.0" AND Pytorch_VERSION VERSION_LESS "2.0.0") ++ set(CMAKE_CXX_STANDARD 14) ++ endif() +++if(Pytorch_VERSION VERSION_GREATER_EQUAL "2.0.0") +++ set(CMAKE_CXX_STANDARD 17) +++endif() ++ ++ # PyTorch SOURCES ++ # Later versions of PyTorch that use ROCm's hipify step will rename files. 
+ diff --git a/horovod/torch/mpi_ops.py b/horovod/torch/mpi_ops.py + index ab764c5..b78a108 100644 + --- a/horovod/torch/mpi_ops.py +diff --git a/scripts/fujitsu/vision.patch b/scripts/fujitsu/vision.patch +index 3a1b5da138..79c0525627 100644 +--- a/scripts/fujitsu/vision.patch ++++ b/scripts/fujitsu/vision.patch +@@ -1,498 +1,42 @@ +-# +-# patch for v0.14.1 (Dec-8 2022, 5e8e2f125f140d1e908cf424a6a85cacad758125) +-# + diff --git a/setup.py b/setup.py +-index 9519890..4a09c3f 100644 ++index c3ba164..57ad652 100644 + --- a/setup.py + +++ b/setup.py +-@@ -209,6 +209,17 @@ def get_extensions(): +- define_macros += [("USE_PYTHON", None)] +- extra_compile_args["cxx"].append("/MP") ++@@ -14,6 +14,7 @@ import torch ++ from pkg_resources import DistributionNotFound, get_distribution, parse_version ++ from setuptools import find_packages, setup ++ from torch.utils.cpp_extension import BuildExtension, CppExtension, CUDA_HOME, CUDAExtension, ROCM_HOME +++import re ++ ++ FORCE_CUDA = os.getenv("FORCE_CUDA", "0") == "1" ++ FORCE_MPS = os.getenv("FORCE_MPS", "0") == "1" ++@@ -140,6 +141,29 @@ def get_macros_and_flags(): ++ if sysconfig.get_config_var("Py_GIL_DISABLED"): ++ extra_compile_args["cxx"].append("-DPy_GIL_DISABLED") + + + # As long as torch is utilizing OpenMP, + + # FCC requires -fopenmp for all submodules even though it doesn't use OpenMP. 
+ + if torch.has_openmp: + + if sys.platform == 'linux': +-+ try: +-+ extra_compile_args['cxx'].append('-fopenmp') +-+ except KeyError: +-+ extra_compile_args = { +-+ 'cxx': ['-fopenmp'] +-+ } +-+ +- if debug_mode: +- print("Compiling in debug mode") +++ config_output = torch.__config__.show() +++ cxx_flags_match = re.search(r'CXX_FLAGS=(.*)', config_output) +++ if cxx_flags_match: +++ cxx_flags = cxx_flags_match.group(1) +++ if re.search(r'-fopenmp', cxx_flags): +++ try: +++ extra_compile_args['cxx'].append('-fopenmp') +++ except KeyError: +++ extra_compile_args = { +++ 'cxx': ['-fopenmp'] +++ } +++ if re.search(r'-Kopenmp', cxx_flags): +++ try: +++ extra_compile_args['cxx'].append('-Kopenmp') +++ except KeyError: +++ extra_compile_args = { +++ 'cxx': ['-Kopenmp'] +++ } +++ ++ if DEBUG: + extra_compile_args["cxx"].append("-g") +-diff --git a/torchvision/csrc/ops/cpu/nms_kernel.cpp b/torchvision/csrc/ops/cpu/nms_kernel.cpp +-index c54d1f0..369b6a9 100644 +---- a/torchvision/csrc/ops/cpu/nms_kernel.cpp +-+++ b/torchvision/csrc/ops/cpu/nms_kernel.cpp +-@@ -20,13 +20,6 @@ at::Tensor nms_kernel_impl( +- if (dets.numel() == 0) +- return at::empty({0}, dets.options().dtype(at::kLong)); +- +-- auto x1_t = dets.select(1, 0).contiguous(); +-- auto y1_t = dets.select(1, 1).contiguous(); +-- auto x2_t = dets.select(1, 2).contiguous(); +-- auto y2_t = dets.select(1, 3).contiguous(); +-- +-- at::Tensor areas_t = (x2_t - x1_t) * (y2_t - y1_t); +-- +- auto order_t = std::get<1>( +- scores.sort(/*stable=*/true, /*dim=*/0, /* descending=*/true)); +- +-@@ -34,6 +27,15 @@ at::Tensor nms_kernel_impl( +- at::Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte)); +- at::Tensor keep_t = at::zeros({ndets}, dets.options().dtype(at::kLong)); +- +-+ auto dets_sorted = dets.index_select(0, order_t); +-+ +-+ auto x1_t = dets_sorted.select(1, 0).contiguous(); +-+ auto y1_t = dets_sorted.select(1, 1).contiguous(); +-+ auto x2_t = dets_sorted.select(1, 2).contiguous(); +-+ 
auto y2_t = dets_sorted.select(1, 3).contiguous(); +-+ +-+ at::Tensor areas_t = (x2_t - x1_t) * (y2_t - y1_t); +-+ +- auto suppressed = suppressed_t.data_ptr(); +- auto keep = keep_t.data_ptr(); +- auto order = order_t.data_ptr(); +-@@ -45,19 +47,16 @@ at::Tensor nms_kernel_impl( +- +- int64_t num_to_keep = 0; +- +-- for (int64_t _i = 0; _i < ndets; _i++) { +-- auto i = order[_i]; +-+ for (int64_t i = 0; i < ndets; i++) { +- if (suppressed[i] == 1) +- continue; +-- keep[num_to_keep++] = i; +- auto ix1 = x1[i]; +- auto iy1 = y1[i]; +- auto ix2 = x2[i]; +- auto iy2 = y2[i]; +- auto iarea = areas[i]; +- +-- for (int64_t _j = _i + 1; _j < ndets; _j++) { +-- auto j = order[_j]; +-+ for (int64_t j = i + 1; j < ndets; j++) { +- if (suppressed[j] == 1) +- continue; +- auto xx1 = std::max(ix1, x1[j]); +-@@ -73,6 +72,11 @@ at::Tensor nms_kernel_impl( +- suppressed[j] = 1; +- } +- } +-+ for (int64_t i = 0; i < ndets; i++) { +-+ if (suppressed[i] == 1) +-+ continue; +-+ keep[num_to_keep++] = order[i]; +-+ } +- return keep_t.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep); +- } +- +-diff --git a/torchvision/csrc/ops/cpu/roi_align_kernel.cpp b/torchvision/csrc/ops/cpu/roi_align_kernel.cpp +-index e6684e9..d4c6b0e 100644 +---- a/torchvision/csrc/ops/cpu/roi_align_kernel.cpp +-+++ b/torchvision/csrc/ops/cpu/roi_align_kernel.cpp +-@@ -1,4 +1,5 @@ +- #include +-+#include +- #include +- +- #include "./roi_align_common.h" +-@@ -24,87 +25,89 @@ void roi_align_forward_kernel_impl( +- T* output) { +- // (n, c, ph, pw) is an element in the pooled output +- // can be parallelized using omp +-- // #pragma omp parallel for num_threads(32) +-- for (int n = 0; n < n_rois; n++) { +-- int index_n = n * channels * pooled_width * pooled_height; +-- +-- const T* offset_rois = rois + n * 5; +-- int roi_batch_ind = offset_rois[0]; +-- +-- // Do not using rounding; this implementation detail is critical +-- T offset = aligned ? 
(T)0.5 : (T)0.0; +-- T roi_start_w = offset_rois[1] * spatial_scale - offset; +-- T roi_start_h = offset_rois[2] * spatial_scale - offset; +-- T roi_end_w = offset_rois[3] * spatial_scale - offset; +-- T roi_end_h = offset_rois[4] * spatial_scale - offset; +-- +-- T roi_width = roi_end_w - roi_start_w; +-- T roi_height = roi_end_h - roi_start_h; +-- if (!aligned) { +-- // Force malformed ROIs to be 1x1 +-- roi_width = std::max(roi_width, (T)1.); +-- roi_height = std::max(roi_height, (T)1.); +-- } +-- +-- T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); +-- T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); +-- +-- // We use roi_bin_grid to sample the grid and mimic integral +-- int roi_bin_grid_h = (sampling_ratio > 0) +-- ? sampling_ratio +-- : ceil(roi_height / pooled_height); // e.g., = 2 +-- int roi_bin_grid_w = +-- (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); +-- +-- // We do average (integral) pooling inside a bin +-- // When the grid is empty, output zeros. +-- const T count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. 
= 4 +-- +-- // we want to precalculate indices and weights shared by all chanels, +-- // this is the key point of optimization +-- std::vector> pre_calc( +-- roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height); +-- detail::pre_calc_for_bilinear_interpolate( +-- height, +-- width, +-- pooled_height, +-- pooled_width, +-- roi_start_h, +-- roi_start_w, +-- bin_size_h, +-- bin_size_w, +-- roi_bin_grid_h, +-- roi_bin_grid_w, +-- pre_calc); +-- +-- for (int c = 0; c < channels; c++) { +-- int index_n_c = index_n + c * pooled_width * pooled_height; +-- const T* offset_input = +-- input + (roi_batch_ind * channels + c) * height * width; +-- int pre_calc_index = 0; +-- +-- for (int ph = 0; ph < pooled_height; ph++) { +-- for (int pw = 0; pw < pooled_width; pw++) { +-- int index = index_n_c + ph * pooled_width + pw; +-- +-- T output_val = 0.; +-- for (int iy = 0; iy < roi_bin_grid_h; iy++) { +-- for (int ix = 0; ix < roi_bin_grid_w; ix++) { +-- detail::PreCalc pc = pre_calc[pre_calc_index]; +-- output_val += pc.w1 * offset_input[pc.pos1] + +-- pc.w2 * offset_input[pc.pos2] + +-- pc.w3 * offset_input[pc.pos3] + pc.w4 * offset_input[pc.pos4]; +-- +-- pre_calc_index += 1; +-- } +-- } +-- output_val /= count; // Average pooling +-- +-- output[index] = output_val; +-- } // for pw +-- } // for ph +-- } // for c +-- } // for n +-+ int grain_size = ceil(n_rois / at::get_num_threads()); +-+ at::parallel_for(0, n_rois, grain_size, [&](int64_t start, int64_t end) { +-+ for (int n = start; n < end; n++) { +-+ int index_n = n * channels * pooled_width * pooled_height; +-+ +-+ const T* offset_rois = rois + n * 5; +-+ int roi_batch_ind = offset_rois[0]; +-+ +-+ // Do not using rounding; this implementation detail is critical +-+ T offset = aligned ? 
(T)0.5 : (T)0.0; +-+ T roi_start_w = offset_rois[1] * spatial_scale - offset; +-+ T roi_start_h = offset_rois[2] * spatial_scale - offset; +-+ T roi_end_w = offset_rois[3] * spatial_scale - offset; +-+ T roi_end_h = offset_rois[4] * spatial_scale - offset; +-+ +-+ T roi_width = roi_end_w - roi_start_w; +-+ T roi_height = roi_end_h - roi_start_h; +-+ if (!aligned) { +-+ // Force malformed ROIs to be 1x1 +-+ roi_width = std::max(roi_width, (T)1.); +-+ roi_height = std::max(roi_height, (T)1.); +-+ } +-+ +-+ T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); +-+ T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); +-+ +-+ // We use roi_bin_grid to sample the grid and mimic integral +-+ int roi_bin_grid_h = (sampling_ratio > 0) +-+ ? sampling_ratio +-+ : ceil(roi_height / pooled_height); // e.g., = 2 +-+ int roi_bin_grid_w = +-+ (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); +-+ +-+ // We do average (integral) pooling inside a bin +-+ // When the grid is empty, output zeros. +-+ const T count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. 
= 4 +-+ +-+ // we want to precalculate indices and weights shared by all chanels, +-+ // this is the key point of optimization +-+ std::vector> pre_calc( +-+ roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height); +-+ detail::pre_calc_for_bilinear_interpolate( +-+ height, +-+ width, +-+ pooled_height, +-+ pooled_width, +-+ roi_start_h, +-+ roi_start_w, +-+ bin_size_h, +-+ bin_size_w, +-+ roi_bin_grid_h, +-+ roi_bin_grid_w, +-+ pre_calc); +-+ +-+ for (int c = 0; c < channels; c++) { +-+ int index_n_c = index_n + c * pooled_width * pooled_height; +-+ const T* offset_input = +-+ input + (roi_batch_ind * channels + c) * height * width; +-+ int pre_calc_index = 0; +-+ +-+ for (int ph = 0; ph < pooled_height; ph++) { +-+ for (int pw = 0; pw < pooled_width; pw++) { +-+ int index = index_n_c + ph * pooled_width + pw; +-+ +-+ T output_val = 0.; +-+ for (int iy = 0; iy < roi_bin_grid_h; iy++) { +-+ for (int ix = 0; ix < roi_bin_grid_w; ix++) { +-+ detail::PreCalc pc = pre_calc[pre_calc_index]; +-+ output_val += pc.w1 * offset_input[pc.pos1] + +-+ pc.w2 * offset_input[pc.pos2] + +-+ pc.w3 * offset_input[pc.pos3] + pc.w4 * offset_input[pc.pos4]; +-+ +-+ pre_calc_index += 1; +-+ } +-+ } +-+ output_val /= count; // Average pooling +-+ +-+ output[index] = output_val; +-+ } // for pw +-+ } // for ph +-+ } // for c +-+ } // for n +-+ }); +- } +- +- template +-@@ -183,100 +186,105 @@ void roi_align_backward_kernel_impl( +- int pooled_width, +- int sampling_ratio, +- bool aligned, +-- T* grad_input, +-+ const int64_t grad_input_size, +-+ T* grad_input_buffer, +- const T* rois, +- int n_stride, +- int c_stride, +- int h_stride, +- int w_stride) { +-- for (int index = 0; index < nthreads; index++) { +-- // (n, c, ph, pw) is an element in the pooled output +-- int pw = index % pooled_width; +-- int ph = (index / pooled_width) % pooled_height; +-- int c = (index / pooled_width / pooled_height) % channels; +-- int n = index / pooled_width / pooled_height / channels; +-- +-- const 
T* offset_rois = rois + n * 5; +-- int roi_batch_ind = offset_rois[0]; +-- +-- // Do not using rounding; this implementation detail is critical +-- T offset = aligned ? (T)0.5 : (T)0.0; +-- T roi_start_w = offset_rois[1] * spatial_scale - offset; +-- T roi_start_h = offset_rois[2] * spatial_scale - offset; +-- T roi_end_w = offset_rois[3] * spatial_scale - offset; +-- T roi_end_h = offset_rois[4] * spatial_scale - offset; +-- +-- T roi_width = roi_end_w - roi_start_w; +-- T roi_height = roi_end_h - roi_start_h; +-- if (!aligned) { +-- // Force malformed ROIs to be 1x1 +-- roi_width = std::max(roi_width, (T)1.); +-- roi_height = std::max(roi_height, (T)1.); +-- } +-- +-- T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); +-- T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); +-- +-- T* offset_grad_input = +-- grad_input + ((roi_batch_ind * channels + c) * height * width); +-- +-- int output_offset = n * n_stride + c * c_stride; +-- const T* offset_grad_output = grad_output + output_offset; +-- const T grad_output_this_bin = +-- offset_grad_output[ph * h_stride + pw * w_stride]; +-- +-- // We use roi_bin_grid to sample the grid and mimic integral +-- int roi_bin_grid_h = (sampling_ratio > 0) +-- ? sampling_ratio +-- : ceil(roi_height / pooled_height); // e.g., = 2 +-- int roi_bin_grid_w = +-- (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); +-- +-- // We do average (integral) pooling inside a bin +-- const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. 
= 4 +-- +-- for (int iy = 0; iy < roi_bin_grid_h; iy++) { +-- const T y = roi_start_h + ph * bin_size_h + +-- static_cast(iy + .5f) * bin_size_h / +-- static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 +-- for (int ix = 0; ix < roi_bin_grid_w; ix++) { +-- const T x = roi_start_w + pw * bin_size_w + +-- static_cast(ix + .5f) * bin_size_w / +-- static_cast(roi_bin_grid_w); +-- +-- T w1, w2, w3, w4; +-- int x_low, x_high, y_low, y_high; +-- +-- bilinear_interpolate_gradient( +-- height, +-- width, +-- y, +-- x, +-- w1, +-- w2, +-- w3, +-- w4, +-- x_low, +-- x_high, +-- y_low, +-- y_high, +-- index); +-- +-- T g1 = grad_output_this_bin * w1 / count; +-- T g2 = grad_output_this_bin * w2 / count; +-- T g3 = grad_output_this_bin * w3 / count; +-- T g4 = grad_output_this_bin * w4 / count; +-- +-- if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { +-- // atomic add is not needed for now since it is single threaded +-- add(offset_grad_input + y_low * width + x_low, static_cast(g1)); +-- add(offset_grad_input + y_low * width + x_high, static_cast(g2)); +-- add(offset_grad_input + y_high * width + x_low, static_cast(g3)); +-- add(offset_grad_input + y_high * width + x_high, static_cast(g4)); +-- } // if +-- } // ix +-- } // iy +-- } // for +-+ int grain_size = ceil(nthreads / at::get_num_threads()); +-+ at::parallel_for(0, nthreads, grain_size, [&](int64_t start, int64_t end) { +-+ for (int index = start; index < end; index++) { +-+ int thread_no = at::get_thread_num(); +-+ // (n, c, ph, pw) is an element in the pooled output +-+ int pw = index % pooled_width; +-+ int ph = (index / pooled_width) % pooled_height; +-+ int c = (index / pooled_width / pooled_height) % channels; +-+ int n = index / pooled_width / pooled_height / channels; +-+ +-+ const T* offset_rois = rois + n * 5; +-+ int roi_batch_ind = offset_rois[0]; +-+ +-+ // Do not using rounding; this implementation detail is critical +-+ T offset = aligned ? 
(T)0.5 : (T)0.0; +-+ T roi_start_w = offset_rois[1] * spatial_scale - offset; +-+ T roi_start_h = offset_rois[2] * spatial_scale - offset; +-+ T roi_end_w = offset_rois[3] * spatial_scale - offset; +-+ T roi_end_h = offset_rois[4] * spatial_scale - offset; +-+ +-+ T roi_width = roi_end_w - roi_start_w; +-+ T roi_height = roi_end_h - roi_start_h; +-+ if (!aligned) { +-+ // Force malformed ROIs to be 1x1 +-+ roi_width = std::max(roi_width, (T)1.); +-+ roi_height = std::max(roi_height, (T)1.); +-+ } +-+ +-+ T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); +-+ T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); +-+ +-+ T* offset_grad_input = grad_input_buffer + (thread_no * grad_input_size) + +-+ ((roi_batch_ind * channels + c) * height * width); +-+ +-+ int output_offset = n * n_stride + c * c_stride; +-+ const T* offset_grad_output = grad_output + output_offset; +-+ const T grad_output_this_bin = +-+ offset_grad_output[ph * h_stride + pw * w_stride]; +-+ +-+ // We use roi_bin_grid to sample the grid and mimic integral +-+ int roi_bin_grid_h = (sampling_ratio > 0) +-+ ? sampling_ratio +-+ : ceil(roi_height / pooled_height); // e.g., = 2 +-+ int roi_bin_grid_w = +-+ (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); +-+ +-+ // We do average (integral) pooling inside a bin +-+ const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. 
= 4 +-+ +-+ for (int iy = 0; iy < roi_bin_grid_h; iy++) { +-+ const T y = roi_start_h + ph * bin_size_h + +-+ static_cast(iy + .5f) * bin_size_h / +-+ static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 +-+ for (int ix = 0; ix < roi_bin_grid_w; ix++) { +-+ const T x = roi_start_w + pw * bin_size_w + +-+ static_cast(ix + .5f) * bin_size_w / +-+ static_cast(roi_bin_grid_w); +-+ +-+ T w1, w2, w3, w4; +-+ int x_low, x_high, y_low, y_high; +-+ +-+ bilinear_interpolate_gradient( +-+ height, +-+ width, +-+ y, +-+ x, +-+ w1, +-+ w2, +-+ w3, +-+ w4, +-+ x_low, +-+ x_high, +-+ y_low, +-+ y_high, +-+ index); +-+ +-+ T g1 = grad_output_this_bin * w1 / count; +-+ T g2 = grad_output_this_bin * w2 / count; +-+ T g3 = grad_output_this_bin * w3 / count; +-+ T g4 = grad_output_this_bin * w4 / count; +-+ +-+ if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { +-+ // atomic add is not needed for now since it is single threaded +-+ add(offset_grad_input + y_low * width + x_low, static_cast(g1)); +-+ add(offset_grad_input + y_low * width + x_high, static_cast(g2)); +-+ add(offset_grad_input + y_high * width + x_low, static_cast(g3)); +-+ add(offset_grad_input + y_high * width + x_high, static_cast(g4)); +-+ } // if +-+ } // ix +-+ } // iy +-+ } // for +-+ }); +- } +- +- at::Tensor roi_align_forward_kernel( +-@@ -355,6 +363,9 @@ at::Tensor roi_align_backward_kernel( +- return grad_input; +- } +- +-+ const int num_threads = at::get_num_threads(); +-+ at::Tensor grad_input_buffer = +-+ at::zeros({num_threads, batch_size, channels, height, width}, grad.options()); +- // get stride values to ensure indexing into gradients is correct. 
+- int n_stride = grad.stride(0); +- int c_stride = grad.stride(1); +-@@ -375,13 +386,17 @@ at::Tensor roi_align_backward_kernel( +- pooled_width, +- sampling_ratio, +- aligned, +-- grad_input.data_ptr(), +-+ grad_input.numel(), +-+ grad_input_buffer.data_ptr(), +- rois_.data_ptr(), +- n_stride, +- c_stride, +- h_stride, +- w_stride); +- }); +-+ for (int64_t i = 0; i < num_threads; ++i) { +-+ grad_input.add_(grad_input_buffer.select(0, i)); +-+ } +- return grad_input; +- } +- ++ extra_compile_args["cxx"].append("-O0") +diff --git a/test/cpp/c10d/CMakeLists.txt b/test/cpp/c10d/CMakeLists.txt +index 285a5dd2a7..9b440e217f 100644 +--- a/test/cpp/c10d/CMakeLists.txt ++++ b/test/cpp/c10d/CMakeLists.txt +@@ -20,6 +20,7 @@ function(c10d_add_test test_src) + $ + $ + ) ++ target_include_directories(${test_name} PRIVATE ${MPI_CXX_INCLUDE_PATH}) + target_link_libraries(${test_name} PRIVATE + fmt::fmt-header-only + ${ARG_LINK_LIBRARIES} +@@ -83,7 +84,7 @@ if(USE_MPI AND USE_C10D_MPI) + # private headers of libtorch, which in turn include MPI. As a hacky + # alternative to making MPI a public dependency of libtorch, we make it + # a private dependency of the tests as well. +- c10d_add_test(ProcessGroupMPITest.cpp LINK_LIBRARIES torch_cpu MPI::MPI_CXX INSTALL_TEST ${INSTALL_TEST}) ++ c10d_add_test(ProcessGroupMPITest.cpp LINK_LIBRARIES torch_cpu ${MPI_CXX_LIBRARIES} INSTALL_TEST ${INSTALL_TEST}) + endif() + + if(LINUX AND USE_GLOO AND USE_C10D_GLOO) +diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt +index 1632147f02..3fa6ec87fd 100644 +--- a/torch/CMakeLists.txt ++++ b/torch/CMakeLists.txt +@@ -294,7 +294,8 @@ if(USE_DISTRIBUTED) + endif() + # Same for MPI. 
+ if(USE_MPI) +- list(APPEND TORCH_PYTHON_LINK_LIBRARIES MPI::MPI_CXX) ++ list(APPEND TORCH_PYTHON_LINK_LIBRARIES ${MPI_CXX_LIBRARIES}) ++ list(APPEND TORCH_PYTHON_INCLUDE_DIRECTORIES ${MPI_CXX_INCLUDE_PATH}) + endif() + list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_C10D) + diff --git a/llvm19.1.4/patch/tensorpipe.patch b/llvm19.1.4/patch/tensorpipe.patch new file mode 100644 index 0000000..87a2ba5 --- /dev/null +++ b/llvm19.1.4/patch/tensorpipe.patch @@ -0,0 +1,13 @@ +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 77df76d..bba7a14 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -10,6 +10,8 @@ project(tensorpipe LANGUAGES C CXX) + + set(CMAKE_CXX_STANDARD 17) + ++set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-missing-template-arg-list-after-template-kw") ++ + list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake") + + # Expose build options. diff --git a/llvm21.1.0/patch/numpy.patch b/llvm21.1.0/patch/numpy.patch new file mode 100644 index 0000000..7a5ce4e --- /dev/null +++ b/llvm21.1.0/patch/numpy.patch @@ -0,0 +1,102 @@ +diff --git a/numpy/distutils/fcompiler/__init__.py b/numpy/distutils/fcompiler/__init__.py +index d8dcfa8..ebe0647 100644 +--- a/numpy/distutils/fcompiler/__init__.py ++++ b/numpy/distutils/fcompiler/__init__.py +@@ -745,7 +745,7 @@ def wrap_unlinkable_objects(self, objects, output_dir, extra_dll_dir): + ('cygwin.*', ('gnu', 'intelv', 'absoft', 'compaqv', 'intelev', 'gnu95', 'g95')), + ('linux.*', ('arm', 'gnu95', 'intel', 'lahey', 'pg', 'nv', 'absoft', 'nag', + 'vast', 'compaq', 'intele', 'intelem', 'gnu', 'g95', +- 'pathf95', 'nagfor', 'fujitsu')), ++ 'pathf95', 'nagfor', 'fujitsu', 'llvm')), + ('darwin.*', ('gnu95', 'nag', 'nagfor', 'absoft', 'ibm', 'intel', 'gnu', + 'g95', 'pg')), + ('sunos.*', ('sun', 'gnu', 'gnu95', 'g95')), +diff --git a/numpy/distutils/fcompiler/llvm.py b/numpy/distutils/fcompiler/llvm.py +new file mode 100644 +index 0000000..f3db492 +--- /dev/null ++++ b/numpy/distutils/fcompiler/llvm.py +@@ -0,0 +1,71 
@@ ++from __future__ import division, absolute_import, print_function ++ ++import sys ++ ++from numpy.distutils.fcompiler import FCompiler, dummy_fortran_file ++from sys import platform ++from os.path import join, dirname, normpath ++ ++compilers = ['LlvmFlangFCompiler'] ++ ++import functools ++ ++class LlvmFlangFCompiler(FCompiler): ++ compiler_type = 'llvm' ++ description = 'LLVM Fortran Compiler' ++ version_pattern = r'\s*flang.*version (?P[\d.-]+).*' ++ ++ possible_executables = ['flang'] ++ ++ executables = { ++ 'version_cmd': ["", "--version"], ++ 'compiler_f77': ["flang", "-fPIC"], ++ 'compiler_fix': ["flang", "-fPIC", "-ffixed-form"], ++ 'compiler_f90': ["flang", "-fPIC"], ++ 'linker_so': ["flang", "-fPIC", "-shared"], ++ 'archiver': ["ar", "-cr"], ++ 'ranlib': None ++ } ++ ++ pic_flags = ["-fPIC", "-DPIC"] ++ c_compiler = 'clang' ++ module_dir_switch = '-module ' # Don't remove ending space! ++ ++ def get_libraries(self): ++ opt = FCompiler.get_libraries(self) ++ return opt ++ ++ @functools.lru_cache(maxsize=128) ++ def get_library_dirs(self): ++ """List of compiler library directories.""" ++ opt = FCompiler.get_library_dirs(self) ++ flang_dir = dirname(self.executables['compiler_f77'][0]) ++ opt.append(normpath(join(flang_dir, '..', 'lib'))) ++ ++ return opt ++ ++ def get_flags(self): ++ return [] ++ ++ def get_flags_free(self): ++ return [] ++ ++ def get_flags_debug(self): ++ return ['-g'] ++ ++ def get_flags_opt(self): ++ return ['-O3'] ++ ++ def get_flags_arch(self): ++ return [] ++ ++ def runtime_library_dir_option(self, dir): ++ return '-Wl,-rpath=%s' % dir ++ ++ ++if __name__ == '__main__': ++ from distutils import log ++ log.set_verbosity(2) ++ from numpy.distutils import customized_fcompiler ++ print(customized_fcompiler(compiler='llvm').get_version()) ++ +diff --git a/numpy/tests/test_public_api.py b/numpy/tests/test_public_api.py +index bb15e10..9369424 100644 +--- a/numpy/tests/test_public_api.py ++++ b/numpy/tests/test_public_api.py +@@ -233,6 
+233,7 @@ def test_NPY_NO_EXPORT(): + "distutils.fcompiler.sun", + "distutils.fcompiler.vast", + "distutils.fcompiler.fujitsu", ++ "distutils.fcompiler.llvm", + "distutils.from_template", + "distutils.intelccompiler", + "distutils.lib2def", diff --git a/llvm21.1.0/patch/pytorch.patch b/llvm21.1.0/patch/pytorch.patch new file mode 100644 index 0000000..7287b1f --- /dev/null +++ b/llvm21.1.0/patch/pytorch.patch @@ -0,0 +1,997 @@ +diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt +index 6ab41b6c84..61a0cd2f9e 100644 +--- a/caffe2/CMakeLists.txt ++++ b/caffe2/CMakeLists.txt +@@ -1549,7 +1549,8 @@ target_link_libraries(torch_cpu PUBLIC ${Caffe2_PUBLIC_DEPENDENCY_LIBS}) + target_link_libraries(torch_cpu PRIVATE ${Caffe2_DEPENDENCY_LIBS}) + target_link_libraries(torch_cpu PRIVATE ${Caffe2_DEPENDENCY_WHOLE_LINK_LIBS}) + if(USE_MPI) +- target_link_libraries(torch_cpu PRIVATE MPI::MPI_CXX) ++ target_link_libraries(torch_cpu PRIVATE ${MPI_CXX_LIBRARIES}) ++ target_include_directories(torch_cpu PRIVATE ${MPI_CXX_INCLUDE_PATH}) + endif() + target_include_directories(torch_cpu INTERFACE $) + target_include_directories(torch_cpu PRIVATE ${Caffe2_CPU_INCLUDE}) +@@ -1727,7 +1728,8 @@ if(BUILD_SHARED_LIBS) + endif() + set_target_properties(torch_global_deps PROPERTIES LINKER_LANGUAGE C) + if(USE_MPI) +- target_link_libraries(torch_global_deps MPI::MPI_CXX) ++ target_link_libraries(torch_global_deps ${MPI_CXX_LIBRARIES}) ++ target_include_directories(torch_global_deps PUBLIC ${MPI_CXX_INCLUDE_PATH}) + endif() + if(CAFFE2_USE_MKL) + target_link_libraries(torch_global_deps caffe2::mkl) +diff --git a/cmake/Modules/FindARM.cmake b/cmake/Modules/FindARM.cmake +index 903025c5c2..a419c1aeed 100644 +--- a/cmake/Modules/FindARM.cmake ++++ b/cmake/Modules/FindARM.cmake +@@ -153,7 +153,7 @@ IF(CMAKE_SYSTEM_NAME MATCHES "Linux") + + # Check for SVE256 vector length + CHECK_COMPILES(CXX "SVE256" "-march=armv8.2-a+sve -msve-vector-bits=256" "${SVE_CODE}") +- CHECK_COMPILES(CXX "ARM_BF16" 
"-march=armv8.2-a+sve+bf16 -msve-vector-bits=256" "${ARM_BF16_CODE}") ++ #CHECK_COMPILES(CXX "ARM_BF16" "-march=armv8.2-a+sve+bf16 -msve-vector-bits=256" "${ARM_BF16_CODE}") + + # If SVE256 support is not found, set CXX_SVE_FOUND to FALSE and notify the user + if(NOT CXX_SVE256_FOUND) +diff --git a/cmake/Modules/FindMPI.cmake b/cmake/Modules/FindMPI.cmake +new file mode 100644 +index 0000000000..cd77062a48 +--- /dev/null ++++ b/cmake/Modules/FindMPI.cmake +@@ -0,0 +1,53 @@ ++if(CMAKE_C_COMPILER MATCHES ".*/clang$" AND ++ CMAKE_CXX_COMPILER MATCHES ".*/clang\\+\\+$") ++ if(DEFINED ENV{MPI_HOME}) ++ set(TCSMPI_EXEC_PATH "$ENV{MPI_HOME}/bin") ++ else() ++ string(REGEX REPLACE "/clang\\+\\+$" "" CMAKE_CXX_COMPILER_DIR "${CMAKE_CXX_COMPILER}") ++ set(TCSMPI_EXEC_PATH "${CMAKE_CXX_COMPILER_DIR}") ++ endif() ++ execute_process(COMMAND "${TCSMPI_EXEC_PATH}/mpiclang++" "--show" ++ RESULT_VARIABLE MPIFCC_EXEC_RESULT ++ OUTPUT_QUIET ++ ERROR_QUIET) ++ if(MPIFCC_EXEC_RESULT EQUAL 0) ++ message(STATUS "TCS-MPI ENABLED") ++ set(MPI_CXX_FOUND ON) ++ execute_process(COMMAND "${TCSMPI_EXEC_PATH}/mpiclang++" "--showme:compile" ++ OUTPUT_VARIABLE MPI_CXX_COMPILE_FLAGS) ++ string(REPLACE "\n" "" MPI_CXX_COMPILE_FLAGS "${MPI_CXX_COMPILE_FLAGS}") ++ execute_process(COMMAND "${TCSMPI_EXEC_PATH}/mpiclang++" "--showme:incdirs" ++ OUTPUT_VARIABLE MPI_CXX_INCLUDE_PATH) ++ string(REPLACE "\n" ";" MPI_CXX_INCLUDE_PATH "${MPI_CXX_INCLUDE_PATH}") ++ execute_process(COMMAND "${TCSMPI_EXEC_PATH}/mpiclang++" "--showme:link" ++ OUTPUT_VARIABLE MPI_CXX_LINK_FLAGS) ++ string(REPLACE "\n" "" MPI_CXX_LINK_FLAGS "${MPI_CXX_LINK_FLAGS}") ++ execute_process(COMMAND "${TCSMPI_EXEC_PATH}/mpiclang++" "--showme:libdirs" ++ OUTPUT_VARIABLE MPI_CXX_LIBRARY_DIRS) ++ string(REPLACE "\n" "" MPI_CXX_LIBRARY_DIRS "${MPI_CXX_LIBRARY_DIRS}") ++ string(REPLACE " " ";" MPI_CXX_LIBRARY_DIRS "${MPI_CXX_LIBRARY_DIRS}") ++ foreach(dir IN LISTS MPI_CXX_LIBRARY_DIRS) ++ set(MPI_CXX_LIBRARIES "${dir}/libmpi.so") ++ endforeach() 
++ set(MPI_FOUND ON) ++ set(MPI_C_FOUND ON) ++ set(MPIEXEC "${TCSMPI_EXEC_PATH}/mpiexec") ++ set(MPI_COMPILE_FLAGS ${MPI_CXX_COMPILE_FLAGS}) ++ set(MPI_C_COMPILE_FLAGS ${MPI_CXX_COMPILE_FLAGS}) ++ set(MPI_INCLUDE_PATH ${MPI_CXX_INCLUDE_PATH}) ++ set(MPI_C_INCLUDE_PATH ${MPI_CXX_INCLUDE_PATH}) ++ set(MPI_LINK_FLAGS ${MPI_CXX_LINK_FLAGS}) ++ set(MPI_C_LINK_FLAGS ${MPI_CXX_LINK_FLAGS}) ++ set(MPI_LIBRARIES ${MPI_CXX_LIBRARIES}) ++ set(MPI_C_LIBRARIES ${MPI_CXX_LIBRARIES}) ++ else() ++ message(STATUS "TCS-MPI DISABLED") ++ endif() ++endif() ++if(NOT MPI_FOUND) ++ set(CMAKE_MODULE_PATH_TMP "${CMAKE_MODULE_PATH}") ++ unset(CMAKE_MODULE_PATH) ++ find_package(MPI) ++ set(CMAKE_MODULE_PATH "${CMAKE_MODULE_PATH_TMP}") ++ unset(CMAKE_MODULE_PATH_TMP) ++endif() +diff --git a/scripts/fujitsu/1_python.sh b/scripts/fujitsu/1_python.sh +index 90790698f9..98a02f52c6 100755 +--- a/scripts/fujitsu/1_python.sh ++++ b/scripts/fujitsu/1_python.sh +@@ -39,7 +39,7 @@ source $script_basedir/env.src + + if [ -v fjenv_debug ]; then set -x; fi + +-PYTHON_VER=3.9 ++PYTHON_VER=3.10 + PYTHON_DIR=cpython + + # +@@ -83,7 +83,7 @@ if [ "$fjenv_use_fcc" = "true" ]; then + # TODO: $ORIGIN sometimes parsed as 'RIGIN'. + # perhaps more backslashs are needed to protect $ORIGIN from parsing in shell. + # export LDFLAGS="-Wl,-rpath,\$ORIGIN/../lib" +- export LDFLAGS="-Wl,-rpath,${PREFIX}/lib -Wl,-rpath,${TCSDS_PATH}/lib64 -lpthread" ++ export LDFLAGS="-Wl,-rpath,${PREFIX}/lib -L/usr/lib64" + else + # Ditto. + #export LDFLAGS="-Wl,-rpath,\$ORIGIN/../lib" +@@ -105,7 +105,7 @@ if [ "${fjenv_use_fcc}" = "true" ]; then + # We used to link with '--linkfortran', which turned out to be unnecessary. + # It was used to link with the module solery written by Fortran. + # ${CXX} --linkfortran -SSL2 -Kopenmp -Nlibomp -o python Programs/python.o -L. -lpython$PYTHON_VER $LDFLAGS +- ${CXX} -Kopenmp -Nlibomp -SSL2BLAMP -lfjlapackexsve -o python Programs/python.o -L. 
-lpython$PYTHON_VER $LDFLAGS ++ ${CXX} -fopenmp -L${OpenBLAS_HOME}/lib -lopenblas -lflang_rt.runtime -o python Programs/python.o -L. -lpython$PYTHON_VER $LDFLAGS + fi + + make install +@@ -119,7 +119,7 @@ hash -r + # Note that python 3.9 buildles setuptools 58.1. + #pip3 uninstall -y setuptools + +-pip3 install --upgrade ${PIP3_OPTIONS} 'setuptools>60.6.0' # or setuptools<59.6.0 ++pip3 install --upgrade ${PIP3_OPTIONS} 'setuptools==73.0.1' # or setuptools<59.6.0 + + # Show configuration + +diff --git a/scripts/fujitsu/3_venv.sh b/scripts/fujitsu/3_venv.sh +index b271fdc535..4169a4930d 100755 +--- a/scripts/fujitsu/3_venv.sh ++++ b/scripts/fujitsu/3_venv.sh +@@ -81,9 +81,9 @@ fi + # Workaround is found in: + # See https://stackoverflow.com/questions/70520120/attributeerror-module-setuptools-distutils-has-no-attribute-version + +-pip3 install --upgrade ${PIP3_OPTIONS} 'setuptools>60.6.0' # or setuptools<59.6.0 ++pip3 install --upgrade ${PIP3_OPTIONS} 'setuptools==73.0.1' # or setuptools<59.6.0 + +-pip3 install --upgrade ${PIP3_OPTIONS} pip future six wheel ++pip3 install --upgrade ${PIP3_OPTIONS} pip==25.3 + + pip3 list | tee $script_basedir/pip3_list.txt + +diff --git a/scripts/fujitsu/4_numpy_scipy.sh b/scripts/fujitsu/4_numpy_scipy.sh +index fe07872ab3..02e52f2c7a 100755 +--- a/scripts/fujitsu/4_numpy_scipy.sh ++++ b/scripts/fujitsu/4_numpy_scipy.sh +@@ -41,8 +41,9 @@ if [ -v fjenv_debug ]; then set -x; fi + + NUMPY_VER=v1.22.4 + NUMPY_DIR=numpy +-SCIPY_VER=v1.7.3 ++SCIPY_VER=v1.10.1 + SCIPY_DIR=scipy ++SCIPY_CHERRY_PICK=ab7d08c6148286059f6498ab5c3070268d13cbd9 + + # + # Clean up +@@ -61,17 +62,20 @@ fi + [ -d ${DOWNLOAD_PATH} ] || mkdir -p ${DOWNLOAD_PATH} + cd ${DOWNLOAD_PATH} + +-[ -d $NUMPY_DIR ] || ++if [ ! -d $NUMPY_DIR ]; then + git clone ${GIT_OPTIONS} \ + -b $NUMPY_VER \ + --depth 1 \ + https://github.com/numpy/numpy.git $NUMPY_DIR ++ (cd $NUMPY_DIR && patch -p 1 < $script_basedir/numpy.patch) ++fi + +-[ -d $SCIPY_DIR ] || ++if [ ! 
-d $SCIPY_DIR ]; then
+	git clone ${GIT_OPTIONS} --recursive \
+	    -b $SCIPY_VER \
+-	    --depth 1 \
+	    https://github.com/scipy/scipy.git $SCIPY_DIR
++	(cd $SCIPY_DIR && git cherry-pick ${SCIPY_CHERRY_PICK})
++fi
+ 
+ [ -v fjenv_download ] && fjenv_safe_exit 0
+ 
+@@ -94,7 +98,7 @@ fi
+ 
+ # NumPy maintenance/1.22.x requires Cythone >= 0.29.21
+ # NumPy maintenance/1.22.2 requires Cythone >= 0.29.30
+-pip3 install ${PIP3_OPTIONS} 'Cython>=0.29.30' ||
++pip3 install ${PIP3_OPTIONS} 'Cython>=0.29.33,<3.0' ||
+ pip3 install ${PIP3_OPTIONS} $PIP_PACKAGE_PATH/Cython*.whl
+ 
+ cd $DOWNLOAD_PATH/$NUMPY_DIR
+@@ -107,20 +111,18 @@ fi
+ if [ "$fjenv_use_fcc" = "true" -a ! -f site.cfg ]; then
+ cat <site.cfg
+ [openblas]
+-libraries = fjlapackexsve
+-library_dirs = $TCSDS_PATH/lib64
+-include_dirs = $TCSDS_PATH/include
+-extra_link_args = -SSL2BLAMP
++library_dirs = ${OpenBLAS_HOME}/lib
++include_dirs = ${OpenBLAS_HOME}/include
++extra_link_args = -lopenblas -lflang_rt.runtime
+ 
+ [lapack]
+-lapack_libs = fjlapackexsve
+-library_dirs = $TCSDS_PATH/lib64
+-extra_link_args = -SSL2BLAMP
++library_dirs = ${OpenBLAS_HOME}/lib
++extra_link_args = -lopenblas -lflang_rt.runtime
+ EOF
+ fi
+ 
+ NPY_NUM_BUILD_JOBS=$MAX_JOBS \
+-	python3 setup.py build -j $MAX_JOBS install
++	python3 setup.py build -j $MAX_JOBS config_fc --fcompiler=llvm install
+ 
+ #
+ # Build SciPy
+@@ -130,7 +132,7 @@ NPY_NUM_BUILD_JOBS=$MAX_JOBS \
+ # older than what NumPy is requiring, but running for reference purpose,
+ # such as in case of using older NumPy.
+ +-pip3 install ${PIP3_OPTIONS} 'Cython>=0.29.18' ++pip3 install ${PIP3_OPTIONS} 'Cython>=0.29.33,<3.0' + pip3 install ${PIP3_OPTIONS} pybind11 pythran + + cd $DOWNLOAD_PATH/$SCIPY_DIR +@@ -140,7 +142,7 @@ if [ -v fjenv_rebuild ]; then + fi + + SCIPY_NUM_CYTHONIZE_JOBS=$MAX_JOBS \ +- python3 setup.py build -j $MAX_JOBS --fcompiler=fujitsu install ++ python3 setup.py build -j $MAX_JOBS config_fc --fcompiler=llvm install + + pip3 list | tee $script_basedir/pip3_list.txt + +diff --git a/scripts/fujitsu/5_pytorch.sh b/scripts/fujitsu/5_pytorch.sh +index 8128e584c8..0942df8b4e 100755 +--- a/scripts/fujitsu/5_pytorch.sh ++++ b/scripts/fujitsu/5_pytorch.sh +@@ -40,7 +40,8 @@ source $script_basedir/env.src + + if [ -v fjenv_debug ]; then set -x; fi + +-ONEDNN_VER=v2.7 ++ONEDNN_VER=v3.7.1 ++GOOGLETEST_CHERRY_PICK=fa8438ae6b70c57010177de47a9f13d7041a6328 + + # + # Clean up +@@ -59,8 +60,10 @@ fi + if [ ! -d $PYTORCH_TOP/third_party/ideep/mkl-dnn ]; then + cd $PYTORCH_TOP + git submodule update --init --recursive $GIT_OPTIONS ++ cd $PYTORCH_TOP/third_party/googletest && git cherry-pick ${GOOGLETEST_CHERRY_PICK} ++ cd $PYTORCH_TOP/third_party/tensorpipe && patch -p 1 < $script_basedir/tensorpipe.patch + fi +-cd $PYTORCH_TOP/third_party/ideep/mkl-dnn/third_party/oneDNN ++cd $PYTORCH_TOP/third_party/ideep/mkl-dnn + git checkout $GIT_OPTIONS $ONEDNN_VER + + [ -v fjenv_download ] && fjenv_safe_exit 0 +@@ -96,7 +99,7 @@ fi + + # 'setup.py' in PyTorch ensures that CFLAGS is used for both C and C++ compiler, + # but just in case... 
+-CFLAGS=-O3 CXXFLAGS=-O3 python3 setup.py build -j $MAX_JOBS install ++CFLAGS='-O3 -fopenmp' CXXFLAGS="-O3 -fopenmp" LDFLAGS="-lflang_rt.runtime" python3 setup.py build -j $MAX_JOBS install + + pip3 list | tee $script_basedir/pip3_list.txt + +diff --git a/scripts/fujitsu/6_vision.sh b/scripts/fujitsu/6_vision.sh +index 68034079e4..2b8aa5bc5c 100755 +--- a/scripts/fujitsu/6_vision.sh ++++ b/scripts/fujitsu/6_vision.sh +@@ -42,9 +42,9 @@ if [ -v fjenv_debug ]; then set -x; fi + + JPEG_ARCHIVE_NAME=jpegsrc.v9d + JPEG_DIR=jpeg-9d +-PILLOW_VER=7.2.0 ++PILLOW_VER=8.4.0 + PILLOW_DIR=Pillow +-TORCHVISION_VER=v0.14.1 ++TORCHVISION_VER=v0.24.1 + TORCHVISION_DIR=vision + + # +@@ -89,7 +89,6 @@ if [ ! -d $TORCHVISION_DIR ]; then + git clone ${GIT_OPTIONS} -b $TORCHVISION_VER \ + --depth 1 \ + https://github.com/pytorch/vision.git +- (cd vision; patch -p 1 < $script_basedir/vision.patch) + fi + + [ -v fjenv_download ] && fjenv_safe_exit 0 +@@ -132,7 +131,7 @@ export LDFLAGS="-Wl,-rpath,${PREFIX}/lib" + if [ -v fjenv_rebuild ]; then + python3 setup.py clean + fi +-python3 setup.py install ++pip3 install . --verbose --no-build-isolation + + # + # Install torchvision +@@ -145,7 +144,7 @@ fi + + export TORCHVISION_INCLUDE=$PREFIX/include + export TORCHVISION_LIBRARY=$PREFIX/lib +-CFLAGS="-Kfast" python3 setup.py build -j $MAX_JOBS install ++pip3 install . --verbose --no-build-isolation + + pip3 list | tee $script_basedir/pip3_list.txt + +diff --git a/scripts/fujitsu/7_horovod.sh b/scripts/fujitsu/7_horovod.sh +index 7aa6f302b9..4e0138da0d 100755 +--- a/scripts/fujitsu/7_horovod.sh ++++ b/scripts/fujitsu/7_horovod.sh +@@ -41,6 +41,7 @@ if [ -v fjenv_debug ]; then set -x; fi + + HOROVOD_VER=v0.26.1 + HOROVOD_DIR=horovod ++FLATBUFFERS_CHERRY_PICK=20aad0c41e1252b04c72111c3eb221280a9c2009 + + # + # Clean up +@@ -62,6 +63,7 @@ if [ ! 
-d horovod ]; then + -b $HOROVOD_VER \ + --depth 1 \ + https://github.com/horovod/horovod.git ++ (cd horovod/third_party/flatbuffers && git cherry-pick ${FLATBUFFERS_CHERRY_PICK}) + (cd horovod; patch -p 1 < $script_basedir/horovod.patch) + cp -p horovod/examples/pytorch/pytorch_synthetic_benchmark.py $script_basedir + fi +@@ -81,7 +83,7 @@ fi + # + + if [ "${fjenv_use_fcc}" != "true" ]; then +- echo "$0 works for FCC only for now" ++ echo "$0 works for clang++ only for now" + exit 1 + fi + +diff --git a/scripts/fujitsu/env.src b/scripts/fujitsu/env.src +index 7dd81f6d6c..aee93aaff4 100644 +--- a/scripts/fujitsu/env.src ++++ b/scripts/fujitsu/env.src +@@ -43,10 +43,21 @@ fjenv_src_sourced="Y" + ######################################################################## + ######################################################################## + +-#TCSDS_PATH=/opt/FJSVxtclanga/tcsds-1.2.34 # TCS (FX1000) +-TCSDS_PATH=/opt/FJSVstclanga/cp-1.0.21.01 # CP (FX700) +-VENV_PATH=~/venv +-PREFIX=~/prefix ++module purge ++. /vol0004/apps/oss/spack/share/spack/setup-env.sh ++spack load gcc@12.2.0 arch=linux-rhel8-a64fx ++module load lang/tcsds-1.2.42 ++module load LLVM/llvmorg-21.1.0 ++export OpenBLAS_HOME=/vol0004/apps/r/OSS_CN/llvm/openblas-omp ++export GCC_INSTALL_DIR=/vol0004/apps/oss/spack-v1.0.1/opt/spack/linux-a64fx/gcc-12.2.0-f57uyl2rzc74cow54td7bdy77xajibir/lib/gcc/aarch64-unknown-linux-gnu/12.2.0 ++ ++TCSDS_PATH=/opt/FJSVxtclanga/tcsds-1.2.42 # TCS (FX1000) ++#TCSDS_PATH=/opt/FJSVstclanga/cp-1.0.21.01 # CP (FX700) ++#VENV_PATH=~/venv ++#PREFIX=~/prefix ++ROOT_DIR=$(cd $(dirname ${BASH_SOURCE:-$0})/../../..; pwd) ++VENV_PATH=$ROOT_DIR/venv ++PREFIX=$ROOT_DIR/prefix + + ######################################################################## + ######################################################################## +@@ -63,20 +74,24 @@ PIP_PACKAGE_PATH=${DOWNLOAD_PATH}/pip_packages + # MAX_JOBS should be 40 or less. 
(Note: TCS set this to 50 or 52) + : ${MAX_JOBS:=40} + if [ $MAX_JOBS -gt 40 ]; then MAX_JOBS=40; fi ++export MAX_JOBS=${MAX_JOBS} + + # + # Env for Compilers + # + + if [ "$fjenv_use_fcc" = "true" ]; then +- export CC="fcc -Nclang -Knolargepage" +- export CXX="FCC -Nclang -Knolargepage" ++ export CC="clang --gcc-install-dir=${GCC_INSTALL_DIR}" ++ export CXX="clang++ --gcc-install-dir=${GCC_INSTALL_DIR}" ++ export FC="flang" ++ export F77="flang" ++ export F90="flang" + export LC_ALL=C + fi + + if [ ! -v fjenv_clean -a ! -v fjenv_download ]; then + if [ "$fjenv_use_fcc" = "true" ]; then +- export LD_LIBRARY_PATH=${TCSDS_PATH}/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH} ++ export LD_LIBRARY_PATH=${OpenBLAS_HOME}/lib:${TCSDS_PATH}/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH} + fi + if [ ! -z "$PREFIX" ]; then + PATH=${TCSDS_PATH}/bin:${PREFIX}/bin:${PATH} +diff --git a/scripts/fujitsu/horovod.patch b/scripts/fujitsu/horovod.patch +index ace3ba5866..c0b1f5b63b 100644 +--- a/scripts/fujitsu/horovod.patch ++++ b/scripts/fujitsu/horovod.patch +@@ -1,6 +1,3 @@ +-# +-# patch for v0.26.1 (Oct-14 2022, 34604870eabd9dc670c222deb1da9acc6b9d7c03) +-# + diff --git a/examples/pytorch/pytorch_synthetic_benchmark.py b/examples/pytorch/pytorch_synthetic_benchmark.py + index d645a20..a3c838f 100644 + --- a/examples/pytorch/pytorch_synthetic_benchmark.py +@@ -60,6 +57,24 @@ index d645a20..a3c838f 100644 + loss.backward() + optimizer.step() + ++diff --git a/horovod/torch/CMakeLists.txt b/horovod/torch/CMakeLists.txt ++index eecd198..04816b5 100644 ++--- a/horovod/torch/CMakeLists.txt +++++ b/horovod/torch/CMakeLists.txt ++@@ -63,9 +63,12 @@ endif() ++ parse_version(${Pytorch_VERSION} VERSION_DEC) ++ add_definitions(-DPYTORCH_VERSION=${VERSION_DEC} -DTORCH_API_INCLUDE_EXTENSION_H=1) ++ set(Pytorch_CXX11 ${Pytorch_CXX11} PARENT_SCOPE) ++-if(NOT Pytorch_VERSION VERSION_LESS "1.5.0") +++if(Pytorch_VERSION VERSION_GREATER_EQUAL "1.5.0" AND Pytorch_VERSION VERSION_LESS "2.0.0") ++ 
set(CMAKE_CXX_STANDARD 14) ++ endif() +++if(Pytorch_VERSION VERSION_GREATER_EQUAL "2.0.0") +++ set(CMAKE_CXX_STANDARD 17) +++endif() ++ ++ # PyTorch SOURCES ++ # Later versions of PyTorch that use ROCm's hipify step will rename files. + diff --git a/horovod/torch/mpi_ops.py b/horovod/torch/mpi_ops.py + index ab764c5..b78a108 100644 + --- a/horovod/torch/mpi_ops.py +diff --git a/scripts/fujitsu/vision.patch b/scripts/fujitsu/vision.patch +deleted file mode 100644 +index 3a1b5da138..0000000000 +--- a/scripts/fujitsu/vision.patch ++++ /dev/null +@@ -1,498 +0,0 @@ +-# +-# patch for v0.14.1 (Dec-8 2022, 5e8e2f125f140d1e908cf424a6a85cacad758125) +-# +-diff --git a/setup.py b/setup.py +-index 9519890..4a09c3f 100644 +---- a/setup.py +-+++ b/setup.py +-@@ -209,6 +209,17 @@ def get_extensions(): +- define_macros += [("USE_PYTHON", None)] +- extra_compile_args["cxx"].append("/MP") +- +-+ # As long as torch is utilizing OpenMP, +-+ # FCC requires -fopenmp for all submodules even though it doesn't use OpenMP. 
+-+ if torch.has_openmp: +-+ if sys.platform == 'linux': +-+ try: +-+ extra_compile_args['cxx'].append('-fopenmp') +-+ except KeyError: +-+ extra_compile_args = { +-+ 'cxx': ['-fopenmp'] +-+ } +-+ +- if debug_mode: +- print("Compiling in debug mode") +- extra_compile_args["cxx"].append("-g") +-diff --git a/torchvision/csrc/ops/cpu/nms_kernel.cpp b/torchvision/csrc/ops/cpu/nms_kernel.cpp +-index c54d1f0..369b6a9 100644 +---- a/torchvision/csrc/ops/cpu/nms_kernel.cpp +-+++ b/torchvision/csrc/ops/cpu/nms_kernel.cpp +-@@ -20,13 +20,6 @@ at::Tensor nms_kernel_impl( +- if (dets.numel() == 0) +- return at::empty({0}, dets.options().dtype(at::kLong)); +- +-- auto x1_t = dets.select(1, 0).contiguous(); +-- auto y1_t = dets.select(1, 1).contiguous(); +-- auto x2_t = dets.select(1, 2).contiguous(); +-- auto y2_t = dets.select(1, 3).contiguous(); +-- +-- at::Tensor areas_t = (x2_t - x1_t) * (y2_t - y1_t); +-- +- auto order_t = std::get<1>( +- scores.sort(/*stable=*/true, /*dim=*/0, /* descending=*/true)); +- +-@@ -34,6 +27,15 @@ at::Tensor nms_kernel_impl( +- at::Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte)); +- at::Tensor keep_t = at::zeros({ndets}, dets.options().dtype(at::kLong)); +- +-+ auto dets_sorted = dets.index_select(0, order_t); +-+ +-+ auto x1_t = dets_sorted.select(1, 0).contiguous(); +-+ auto y1_t = dets_sorted.select(1, 1).contiguous(); +-+ auto x2_t = dets_sorted.select(1, 2).contiguous(); +-+ auto y2_t = dets_sorted.select(1, 3).contiguous(); +-+ +-+ at::Tensor areas_t = (x2_t - x1_t) * (y2_t - y1_t); +-+ +- auto suppressed = suppressed_t.data_ptr(); +- auto keep = keep_t.data_ptr(); +- auto order = order_t.data_ptr(); +-@@ -45,19 +47,16 @@ at::Tensor nms_kernel_impl( +- +- int64_t num_to_keep = 0; +- +-- for (int64_t _i = 0; _i < ndets; _i++) { +-- auto i = order[_i]; +-+ for (int64_t i = 0; i < ndets; i++) { +- if (suppressed[i] == 1) +- continue; +-- keep[num_to_keep++] = i; +- auto ix1 = x1[i]; +- auto iy1 = y1[i]; +- auto ix2 = 
x2[i]; +- auto iy2 = y2[i]; +- auto iarea = areas[i]; +- +-- for (int64_t _j = _i + 1; _j < ndets; _j++) { +-- auto j = order[_j]; +-+ for (int64_t j = i + 1; j < ndets; j++) { +- if (suppressed[j] == 1) +- continue; +- auto xx1 = std::max(ix1, x1[j]); +-@@ -73,6 +72,11 @@ at::Tensor nms_kernel_impl( +- suppressed[j] = 1; +- } +- } +-+ for (int64_t i = 0; i < ndets; i++) { +-+ if (suppressed[i] == 1) +-+ continue; +-+ keep[num_to_keep++] = order[i]; +-+ } +- return keep_t.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep); +- } +- +-diff --git a/torchvision/csrc/ops/cpu/roi_align_kernel.cpp b/torchvision/csrc/ops/cpu/roi_align_kernel.cpp +-index e6684e9..d4c6b0e 100644 +---- a/torchvision/csrc/ops/cpu/roi_align_kernel.cpp +-+++ b/torchvision/csrc/ops/cpu/roi_align_kernel.cpp +-@@ -1,4 +1,5 @@ +- #include +-+#include +- #include +- +- #include "./roi_align_common.h" +-@@ -24,87 +25,89 @@ void roi_align_forward_kernel_impl( +- T* output) { +- // (n, c, ph, pw) is an element in the pooled output +- // can be parallelized using omp +-- // #pragma omp parallel for num_threads(32) +-- for (int n = 0; n < n_rois; n++) { +-- int index_n = n * channels * pooled_width * pooled_height; +-- +-- const T* offset_rois = rois + n * 5; +-- int roi_batch_ind = offset_rois[0]; +-- +-- // Do not using rounding; this implementation detail is critical +-- T offset = aligned ? 
(T)0.5 : (T)0.0; +-- T roi_start_w = offset_rois[1] * spatial_scale - offset; +-- T roi_start_h = offset_rois[2] * spatial_scale - offset; +-- T roi_end_w = offset_rois[3] * spatial_scale - offset; +-- T roi_end_h = offset_rois[4] * spatial_scale - offset; +-- +-- T roi_width = roi_end_w - roi_start_w; +-- T roi_height = roi_end_h - roi_start_h; +-- if (!aligned) { +-- // Force malformed ROIs to be 1x1 +-- roi_width = std::max(roi_width, (T)1.); +-- roi_height = std::max(roi_height, (T)1.); +-- } +-- +-- T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); +-- T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); +-- +-- // We use roi_bin_grid to sample the grid and mimic integral +-- int roi_bin_grid_h = (sampling_ratio > 0) +-- ? sampling_ratio +-- : ceil(roi_height / pooled_height); // e.g., = 2 +-- int roi_bin_grid_w = +-- (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); +-- +-- // We do average (integral) pooling inside a bin +-- // When the grid is empty, output zeros. +-- const T count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. 
= 4 +-- +-- // we want to precalculate indices and weights shared by all chanels, +-- // this is the key point of optimization +-- std::vector> pre_calc( +-- roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height); +-- detail::pre_calc_for_bilinear_interpolate( +-- height, +-- width, +-- pooled_height, +-- pooled_width, +-- roi_start_h, +-- roi_start_w, +-- bin_size_h, +-- bin_size_w, +-- roi_bin_grid_h, +-- roi_bin_grid_w, +-- pre_calc); +-- +-- for (int c = 0; c < channels; c++) { +-- int index_n_c = index_n + c * pooled_width * pooled_height; +-- const T* offset_input = +-- input + (roi_batch_ind * channels + c) * height * width; +-- int pre_calc_index = 0; +-- +-- for (int ph = 0; ph < pooled_height; ph++) { +-- for (int pw = 0; pw < pooled_width; pw++) { +-- int index = index_n_c + ph * pooled_width + pw; +-- +-- T output_val = 0.; +-- for (int iy = 0; iy < roi_bin_grid_h; iy++) { +-- for (int ix = 0; ix < roi_bin_grid_w; ix++) { +-- detail::PreCalc pc = pre_calc[pre_calc_index]; +-- output_val += pc.w1 * offset_input[pc.pos1] + +-- pc.w2 * offset_input[pc.pos2] + +-- pc.w3 * offset_input[pc.pos3] + pc.w4 * offset_input[pc.pos4]; +-- +-- pre_calc_index += 1; +-- } +-- } +-- output_val /= count; // Average pooling +-- +-- output[index] = output_val; +-- } // for pw +-- } // for ph +-- } // for c +-- } // for n +-+ int grain_size = ceil(n_rois / at::get_num_threads()); +-+ at::parallel_for(0, n_rois, grain_size, [&](int64_t start, int64_t end) { +-+ for (int n = start; n < end; n++) { +-+ int index_n = n * channels * pooled_width * pooled_height; +-+ +-+ const T* offset_rois = rois + n * 5; +-+ int roi_batch_ind = offset_rois[0]; +-+ +-+ // Do not using rounding; this implementation detail is critical +-+ T offset = aligned ? 
(T)0.5 : (T)0.0; +-+ T roi_start_w = offset_rois[1] * spatial_scale - offset; +-+ T roi_start_h = offset_rois[2] * spatial_scale - offset; +-+ T roi_end_w = offset_rois[3] * spatial_scale - offset; +-+ T roi_end_h = offset_rois[4] * spatial_scale - offset; +-+ +-+ T roi_width = roi_end_w - roi_start_w; +-+ T roi_height = roi_end_h - roi_start_h; +-+ if (!aligned) { +-+ // Force malformed ROIs to be 1x1 +-+ roi_width = std::max(roi_width, (T)1.); +-+ roi_height = std::max(roi_height, (T)1.); +-+ } +-+ +-+ T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); +-+ T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); +-+ +-+ // We use roi_bin_grid to sample the grid and mimic integral +-+ int roi_bin_grid_h = (sampling_ratio > 0) +-+ ? sampling_ratio +-+ : ceil(roi_height / pooled_height); // e.g., = 2 +-+ int roi_bin_grid_w = +-+ (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); +-+ +-+ // We do average (integral) pooling inside a bin +-+ // When the grid is empty, output zeros. +-+ const T count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. 
= 4 +-+ +-+ // we want to precalculate indices and weights shared by all chanels, +-+ // this is the key point of optimization +-+ std::vector> pre_calc( +-+ roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height); +-+ detail::pre_calc_for_bilinear_interpolate( +-+ height, +-+ width, +-+ pooled_height, +-+ pooled_width, +-+ roi_start_h, +-+ roi_start_w, +-+ bin_size_h, +-+ bin_size_w, +-+ roi_bin_grid_h, +-+ roi_bin_grid_w, +-+ pre_calc); +-+ +-+ for (int c = 0; c < channels; c++) { +-+ int index_n_c = index_n + c * pooled_width * pooled_height; +-+ const T* offset_input = +-+ input + (roi_batch_ind * channels + c) * height * width; +-+ int pre_calc_index = 0; +-+ +-+ for (int ph = 0; ph < pooled_height; ph++) { +-+ for (int pw = 0; pw < pooled_width; pw++) { +-+ int index = index_n_c + ph * pooled_width + pw; +-+ +-+ T output_val = 0.; +-+ for (int iy = 0; iy < roi_bin_grid_h; iy++) { +-+ for (int ix = 0; ix < roi_bin_grid_w; ix++) { +-+ detail::PreCalc pc = pre_calc[pre_calc_index]; +-+ output_val += pc.w1 * offset_input[pc.pos1] + +-+ pc.w2 * offset_input[pc.pos2] + +-+ pc.w3 * offset_input[pc.pos3] + pc.w4 * offset_input[pc.pos4]; +-+ +-+ pre_calc_index += 1; +-+ } +-+ } +-+ output_val /= count; // Average pooling +-+ +-+ output[index] = output_val; +-+ } // for pw +-+ } // for ph +-+ } // for c +-+ } // for n +-+ }); +- } +- +- template +-@@ -183,100 +186,105 @@ void roi_align_backward_kernel_impl( +- int pooled_width, +- int sampling_ratio, +- bool aligned, +-- T* grad_input, +-+ const int64_t grad_input_size, +-+ T* grad_input_buffer, +- const T* rois, +- int n_stride, +- int c_stride, +- int h_stride, +- int w_stride) { +-- for (int index = 0; index < nthreads; index++) { +-- // (n, c, ph, pw) is an element in the pooled output +-- int pw = index % pooled_width; +-- int ph = (index / pooled_width) % pooled_height; +-- int c = (index / pooled_width / pooled_height) % channels; +-- int n = index / pooled_width / pooled_height / channels; +-- +-- const 
T* offset_rois = rois + n * 5; +-- int roi_batch_ind = offset_rois[0]; +-- +-- // Do not using rounding; this implementation detail is critical +-- T offset = aligned ? (T)0.5 : (T)0.0; +-- T roi_start_w = offset_rois[1] * spatial_scale - offset; +-- T roi_start_h = offset_rois[2] * spatial_scale - offset; +-- T roi_end_w = offset_rois[3] * spatial_scale - offset; +-- T roi_end_h = offset_rois[4] * spatial_scale - offset; +-- +-- T roi_width = roi_end_w - roi_start_w; +-- T roi_height = roi_end_h - roi_start_h; +-- if (!aligned) { +-- // Force malformed ROIs to be 1x1 +-- roi_width = std::max(roi_width, (T)1.); +-- roi_height = std::max(roi_height, (T)1.); +-- } +-- +-- T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); +-- T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); +-- +-- T* offset_grad_input = +-- grad_input + ((roi_batch_ind * channels + c) * height * width); +-- +-- int output_offset = n * n_stride + c * c_stride; +-- const T* offset_grad_output = grad_output + output_offset; +-- const T grad_output_this_bin = +-- offset_grad_output[ph * h_stride + pw * w_stride]; +-- +-- // We use roi_bin_grid to sample the grid and mimic integral +-- int roi_bin_grid_h = (sampling_ratio > 0) +-- ? sampling_ratio +-- : ceil(roi_height / pooled_height); // e.g., = 2 +-- int roi_bin_grid_w = +-- (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); +-- +-- // We do average (integral) pooling inside a bin +-- const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. 
= 4 +-- +-- for (int iy = 0; iy < roi_bin_grid_h; iy++) { +-- const T y = roi_start_h + ph * bin_size_h + +-- static_cast(iy + .5f) * bin_size_h / +-- static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 +-- for (int ix = 0; ix < roi_bin_grid_w; ix++) { +-- const T x = roi_start_w + pw * bin_size_w + +-- static_cast(ix + .5f) * bin_size_w / +-- static_cast(roi_bin_grid_w); +-- +-- T w1, w2, w3, w4; +-- int x_low, x_high, y_low, y_high; +-- +-- bilinear_interpolate_gradient( +-- height, +-- width, +-- y, +-- x, +-- w1, +-- w2, +-- w3, +-- w4, +-- x_low, +-- x_high, +-- y_low, +-- y_high, +-- index); +-- +-- T g1 = grad_output_this_bin * w1 / count; +-- T g2 = grad_output_this_bin * w2 / count; +-- T g3 = grad_output_this_bin * w3 / count; +-- T g4 = grad_output_this_bin * w4 / count; +-- +-- if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { +-- // atomic add is not needed for now since it is single threaded +-- add(offset_grad_input + y_low * width + x_low, static_cast(g1)); +-- add(offset_grad_input + y_low * width + x_high, static_cast(g2)); +-- add(offset_grad_input + y_high * width + x_low, static_cast(g3)); +-- add(offset_grad_input + y_high * width + x_high, static_cast(g4)); +-- } // if +-- } // ix +-- } // iy +-- } // for +-+ int grain_size = ceil(nthreads / at::get_num_threads()); +-+ at::parallel_for(0, nthreads, grain_size, [&](int64_t start, int64_t end) { +-+ for (int index = start; index < end; index++) { +-+ int thread_no = at::get_thread_num(); +-+ // (n, c, ph, pw) is an element in the pooled output +-+ int pw = index % pooled_width; +-+ int ph = (index / pooled_width) % pooled_height; +-+ int c = (index / pooled_width / pooled_height) % channels; +-+ int n = index / pooled_width / pooled_height / channels; +-+ +-+ const T* offset_rois = rois + n * 5; +-+ int roi_batch_ind = offset_rois[0]; +-+ +-+ // Do not using rounding; this implementation detail is critical +-+ T offset = aligned ? 
(T)0.5 : (T)0.0; +-+ T roi_start_w = offset_rois[1] * spatial_scale - offset; +-+ T roi_start_h = offset_rois[2] * spatial_scale - offset; +-+ T roi_end_w = offset_rois[3] * spatial_scale - offset; +-+ T roi_end_h = offset_rois[4] * spatial_scale - offset; +-+ +-+ T roi_width = roi_end_w - roi_start_w; +-+ T roi_height = roi_end_h - roi_start_h; +-+ if (!aligned) { +-+ // Force malformed ROIs to be 1x1 +-+ roi_width = std::max(roi_width, (T)1.); +-+ roi_height = std::max(roi_height, (T)1.); +-+ } +-+ +-+ T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); +-+ T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); +-+ +-+ T* offset_grad_input = grad_input_buffer + (thread_no * grad_input_size) + +-+ ((roi_batch_ind * channels + c) * height * width); +-+ +-+ int output_offset = n * n_stride + c * c_stride; +-+ const T* offset_grad_output = grad_output + output_offset; +-+ const T grad_output_this_bin = +-+ offset_grad_output[ph * h_stride + pw * w_stride]; +-+ +-+ // We use roi_bin_grid to sample the grid and mimic integral +-+ int roi_bin_grid_h = (sampling_ratio > 0) +-+ ? sampling_ratio +-+ : ceil(roi_height / pooled_height); // e.g., = 2 +-+ int roi_bin_grid_w = +-+ (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); +-+ +-+ // We do average (integral) pooling inside a bin +-+ const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. 
= 4 +-+ +-+ for (int iy = 0; iy < roi_bin_grid_h; iy++) { +-+ const T y = roi_start_h + ph * bin_size_h + +-+ static_cast(iy + .5f) * bin_size_h / +-+ static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 +-+ for (int ix = 0; ix < roi_bin_grid_w; ix++) { +-+ const T x = roi_start_w + pw * bin_size_w + +-+ static_cast(ix + .5f) * bin_size_w / +-+ static_cast(roi_bin_grid_w); +-+ +-+ T w1, w2, w3, w4; +-+ int x_low, x_high, y_low, y_high; +-+ +-+ bilinear_interpolate_gradient( +-+ height, +-+ width, +-+ y, +-+ x, +-+ w1, +-+ w2, +-+ w3, +-+ w4, +-+ x_low, +-+ x_high, +-+ y_low, +-+ y_high, +-+ index); +-+ +-+ T g1 = grad_output_this_bin * w1 / count; +-+ T g2 = grad_output_this_bin * w2 / count; +-+ T g3 = grad_output_this_bin * w3 / count; +-+ T g4 = grad_output_this_bin * w4 / count; +-+ +-+ if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { +-+ // atomic add is not needed for now since it is single threaded +-+ add(offset_grad_input + y_low * width + x_low, static_cast(g1)); +-+ add(offset_grad_input + y_low * width + x_high, static_cast(g2)); +-+ add(offset_grad_input + y_high * width + x_low, static_cast(g3)); +-+ add(offset_grad_input + y_high * width + x_high, static_cast(g4)); +-+ } // if +-+ } // ix +-+ } // iy +-+ } // for +-+ }); +- } +- +- at::Tensor roi_align_forward_kernel( +-@@ -355,6 +363,9 @@ at::Tensor roi_align_backward_kernel( +- return grad_input; +- } +- +-+ const int num_threads = at::get_num_threads(); +-+ at::Tensor grad_input_buffer = +-+ at::zeros({num_threads, batch_size, channels, height, width}, grad.options()); +- // get stride values to ensure indexing into gradients is correct. 
+- int n_stride = grad.stride(0); +- int c_stride = grad.stride(1); +-@@ -375,13 +386,17 @@ at::Tensor roi_align_backward_kernel( +- pooled_width, +- sampling_ratio, +- aligned, +-- grad_input.data_ptr(), +-+ grad_input.numel(), +-+ grad_input_buffer.data_ptr(), +- rois_.data_ptr(), +- n_stride, +- c_stride, +- h_stride, +- w_stride); +- }); +-+ for (int64_t i = 0; i < num_threads; ++i) { +-+ grad_input.add_(grad_input_buffer.select(0, i)); +-+ } +- return grad_input; +- } +- +diff --git a/test/cpp/c10d/CMakeLists.txt b/test/cpp/c10d/CMakeLists.txt +index 285a5dd2a7..387d54835a 100644 +--- a/test/cpp/c10d/CMakeLists.txt ++++ b/test/cpp/c10d/CMakeLists.txt +@@ -20,6 +20,8 @@ function(c10d_add_test test_src) + $ + $ + ) ++ target_include_directories(${test_name} PRIVATE ${MPI_CXX_INCLUDE_PATH}) ++ target_link_libraries(${test_name} PRIVATE ${MPI_CXX_LIBRARIES}) + target_link_libraries(${test_name} PRIVATE + fmt::fmt-header-only + ${ARG_LINK_LIBRARIES} +@@ -83,7 +85,7 @@ if(USE_MPI AND USE_C10D_MPI) + # private headers of libtorch, which in turn include MPI. As a hacky + # alternative to making MPI a public dependency of libtorch, we make it + # a private dependency of the tests as well. 
+- c10d_add_test(ProcessGroupMPITest.cpp LINK_LIBRARIES torch_cpu MPI::MPI_CXX INSTALL_TEST ${INSTALL_TEST}) ++ c10d_add_test(ProcessGroupMPITest.cpp LINK_LIBRARIES torch_cpu ${MPI_CXX_LIBRARIES} INSTALL_TEST ${INSTALL_TEST}) + endif() + + if(LINUX AND USE_GLOO AND USE_C10D_GLOO) +diff --git a/test/cpp/nativert/CMakeLists.txt b/test/cpp/nativert/CMakeLists.txt +index 1b4752ed90..39a0f187bc 100644 +--- a/test/cpp/nativert/CMakeLists.txt ++++ b/test/cpp/nativert/CMakeLists.txt +@@ -62,6 +62,7 @@ set(NATIVERT_TEST_DEPENDENCIES torch gtest_main) + + target_link_libraries(test_nativert PRIVATE ${NATIVERT_TEST_DEPENDENCIES}) + target_link_libraries(test_nativert PRIVATE fmt::fmt-header-only) ++target_link_libraries(test_nativert PRIVATE -ldl) + target_include_directories(test_nativert PRIVATE ${ATen_CPU_INCLUDE}) + + if(USE_CUDA) +diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt +index 1632147f02..3fa6ec87fd 100644 +--- a/torch/CMakeLists.txt ++++ b/torch/CMakeLists.txt +@@ -294,7 +294,8 @@ if(USE_DISTRIBUTED) + endif() + # Same for MPI. + if(USE_MPI) +- list(APPEND TORCH_PYTHON_LINK_LIBRARIES MPI::MPI_CXX) ++ list(APPEND TORCH_PYTHON_LINK_LIBRARIES ${MPI_CXX_LIBRARIES}) ++ list(APPEND TORCH_PYTHON_INCLUDE_DIRECTORIES ${MPI_CXX_INCLUDE_PATH}) + endif() + list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_C10D) + diff --git a/llvm21.1.0/patch/tensorpipe.patch b/llvm21.1.0/patch/tensorpipe.patch new file mode 100644 index 0000000..87a2ba5 --- /dev/null +++ b/llvm21.1.0/patch/tensorpipe.patch @@ -0,0 +1,13 @@ +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 77df76d..bba7a14 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -10,6 +10,8 @@ project(tensorpipe LANGUAGES C CXX) + + set(CMAKE_CXX_STANDARD 17) + ++set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-missing-template-arg-list-after-template-kw") ++ + list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake") + + # Expose build options. 
diff --git a/run/mnist.py b/run/mnist.py new file mode 100644 index 0000000..29d81d6 --- /dev/null +++ b/run/mnist.py @@ -0,0 +1,145 @@ +from __future__ import print_function +import argparse +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +from torchvision import datasets, transforms +from torch.optim.lr_scheduler import StepLR + + +class Net(nn.Module): + def __init__(self): + super(Net, self).__init__() + self.conv1 = nn.Conv2d(1, 32, 3, 1) + self.conv2 = nn.Conv2d(32, 64, 3, 1) + self.dropout1 = nn.Dropout(0.25) + self.dropout2 = nn.Dropout(0.5) + self.fc1 = nn.Linear(9216, 128) + self.fc2 = nn.Linear(128, 10) + + def forward(self, x): + x = self.conv1(x) + x = F.relu(x) + x = self.conv2(x) + x = F.relu(x) + x = F.max_pool2d(x, 2) + x = self.dropout1(x) + x = torch.flatten(x, 1) + x = self.fc1(x) + x = F.relu(x) + x = self.dropout2(x) + x = self.fc2(x) + output = F.log_softmax(x, dim=1) + return output + + +def train(args, model, device, train_loader, optimizer, epoch): + model.train() + for batch_idx, (data, target) in enumerate(train_loader): + data, target = data.to(device), target.to(device) + optimizer.zero_grad() + output = model(data) + loss = F.nll_loss(output, target) + loss.backward() + optimizer.step() + if batch_idx % args.log_interval == 0: + print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( + epoch, batch_idx * len(data), len(train_loader.dataset), + 100. 
* batch_idx / len(train_loader), loss.item())) + if args.dry_run: + break + + +def test(model, device, test_loader): + model.eval() + test_loss = 0 + correct = 0 + with torch.no_grad(): + for data, target in test_loader: + data, target = data.to(device), target.to(device) + output = model(data) + test_loss += F.nll_loss(output, target, reduction='sum').item() # sum up batch loss + pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability + correct += pred.eq(target.view_as(pred)).sum().item() + + test_loss /= len(test_loader.dataset) + + print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format( + test_loss, correct, len(test_loader.dataset), + 100. * correct / len(test_loader.dataset))) + + +def main(): + # Training settings + parser = argparse.ArgumentParser(description='PyTorch MNIST Example') + parser.add_argument('--batch-size', type=int, default=64, metavar='N', + help='input batch size for training (default: 64)') + parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N', + help='input batch size for testing (default: 1000)') + parser.add_argument('--epochs', type=int, default=14, metavar='N', + help='number of epochs to train (default: 14)') + parser.add_argument('--lr', type=float, default=1.0, metavar='LR', + help='learning rate (default: 1.0)') + parser.add_argument('--gamma', type=float, default=0.7, metavar='M', + help='Learning rate step gamma (default: 0.7)') + parser.add_argument('--no-cuda', action='store_true', default=False, + help='disables CUDA training') + parser.add_argument('--no-mps', action='store_true', default=False, + help='disables macOS GPU training') + parser.add_argument('--dry-run', action='store_true', default=False, + help='quickly check a single pass') + parser.add_argument('--seed', type=int, default=1, metavar='S', + help='random seed (default: 1)') + parser.add_argument('--log-interval', type=int, default=10, metavar='N', + help='how many batches to wait 
before logging training status') + parser.add_argument('--save-model', action='store_true', default=False, + help='For Saving the current Model') + args = parser.parse_args() + use_cuda = not args.no_cuda and torch.cuda.is_available() + use_mps = not args.no_mps and torch.backends.mps.is_available() + + torch.manual_seed(args.seed) + + if use_cuda: + device = torch.device("cuda") + elif use_mps: + device = torch.device("mps") + else: + device = torch.device("cpu") + + train_kwargs = {'batch_size': args.batch_size} + test_kwargs = {'batch_size': args.test_batch_size} + if use_cuda: + cuda_kwargs = {'num_workers': 1, + 'pin_memory': True, + 'shuffle': True} + train_kwargs.update(cuda_kwargs) + test_kwargs.update(cuda_kwargs) + + transform=transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.1307,), (0.3081,)) + ]) + dataset1 = datasets.MNIST('../data', train=True, download=True, + transform=transform) + dataset2 = datasets.MNIST('../data', train=False, + transform=transform) + train_loader = torch.utils.data.DataLoader(dataset1,**train_kwargs) + test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs) + + model = Net().to(device) + optimizer = optim.Adadelta(model.parameters(), lr=args.lr) + + scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma) + for epoch in range(1, args.epochs + 1): + train(args, model, device, train_loader, optimizer, epoch) + test(model, device, test_loader) + scheduler.step() + + if args.save_model: + torch.save(model.state_dict(), "mnist_cnn.pt") + + +if __name__ == '__main__': + main() diff --git a/run/run1proc_mnist.sh b/run/run1proc_mnist.sh new file mode 100644 index 0000000..3ff138e --- /dev/null +++ b/run/run1proc_mnist.sh @@ -0,0 +1,36 @@ +#! 
/bin/bash
+#PJM -L "rscunit=rscunit_ft01,rscgrp=small"
+#PJM -L elapse=01:00:00
+#PJM -L "node=1"
+#PJM -x PJM_LLIO_GFSCACHE=/vol0004
+#PJM -j
+#PJM -S
+
+module purge
+
+set -euo pipefail
+
+script_basedir=$(cd $(dirname $0); pwd)
+source $script_basedir/env.src
+[ -v VENV_PATH ] && source $VENV_PATH/bin/activate
+
+set -x
+
+#export OMP_PROC_BIND=false
+export OMP_NUM_THREADS=48
+
+# For oneDNN debug
+# Output debug message (CSV) to stdout.
+# The message begins with 'dnnl_verbose,' which is the first entry in CSV.
+#export DNNL_VERBOSE=1 # 0: (no output), 1: (exec), 2: (1 + cache hit/miss)
+#export DNNL_VERBOSE_TIMESTAMP=1
+
+ulimit -s 8192
+
+if [ ${PMIX_RANK:-0} -eq 0 ]; then
+    env
+    pip3 list
+    KMP_SETTINGS=1 python3 -c "import torch; print(torch.__version__); print(torch.__config__.show()); print(torch.__config__.parallel_info())"
+fi
+
+LD_PRELOAD=$PREFIX/lib/libtcmalloc.so python3 -u mnist.py --epochs 2 --no-cuda --no-mps