From 443838994fd6a69cfb12b2732085e46ccfa314e5 Mon Sep 17 00:00:00 2001 From: itono-yuichiro Date: Thu, 5 Mar 2026 19:03:19 +0900 Subject: [PATCH] Pytorch v2.9.1 --- README.md | 190 ++++- llvm19.1.4/patch/numpy.patch | 102 +++ llvm19.1.4/patch/pytorch.patch | 1133 +++++++++++++++++++++++++++++ llvm19.1.4/patch/tensorpipe.patch | 13 + llvm21.1.0/patch/numpy.patch | 102 +++ llvm21.1.0/patch/pytorch.patch | 997 +++++++++++++++++++++++++ llvm21.1.0/patch/tensorpipe.patch | 13 + run/mnist.py | 145 ++++ run/run1proc_mnist.sh | 36 + 9 files changed, 2730 insertions(+), 1 deletion(-) create mode 100644 llvm19.1.4/patch/numpy.patch create mode 100644 llvm19.1.4/patch/pytorch.patch create mode 100644 llvm19.1.4/patch/tensorpipe.patch create mode 100644 llvm21.1.0/patch/numpy.patch create mode 100644 llvm21.1.0/patch/pytorch.patch create mode 100644 llvm21.1.0/patch/tensorpipe.patch create mode 100644 run/mnist.py create mode 100644 run/run1proc_mnist.sh diff --git a/README.md b/README.md index 11d0608..a30ba12 100644 --- a/README.md +++ b/README.md @@ -1 +1,189 @@ -# rccs-pytorch \ No newline at end of file +# rccs-pytorch + +## はじめに + +本書では、「富岳」におけるAIフレームワークPyTorch v2.9系のビルド手順および標準的なテストデータ(mnist)を用いた動作確認の手順について述べる。 + +## AIプレームワークPyTorchのバージョンアップ + +### PyTorchおよび主要モジュールの版数 + +ビルド対象であるPyTorchおよび主要モジュールの版数を示す。本作業では、Python v3.10、PyTorch v2.9.1、Numpy v1.22.4、Scipy v1.10.1、OneDNN v3.7.1、Horovod v0.26.1を採用することとした。 + +| モジュール名 | 版数 | +| --- | --- | +| Python | v3.10 | +| PyTorch | v2.9.1 | +| Numpy | v1.22.4 | +| Scipy | v1.10.3 | +| oneDNN | v3.7.1 | +|Horovod | v0.26.1 | + +### ビルド環境の整備 + +Pytorch v2.9.1の「富岳」向けビルドでは、富士通Githubで公開されている” 富士通 Supercomputer PRIMEHPC FX1000/FX700 上の PyTorch 構築手順”から入手可能なPytorch v1.13.1向けのビルド用スクリプトを利用する。言語環境としては、「富岳」にインストールされているllvm-v21.1.0を用いた。なお、現行の富士通製コンパイラはPytorch v2.9.1をビルドするために必要なC++言語規格要件を満たさない。 + +#### (1) 富士通GithubからPyTorchをクローンする。 + +``` +$ git clone https://github.com/fujitsu/pytorch.git +``` + +#### (2) 
pytorch/ディレクトリへ移動し、公式PyTorchのリポジトリをリモートとして登録する。

```
$ PYTORCH_TOP=$(cd $(dirname ${BASH_SOURCE:-$0})/pytorch && pwd)
$ PATCH_DIR=$(cd $(dirname ${BASH_SOURCE:-$0})/patch && pwd)
$ cd ${PYTORCH_TOP}
$ git remote add upstream https://github.com/pytorch/pytorch.git
$ git fetch upstream v2.9.1
```

#### (3) 公式v2.9.1をベースに新しいブランチを作成する。

```
$ git checkout -b r2.9.1_for_a64fx FETCH_HEAD
```

#### (4) 富士通PyTorch v1.13.1から、ビルド用スクリプト一式を取り込む。

```
$ git cherry-pick 17afed104f0a2ac47bab78aebf584fb3c578e707
$ git reset --mixed HEAD^
$ git add scripts/fujitsu --all
$ git commit -m "add scripts/fujitsu"
```

#### (5) pytorchに対するパッチを適用し、numpyおよびtensorpipeに対するパッチを所定のディレクトリに置く。
```
$ cd ${PYTORCH_TOP} && patch -p 1 < ${PATCH_DIR}/pytorch.patch
$ cp ${PATCH_DIR}/numpy.patch ${PYTORCH_TOP}/scripts/fujitsu
$ cp ${PATCH_DIR}/tensorpipe.patch ${PYTORCH_TOP}/scripts/fujitsu
```


### ビルド手順
ビルド環境の整備後、計算ノード上にて以下のように実行する。なお、すべてのscriptを実行するのには15時間程度を要する。
```
$ cd ${PYTORCH_TOP}/scripts/fujitsu
$ . ./env.src
$ bash 1_python.sh
$ bash 3_venv.sh
$ bash 4_numpy_scipy.sh
$ bash 5_pytorch.sh
$ bash 6_vision.sh
$ bash 7_horovod.sh
$ bash 8_libtcmalloc.sh
```

ビルド用のスクリプトの実行後に出力されるpip3 list(pip3_list.txt)の内容を示す。
```
Package Version
------------------ ------------------
beniget 0.4.2.post1
build 1.4.0
certifi 2026.1.4
cffi 2.0.0
charset-normalizer 3.4.4
cloudpickle 3.1.2
cmake 4.2.1
Cython 0.29.37
exceptiongroup 1.3.1
expecttest 0.3.0
filelock 3.20.3
fsspec 2026.1.0
gast 0.6.0
horovod 0.26.1
hypothesis 6.151.4
idna 3.11
iniconfig 2.3.0
Jinja2 3.1.6
lintrunner 0.13.0
MarkupSafe 3.0.3
mpmath 1.3.0
networkx 3.4.2
ninja 1.13.0
numpy 1.22.4
optree 0.18.0
packaging 26.0
Pillow 8.4.0
pip 25.3
pluggy 1.6.0
ply 3.11
psutil 7.2.2
pybind11 3.0.1
pycparser 3.0
Pygments 2.19.2
pyproject_hooks 1.2.0
pytest 9.0.2
pythran 0.18.1
PyYAML 6.0.3
requests 2.32.5
SciPy 1.10.1
setuptools 73.0.1
six 1.17.0
sortedcontainers 2.4.0
sympy 1.14.0
tomli 2.4.0
torch 2.9.1a0+gitcdd1b45
torchvision 0.24.1+d801a34
typing_extensions 4.15.0
urllib3 2.6.3
uv 0.9.28
wheel 0.46.3
```

### 標準的なテストデータ(mnist)を用いた動作確認

ビルドしたPyTorch v2.9.1の動作確認では、機械学習の画像認識の学習においてサンプルデータ
としてよく利用される「mnist」を用いた。
mnistを実行するコードは公式PyTorchのgithubのexamplesから入手した。
(https://github.com/pytorch/examples/blob/main/mnist/main.py)
また、mnistのコードを実行するスクリプトにはscripts/fujitsu/run1proc.shを流用した。

#### mnistの実行環境の構築

run/ディレクトリに格納されている以下の2つのファイルをscripts/fujitsu/配下にコピーする。
- mnist.py
- run1proc_mnist.sh

#### mnistの実行
mnistをジョブ実行する。
```
$ cd ${PYTORCH_TOP}/scripts/fujitsu
$ pjsub ./run1proc_mnist.sh
```

以下の出力によりmnistがPyTorch v2.9.1で正常に動作していることを確認した。

```
Train Epoch: 1 [0/60000 (0%)] Loss: 2.329474
Train Epoch: 1 [640/60000 (1%)] Loss: 1.425025
Train Epoch: 1 [1280/60000 (2%)] Loss: 0.797880
Train Epoch: 1 [1920/60000 (3%)] Loss: 0.536055
Train Epoch: 1 [2560/60000 (4%)] Loss: 0.444745
Train Epoch: 1 [3200/60000 (5%)] Loss: 0.262757
 :
Train Epoch: 
1 [56960/60000 (95%)] Loss: 0.050381 +Train Epoch: 1 [57600/60000 (96%)] Loss: 0.137881 +Train Epoch: 1 [58240/60000 (97%)] Loss: 0.006410 +Train Epoch: 1 [58880/60000 (98%)] Loss: 0.003386 +Train Epoch: 1 [59520/60000 (99%)] Loss: 0.002083 + +Test set: Average loss: 0.0497, Accuracy: 9830/10000 (98%) + +Train Epoch: 2 [0/60000 (0%)] Loss: 0.026067 +Train Epoch: 2 [640/60000 (1%)] Loss: 0.045588 +Train Epoch: 2 [1280/60000 (2%)] Loss: 0.069181 +Train Epoch: 2 [1920/60000 (3%)] Loss: 0.178524 +Train Epoch: 2 [2560/60000 (4%)] Loss: 0.084490 +Train Epoch: 2 [3200/60000 (5%)] Loss: 0.047848 + : +Train Epoch: 2 [56960/60000 (95%)] Loss: 0.038513 +Train Epoch: 2 [57600/60000 (96%)] Loss: 0.112719 +Train Epoch: 2 [58240/60000 (97%)] Loss: 0.022632 +Train Epoch: 2 [58880/60000 (98%)] Loss: 0.009396 +Train Epoch: 2 [59520/60000 (99%)] Loss: 0.002736 + +Test set: Average loss: 0.0375, Accuracy: 9877/10000 (99%) +``` diff --git a/llvm19.1.4/patch/numpy.patch b/llvm19.1.4/patch/numpy.patch new file mode 100644 index 0000000..7a5ce4e --- /dev/null +++ b/llvm19.1.4/patch/numpy.patch @@ -0,0 +1,102 @@ +diff --git a/numpy/distutils/fcompiler/__init__.py b/numpy/distutils/fcompiler/__init__.py +index d8dcfa8..ebe0647 100644 +--- a/numpy/distutils/fcompiler/__init__.py ++++ b/numpy/distutils/fcompiler/__init__.py +@@ -745,7 +745,7 @@ def wrap_unlinkable_objects(self, objects, output_dir, extra_dll_dir): + ('cygwin.*', ('gnu', 'intelv', 'absoft', 'compaqv', 'intelev', 'gnu95', 'g95')), + ('linux.*', ('arm', 'gnu95', 'intel', 'lahey', 'pg', 'nv', 'absoft', 'nag', + 'vast', 'compaq', 'intele', 'intelem', 'gnu', 'g95', +- 'pathf95', 'nagfor', 'fujitsu')), ++ 'pathf95', 'nagfor', 'fujitsu', 'llvm')), + ('darwin.*', ('gnu95', 'nag', 'nagfor', 'absoft', 'ibm', 'intel', 'gnu', + 'g95', 'pg')), + ('sunos.*', ('sun', 'gnu', 'gnu95', 'g95')), +diff --git a/numpy/distutils/fcompiler/llvm.py b/numpy/distutils/fcompiler/llvm.py +new file mode 100644 +index 0000000..f3db492 +--- /dev/null ++++ 
b/numpy/distutils/fcompiler/llvm.py +@@ -0,0 +1,71 @@ ++from __future__ import division, absolute_import, print_function ++ ++import sys ++ ++from numpy.distutils.fcompiler import FCompiler, dummy_fortran_file ++from sys import platform ++from os.path import join, dirname, normpath ++ ++compilers = ['LlvmFlangFCompiler'] ++ ++import functools ++ ++class LlvmFlangFCompiler(FCompiler): ++ compiler_type = 'llvm' ++ description = 'LLVM Fortran Compiler' ++ version_pattern = r'\s*flang.*version (?P[\d.-]+).*' ++ ++ possible_executables = ['flang'] ++ ++ executables = { ++ 'version_cmd': ["", "--version"], ++ 'compiler_f77': ["flang", "-fPIC"], ++ 'compiler_fix': ["flang", "-fPIC", "-ffixed-form"], ++ 'compiler_f90': ["flang", "-fPIC"], ++ 'linker_so': ["flang", "-fPIC", "-shared"], ++ 'archiver': ["ar", "-cr"], ++ 'ranlib': None ++ } ++ ++ pic_flags = ["-fPIC", "-DPIC"] ++ c_compiler = 'clang' ++ module_dir_switch = '-module ' # Don't remove ending space! ++ ++ def get_libraries(self): ++ opt = FCompiler.get_libraries(self) ++ return opt ++ ++ @functools.lru_cache(maxsize=128) ++ def get_library_dirs(self): ++ """List of compiler library directories.""" ++ opt = FCompiler.get_library_dirs(self) ++ flang_dir = dirname(self.executables['compiler_f77'][0]) ++ opt.append(normpath(join(flang_dir, '..', 'lib'))) ++ ++ return opt ++ ++ def get_flags(self): ++ return [] ++ ++ def get_flags_free(self): ++ return [] ++ ++ def get_flags_debug(self): ++ return ['-g'] ++ ++ def get_flags_opt(self): ++ return ['-O3'] ++ ++ def get_flags_arch(self): ++ return [] ++ ++ def runtime_library_dir_option(self, dir): ++ return '-Wl,-rpath=%s' % dir ++ ++ ++if __name__ == '__main__': ++ from distutils import log ++ log.set_verbosity(2) ++ from numpy.distutils import customized_fcompiler ++ print(customized_fcompiler(compiler='llvm').get_version()) ++ +diff --git a/numpy/tests/test_public_api.py b/numpy/tests/test_public_api.py +index bb15e10..9369424 100644 +--- 
a/numpy/tests/test_public_api.py ++++ b/numpy/tests/test_public_api.py +@@ -233,6 +233,7 @@ def test_NPY_NO_EXPORT(): + "distutils.fcompiler.sun", + "distutils.fcompiler.vast", + "distutils.fcompiler.fujitsu", ++ "distutils.fcompiler.llvm", + "distutils.from_template", + "distutils.intelccompiler", + "distutils.lib2def", diff --git a/llvm19.1.4/patch/pytorch.patch b/llvm19.1.4/patch/pytorch.patch new file mode 100644 index 0000000..7d9a172 --- /dev/null +++ b/llvm19.1.4/patch/pytorch.patch @@ -0,0 +1,1133 @@ +diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt +index 6ab41b6c84..61a0cd2f9e 100644 +--- a/caffe2/CMakeLists.txt ++++ b/caffe2/CMakeLists.txt +@@ -1549,7 +1549,8 @@ target_link_libraries(torch_cpu PUBLIC ${Caffe2_PUBLIC_DEPENDENCY_LIBS}) + target_link_libraries(torch_cpu PRIVATE ${Caffe2_DEPENDENCY_LIBS}) + target_link_libraries(torch_cpu PRIVATE ${Caffe2_DEPENDENCY_WHOLE_LINK_LIBS}) + if(USE_MPI) +- target_link_libraries(torch_cpu PRIVATE MPI::MPI_CXX) ++ target_link_libraries(torch_cpu PRIVATE ${MPI_CXX_LIBRARIES}) ++ target_include_directories(torch_cpu PRIVATE ${MPI_CXX_INCLUDE_PATH}) + endif() + target_include_directories(torch_cpu INTERFACE $) + target_include_directories(torch_cpu PRIVATE ${Caffe2_CPU_INCLUDE}) +@@ -1727,7 +1728,8 @@ if(BUILD_SHARED_LIBS) + endif() + set_target_properties(torch_global_deps PROPERTIES LINKER_LANGUAGE C) + if(USE_MPI) +- target_link_libraries(torch_global_deps MPI::MPI_CXX) ++ target_link_libraries(torch_global_deps ${MPI_CXX_LIBRARIES}) ++ target_include_directories(torch_global_deps PUBLIC ${MPI_CXX_INCLUDE_PATH}) + endif() + if(CAFFE2_USE_MKL) + target_link_libraries(torch_global_deps caffe2::mkl) +diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake +index ef5c2fd4e9..a266e956aa 100644 +--- a/cmake/Dependencies.cmake ++++ b/cmake/Dependencies.cmake +@@ -162,7 +162,7 @@ else() + set(AT_MKLDNN_ENABLED 0) + set(AT_MKL_ENABLED 0) + endif() +-set_property(CACHE BLAS PROPERTY STRINGS 
"ATLAS;BLIS;Eigen;FLAME;Generic;MKL;OpenBLAS;vecLib;APL") ++set_property(CACHE BLAS PROPERTY STRINGS "ATLAS;BLIS;Eigen;FLAME;Generic;MKL;OpenBLAS;vecLib;APL;SSL2") + message(STATUS "Trying to find preferred BLAS backend of choice: " ${BLAS}) + set(BLAS_CHECK_F2C 0) + +@@ -233,6 +233,20 @@ elseif(BLAS STREQUAL "FlexiBLAS") + include_directories(SYSTEM ${FlexiBLAS_INCLUDE_DIR}) + list(APPEND Caffe2_DEPENDENCY_LIBS ${FlexiBLAS_LIB}) + set(BLAS_CHECK_F2C 1) ++elseif(BLAS STREQUAL "SSL2") ++ if(CMAKE_CXX_COMPILER MATCHES ".*/clang\\+\\+$" ++ AND CMAKE_C_COMPILER MATCHES ".*/clang$") ++ message(STATUS "SSL2 Selected BLAS library") ++ list(APPEND Caffe2_PUBLIC_DEPENDENCY_LIBS "fjlapackexsve.so") ++ set(SSL2_FOUND ON) ++ message(STATUS "set CMAKE_SHARED_LINKER_FLAGS: -SSL2") ++ set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -SSL2") ++ set(WITH_BLAS "ssl2") ++ else() ++ message(STATUS "Not built using clang and clang++.") ++ message(STATUS "CMAKE_C_COMPILER: ${CMAKE_C_COMPILER}") ++ message(STATUS "CMAKE_CXX_COMPILER: ${CMAKE_CXX_COMPILER}") ++ endif() + elseif(BLAS STREQUAL "APL") + find_package(APL REQUIRED) + include_directories(SYSTEM ${APL_INCLUDE_DIR}) +diff --git a/cmake/Modules/FindARM.cmake b/cmake/Modules/FindARM.cmake +index 903025c5c2..a419c1aeed 100644 +--- a/cmake/Modules/FindARM.cmake ++++ b/cmake/Modules/FindARM.cmake +@@ -153,7 +153,7 @@ IF(CMAKE_SYSTEM_NAME MATCHES "Linux") + + # Check for SVE256 vector length + CHECK_COMPILES(CXX "SVE256" "-march=armv8.2-a+sve -msve-vector-bits=256" "${SVE_CODE}") +- CHECK_COMPILES(CXX "ARM_BF16" "-march=armv8.2-a+sve+bf16 -msve-vector-bits=256" "${ARM_BF16_CODE}") ++ #CHECK_COMPILES(CXX "ARM_BF16" "-march=armv8.2-a+sve+bf16 -msve-vector-bits=256" "${ARM_BF16_CODE}") + + # If SVE256 support is not found, set CXX_SVE_FOUND to FALSE and notify the user + if(NOT CXX_SVE256_FOUND) +diff --git a/cmake/Modules/FindBLAS.cmake b/cmake/Modules/FindBLAS.cmake +index b4b158fc49..948f7e99ad 100644 +--- 
a/cmake/Modules/FindBLAS.cmake ++++ b/cmake/Modules/FindBLAS.cmake +@@ -290,6 +290,28 @@ if((NOT BLAS_LIBRARIES) + endif() + endif() + ++# BLAS in SSL2 library? ++if((NOT BLAS_LIBRARIES) ++ AND ((NOT WITH_BLAS) OR (WITH_BLAS STREQUAL "ssl2"))) ++ if(CMAKE_CXX_COMPILER MATCHES ".*/clang\\+\\+$" ++ AND CMAKE_C_COMPILER MATCHES ".*/clang$") ++ check_fortran_libraries( ++ BLAS_LIBRARIES ++ BLAS ++ sgemm ++ "-SSL2" ++ "fjlapackexsve") ++ if (BLAS_LIBRARIES) ++ set(BLAS_INFO "ssl2") ++ set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -SSL2") ++ endif (BLAS_LIBRARIES) ++ else() ++ message(STATUS "Not built using clang and clang++.") ++ message(STATUS "CMAKE_C_COMPILER: ${CMAKE_C_COMPILER}") ++ message(STATUS "CMAKE_CXX_COMPILER: ${CMAKE_CXX_COMPILER}") ++ endif() ++endif() ++ + # Generic BLAS library? + if((NOT BLAS_LIBRARIES) + AND ((NOT WITH_BLAS) OR (WITH_BLAS STREQUAL "generic"))) +diff --git a/cmake/Modules/FindLAPACK.cmake b/cmake/Modules/FindLAPACK.cmake +index 500bec8cef..062610259e 100644 +--- a/cmake/Modules/FindLAPACK.cmake ++++ b/cmake/Modules/FindLAPACK.cmake +@@ -208,6 +208,18 @@ if(BLAS_FOUND) + endif() + endif() + ++ # SSL2 ++ if((NOT LAPACK_INFO) AND (BLAS_INFO STREQUAL "ssl2")) ++ set(CMAKE_REQUIRED_LIBRARIES ${BLAS_LIBRARIES}) ++ check_function_exists("cheev_" SSL2_LAPACK_WORKS) ++ set(CMAKE_REQUIRED_LIBRARIES) ++ if(SSL2_LAPACK_WORKS) ++ SET(LAPACK_INFO "ssl2") ++ else() ++ message(STATUS "Strangely, this SSL2 library does not support Lapack?!") ++ endif() ++ endif() ++ + # Generic LAPACK library? 
+ IF((NOT LAPACK_INFO) AND (BLAS_INFO STREQUAL "generic")) + check_lapack_libraries( +diff --git a/cmake/Modules/FindMPI.cmake b/cmake/Modules/FindMPI.cmake +new file mode 100644 +index 0000000000..8c5274199b +--- /dev/null ++++ b/cmake/Modules/FindMPI.cmake +@@ -0,0 +1,55 @@ ++if(CMAKE_C_COMPILER MATCHES ".*/clang$" AND ++ CMAKE_CXX_COMPILER MATCHES ".*/clang\\+\\+$") ++ if(DEFINED ENV{MPI_HOME}) ++ set(TCSMPI_EXEC_PATH "$ENV{MPI_HOME}/bin") ++ else() ++ string(REGEX REPLACE "/clang\\+\\+$" "" CMAKE_CXX_COMPILER_DIR "${CMAKE_CXX_COMPILER}") ++ set(TCSMPI_EXEC_PATH "${CMAKE_CXX_COMPILER_DIR}") ++ endif() ++ execute_process(COMMAND "${TCSMPI_EXEC_PATH}/mpiclang++" "--show" ++ RESULT_VARIABLE MPIFCC_EXEC_RESULT ++ OUTPUT_QUIET ++ ERROR_QUIET) ++ if(MPIFCC_EXEC_RESULT EQUAL 0) ++ message(STATUS "TCS-MPI ENABLED") ++ set(MPI_CXX_FOUND ON) ++ execute_process(COMMAND "${TCSMPI_EXEC_PATH}/mpiclang++" "--showme:compile" ++ OUTPUT_VARIABLE MPI_CXX_COMPILE_FLAGS) ++ string(REPLACE "\n" "" MPI_CXX_COMPILE_FLAGS "${MPI_CXX_COMPILE_FLAGS}") ++ execute_process(COMMAND "${TCSMPI_EXEC_PATH}/mpiclang++" "--showme:incdirs" ++ OUTPUT_VARIABLE MPI_CXX_INCLUDE_PATH) ++ string(REPLACE "\n" ";" MPI_CXX_INCLUDE_PATH "${MPI_CXX_INCLUDE_PATH}") ++ execute_process(COMMAND "${TCSMPI_EXEC_PATH}/mpiclang++" "--showme:link" ++ OUTPUT_VARIABLE MPI_CXX_LINK_FLAGS) ++ string(REPLACE "\n" "" MPI_CXX_LINK_FLAGS "${MPI_CXX_LINK_FLAGS}") ++ execute_process(COMMAND "${TCSMPI_EXEC_PATH}/mpiclang++" "--showme:libdirs" ++ OUTPUT_VARIABLE MPI_CXX_LIBRARY_DIRS) ++ string(REPLACE "\n" "" MPI_CXX_LIBRARY_DIRS "${MPI_CXX_LIBRARY_DIRS}") ++ string(REPLACE " " ";" MPI_CXX_LIBRARY_DIRS "${MPI_CXX_LIBRARY_DIRS}") ++ foreach(dir IN LISTS MPI_CXX_LIBRARY_DIRS) ++ if(dir MATCHES "llvm-v19\\.1\\.4" OR dir MATCHES "llvm-v19\\.1\\.0" OR dir MATCHES "llvm-v17\\.0\\.2") ++ set(MPI_CXX_LIBRARIES "${dir}/libmpi.so") ++ endif() ++ endforeach() ++ set(MPI_FOUND ON) ++ set(MPI_C_FOUND ON) ++ set(MPIEXEC 
"${TCSMPI_EXEC_PATH}/mpiexec") ++ set(MPI_COMPILE_FLAGS ${MPI_CXX_COMPILE_FLAGS}) ++ set(MPI_C_COMPILE_FLAGS ${MPI_CXX_COMPILE_FLAGS}) ++ set(MPI_INCLUDE_PATH ${MPI_CXX_INCLUDE_PATH}) ++ set(MPI_C_INCLUDE_PATH ${MPI_CXX_INCLUDE_PATH}) ++ set(MPI_LINK_FLAGS ${MPI_CXX_LINK_FLAGS}) ++ set(MPI_C_LINK_FLAGS ${MPI_CXX_LINK_FLAGS}) ++ set(MPI_LIBRARIES ${MPI_CXX_LIBRARIES}) ++ set(MPI_C_LIBRARIES ${MPI_CXX_LIBRARIES}) ++ else() ++ message(STATUS "TCS-MPI DISABLED") ++ endif() ++endif() ++if(NOT MPI_FOUND) ++ set(CMAKE_MODULE_PATH_TMP "${CMAKE_MODULE_PATH}") ++ unset(CMAKE_MODULE_PATH) ++ find_package(MPI) ++ set(CMAKE_MODULE_PATH "${CMAKE_MODULE_PATH_TMP}") ++ unset(CMAKE_MODULE_PATH_TMP) ++endif() +diff --git a/cmake/Modules/FindOpenMP.cmake b/cmake/Modules/FindOpenMP.cmake +index 8a9abff398..6e82b2964c 100644 +--- a/cmake/Modules/FindOpenMP.cmake ++++ b/cmake/Modules/FindOpenMP.cmake +@@ -98,6 +98,9 @@ function(_OPENMP_FLAG_CANDIDATES LANG) + # regular clang flags + set(OMP_FLAG_Clang "-fopenmp=libomp" "-fopenmp=libiomp5" "-fopenmp") + endif() ++ if(BLAS STREQUAL "SSL2") ++ set(OMP_FLAG_Clang "-Kopenmp") ++ endif() + + if(WIN32) + # Prefer Intel OpenMP header which can be provided by CMAKE_INCLUDE_PATH. +@@ -259,6 +262,29 @@ function(_OPENMP_GET_FLAGS LANG FLAG_MODE OPENMP_FLAG_VAR OPENMP_LIB_NAMES_VAR) + find_package(MKL QUIET) + unset(IN_FIND_OMP CACHE) + endif() ++ if(MKL_FOUND AND (NOT "${MKL_OPENMP_LIBRARY}" STREQUAL "")) ++ # If we already link OpenMP via MKL, use that. Otherwise at run-time ++ # OpenMP will complain about being initialized twice (OMP: Error #15), ++ # can may cause incorrect behavior. 
++ set(OpenMP_libomp_LIBRARY "${MKL_OPENMP_LIBRARY}" CACHE STRING "libomp location for OpenMP") ++ elseif(BLAS STREQUAL "SSL2") ++ try_compile( OpenMP_COMPILE_RESULT_${FLAG_MODE}_${OPENMP_PLAIN_FLAG} ${CMAKE_BINARY_DIR} ${_OPENMP_TEST_SRC} ++ CMAKE_FLAGS "-DCOMPILE_DEFINITIONS:STRING=${OPENMP_FLAGS_TEST}" ++ OUTPUT_VARIABLE OpenMP_TRY_COMPILE_OUTPUT ++ ) ++ if(OpenMP_COMPILE_RESULT_${FLAG_MODE}_${OPENMP_PLAIN_FLAG}) ++ set("${OPENMP_FLAG_VAR}" "${OPENMP_FLAG}" PARENT_SCOPE) ++ set("${OPENMP_LIB_NAMES_VAR}" "" PARENT_SCOPE) ++ break() ++ endif() ++ else() ++ find_library(OpenMP_libomp_LIBRARY ++ NAMES omp gomp iomp5 ++ HINTS ${CMAKE_${LANG}_IMPLICIT_LINK_DIRECTORIES} ++ DOC "libomp location for OpenMP" ++ ) ++ endif() ++ mark_as_advanced(OpenMP_libomp_LIBRARY) + + if(MKL_OPENMP_LIBRARY) + # If we already link OpenMP via MKL, use that. Otherwise at run-time +diff --git a/scripts/fujitsu/1_python.sh b/scripts/fujitsu/1_python.sh +index 90790698f9..0c9f828549 100755 +--- a/scripts/fujitsu/1_python.sh ++++ b/scripts/fujitsu/1_python.sh +@@ -39,7 +39,7 @@ source $script_basedir/env.src + + if [ -v fjenv_debug ]; then set -x; fi + +-PYTHON_VER=3.9 ++PYTHON_VER=3.10 + PYTHON_DIR=cpython + + # +@@ -83,7 +83,7 @@ if [ "$fjenv_use_fcc" = "true" ]; then + # TODO: $ORIGIN sometimes parsed as 'RIGIN'. + # perhaps more backslashs are needed to protect $ORIGIN from parsing in shell. + # export LDFLAGS="-Wl,-rpath,\$ORIGIN/../lib" +- export LDFLAGS="-Wl,-rpath,${PREFIX}/lib -Wl,-rpath,${TCSDS_PATH}/lib64 -lpthread" ++ export LDFLAGS="-Wl,-rpath,${PREFIX}/lib -L/usr/lib64" + else + # Ditto. + #export LDFLAGS="-Wl,-rpath,\$ORIGIN/../lib" +@@ -105,7 +105,7 @@ if [ "${fjenv_use_fcc}" = "true" ]; then + # We used to link with '--linkfortran', which turned out to be unnecessary. + # It was used to link with the module solery written by Fortran. + # ${CXX} --linkfortran -SSL2 -Kopenmp -Nlibomp -o python Programs/python.o -L. 
-lpython$PYTHON_VER $LDFLAGS +- ${CXX} -Kopenmp -Nlibomp -SSL2BLAMP -lfjlapackexsve -o python Programs/python.o -L. -lpython$PYTHON_VER $LDFLAGS ++ ${CXX} -Kopenmp -SSL2BLAMP -lfjlapackexsve -o python Programs/python.o -L. -lpython$PYTHON_VER $LDFLAGS + fi + + make install +@@ -119,7 +119,7 @@ hash -r + # Note that python 3.9 buildles setuptools 58.1. + #pip3 uninstall -y setuptools + +-pip3 install --upgrade ${PIP3_OPTIONS} 'setuptools>60.6.0' # or setuptools<59.6.0 ++pip3 install --upgrade ${PIP3_OPTIONS} 'setuptools==73.0.1' # or setuptools<59.6.0 + + # Show configuration + +diff --git a/scripts/fujitsu/3_venv.sh b/scripts/fujitsu/3_venv.sh +index b271fdc535..4169a4930d 100755 +--- a/scripts/fujitsu/3_venv.sh ++++ b/scripts/fujitsu/3_venv.sh +@@ -81,9 +81,9 @@ fi + # Workaround is found in: + # See https://stackoverflow.com/questions/70520120/attributeerror-module-setuptools-distutils-has-no-attribute-version + +-pip3 install --upgrade ${PIP3_OPTIONS} 'setuptools>60.6.0' # or setuptools<59.6.0 ++pip3 install --upgrade ${PIP3_OPTIONS} 'setuptools==73.0.1' # or setuptools<59.6.0 + +-pip3 install --upgrade ${PIP3_OPTIONS} pip future six wheel ++pip3 install --upgrade ${PIP3_OPTIONS} pip==25.3 + + pip3 list | tee $script_basedir/pip3_list.txt + +diff --git a/scripts/fujitsu/4_numpy_scipy.sh b/scripts/fujitsu/4_numpy_scipy.sh +index fe07872ab3..6a50b7f880 100755 +--- a/scripts/fujitsu/4_numpy_scipy.sh ++++ b/scripts/fujitsu/4_numpy_scipy.sh +@@ -41,8 +41,12 @@ if [ -v fjenv_debug ]; then set -x; fi + + NUMPY_VER=v1.22.4 + NUMPY_DIR=numpy +-SCIPY_VER=v1.7.3 ++SCIPY_VER=v1.10.1 + SCIPY_DIR=scipy ++SCIPY_CHERRY_PICK=ab7d08c6148286059f6498ab5c3070268d13cbd9 ++export NPY_BLAS_ORDER=openblas ++export NPY_LAPACK_ORDER=openblas ++ + + # + # Clean up +@@ -61,17 +65,20 @@ fi + [ -d ${DOWNLOAD_PATH} ] || mkdir -p ${DOWNLOAD_PATH} + cd ${DOWNLOAD_PATH} + +-[ -d $NUMPY_DIR ] || ++if [ ! 
-d $NUMPY_DIR ]; then + git clone ${GIT_OPTIONS} \ + -b $NUMPY_VER \ + --depth 1 \ + https://github.com/numpy/numpy.git $NUMPY_DIR ++ (cd $NUMPY_DIR && patch -p 1 < $script_basedir/numpy.patch) ++fi + +-[ -d $SCIPY_DIR ] || ++if [ ! -d $SCIPY_DIR ]; then + git clone ${GIT_OPTIONS} --recursive \ + -b $SCIPY_VER \ +- --depth 1 \ + https://github.com/scipy/scipy.git $SCIPY_DIR ++ (cd $SCIPY_DIR && git cherry-pick ${SCIPY_CHERRY_PICK}) ++fi + + [ -v fjenv_download ] && fjenv_safe_exit 0 + +@@ -94,7 +101,7 @@ fi + + # NumPy maintenance/1.22.x requires Cythone >= 0.29.21 + # NumPy maintenance/1.22.2 requires Cythone >= 0.29.30 +-pip3 install ${PIP3_OPTIONS} 'Cython>=0.29.30' || ++pip3 install ${PIP3_OPTIONS} 'Cython>=0.29.33,<3.0' || + pip3 install ${PIP3_OPTIONS} $PIP_PACKAGE_PATH/Cython*.whl + + cd $DOWNLOAD_PATH/$NUMPY_DIR +@@ -108,19 +115,19 @@ if [ "$fjenv_use_fcc" = "true" -a ! -f site.cfg ]; then + cat <site.cfg + [openblas] + libraries = fjlapackexsve +-library_dirs = $TCSDS_PATH/lib64 +-include_dirs = $TCSDS_PATH/include ++library_dirs = ${SSL2_ROOT}/lib64 ++include_dirs = ${SSL2_ROOT}/include + extra_link_args = -SSL2BLAMP + + [lapack] + lapack_libs = fjlapackexsve +-library_dirs = $TCSDS_PATH/lib64 ++library_dirs = ${SSL2_ROOT}/lib64 + extra_link_args = -SSL2BLAMP + EOF + fi + + NPY_NUM_BUILD_JOBS=$MAX_JOBS \ +- python3 setup.py build -j $MAX_JOBS install ++ python3 setup.py build -j $MAX_JOBS config_fc --fcompiler=llvm install + + # + # Build SciPy +@@ -130,7 +137,7 @@ NPY_NUM_BUILD_JOBS=$MAX_JOBS \ + # older than what NumPy is requiring, but running for reference purpose, + # such as in case of using older NumPy. 
+ +-pip3 install ${PIP3_OPTIONS} 'Cython>=0.29.18' ++pip3 install ${PIP3_OPTIONS} 'Cython>=0.29.33,<3.0' + pip3 install ${PIP3_OPTIONS} pybind11 pythran + + cd $DOWNLOAD_PATH/$SCIPY_DIR +@@ -140,7 +147,7 @@ if [ -v fjenv_rebuild ]; then + fi + + SCIPY_NUM_CYTHONIZE_JOBS=$MAX_JOBS \ +- python3 setup.py build -j $MAX_JOBS --fcompiler=fujitsu install ++ python3 setup.py build -j $MAX_JOBS config_fc --fcompiler=llvm install + + pip3 list | tee $script_basedir/pip3_list.txt + +diff --git a/scripts/fujitsu/5_pytorch.sh b/scripts/fujitsu/5_pytorch.sh +index 8128e584c8..ce820f847d 100755 +--- a/scripts/fujitsu/5_pytorch.sh ++++ b/scripts/fujitsu/5_pytorch.sh +@@ -40,7 +40,7 @@ source $script_basedir/env.src + + if [ -v fjenv_debug ]; then set -x; fi + +-ONEDNN_VER=v2.7 ++ONEDNN_VER=v3.7.1 + + # + # Clean up +@@ -59,8 +59,9 @@ fi + if [ ! -d $PYTORCH_TOP/third_party/ideep/mkl-dnn ]; then + cd $PYTORCH_TOP + git submodule update --init --recursive $GIT_OPTIONS ++ cd $PYTORCH_TOP/third_party/tensorpipe && patch -p 1 < $script_basedir/tensorpipe.patch + fi +-cd $PYTORCH_TOP/third_party/ideep/mkl-dnn/third_party/oneDNN ++cd $PYTORCH_TOP/third_party/ideep/mkl-dnn + git checkout $GIT_OPTIONS $ONEDNN_VER + + [ -v fjenv_download ] && fjenv_safe_exit 0 +@@ -96,7 +97,7 @@ fi + + # 'setup.py' in PyTorch ensures that CFLAGS is used for both C and C++ compiler, + # but just in case... 
+-CFLAGS=-O3 CXXFLAGS=-O3 python3 setup.py build -j $MAX_JOBS install ++BLAS=SSL2 CFLAGS='-O3 -Kopenmp' CXXFLAGS="-O3 -Kopenmp" python3 setup.py build -j $MAX_JOBS install + + pip3 list | tee $script_basedir/pip3_list.txt + +diff --git a/scripts/fujitsu/6_vision.sh b/scripts/fujitsu/6_vision.sh +index 68034079e4..19378d157f 100755 +--- a/scripts/fujitsu/6_vision.sh ++++ b/scripts/fujitsu/6_vision.sh +@@ -42,9 +42,9 @@ if [ -v fjenv_debug ]; then set -x; fi + + JPEG_ARCHIVE_NAME=jpegsrc.v9d + JPEG_DIR=jpeg-9d +-PILLOW_VER=7.2.0 ++PILLOW_VER=8.4.0 + PILLOW_DIR=Pillow +-TORCHVISION_VER=v0.14.1 ++TORCHVISION_VER=v0.24.1 + TORCHVISION_DIR=vision + + # +@@ -132,7 +132,7 @@ export LDFLAGS="-Wl,-rpath,${PREFIX}/lib" + if [ -v fjenv_rebuild ]; then + python3 setup.py clean + fi +-python3 setup.py install ++pip3 install . --verbose --no-build-isolation + + # + # Install torchvision +@@ -145,7 +145,7 @@ fi + + export TORCHVISION_INCLUDE=$PREFIX/include + export TORCHVISION_LIBRARY=$PREFIX/lib +-CFLAGS="-Kfast" python3 setup.py build -j $MAX_JOBS install ++pip3 install . --verbose --no-build-isolation + + pip3 list | tee $script_basedir/pip3_list.txt + +diff --git a/scripts/fujitsu/7_horovod.sh b/scripts/fujitsu/7_horovod.sh +index 7aa6f302b9..4e0138da0d 100755 +--- a/scripts/fujitsu/7_horovod.sh ++++ b/scripts/fujitsu/7_horovod.sh +@@ -41,6 +41,7 @@ if [ -v fjenv_debug ]; then set -x; fi + + HOROVOD_VER=v0.26.1 + HOROVOD_DIR=horovod ++FLATBUFFERS_CHERRY_PICK=20aad0c41e1252b04c72111c3eb221280a9c2009 + + # + # Clean up +@@ -62,6 +63,7 @@ if [ ! 
-d horovod ]; then + -b $HOROVOD_VER \ + --depth 1 \ + https://github.com/horovod/horovod.git ++ (cd horovod/third_party/flatbuffers && git cherry-pick ${FLATBUFFERS_CHERRY_PICK}) + (cd horovod; patch -p 1 < $script_basedir/horovod.patch) + cp -p horovod/examples/pytorch/pytorch_synthetic_benchmark.py $script_basedir + fi +@@ -81,7 +83,7 @@ fi + # + + if [ "${fjenv_use_fcc}" != "true" ]; then +- echo "$0 works for FCC only for now" ++ echo "$0 works for clang++ only for now" + exit 1 + fi + +diff --git a/scripts/fujitsu/env.src b/scripts/fujitsu/env.src +index 7dd81f6d6c..47b2add059 100644 +--- a/scripts/fujitsu/env.src ++++ b/scripts/fujitsu/env.src +@@ -43,10 +43,19 @@ fjenv_src_sourced="Y" + ######################################################################## + ######################################################################## + +-#TCSDS_PATH=/opt/FJSVxtclanga/tcsds-1.2.34 # TCS (FX1000) +-TCSDS_PATH=/opt/FJSVstclanga/cp-1.0.21.01 # CP (FX700) +-VENV_PATH=~/venv +-PREFIX=~/prefix ++module purge ++module load lang/tcsds-1.2.42 ++. /vol0004/apps/oss/llvm-v19.1.4/init.sh ++export SSL2_ROOT=/vol0004/apps/oss/llvm-v19.1.4/compute_node/ssl2 ++export MPI_HOME=/vol0004/apps/oss/llvm-v19.1.4/compute_node ++ ++TCSDS_PATH=/opt/FJSVxtclanga/tcsds-1.2.42 # TCS (FX1000) ++#TCSDS_PATH=/opt/FJSVstclanga/cp-1.0.21.01 # CP (FX700) ++#VENV_PATH=~/venv ++#PREFIX=~/prefix ++ROOT_DIR=$(cd $(dirname ${BASH_SOURCE:-$0})/../../..; pwd) ++VENV_PATH=$ROOT_DIR/venv ++PREFIX=$ROOT_DIR/prefix + + ######################################################################## + ######################################################################## +@@ -63,14 +72,18 @@ PIP_PACKAGE_PATH=${DOWNLOAD_PATH}/pip_packages + # MAX_JOBS should be 40 or less. 
(Note: TCS set this to 50 or 52) + : ${MAX_JOBS:=40} + if [ $MAX_JOBS -gt 40 ]; then MAX_JOBS=40; fi ++export MAX_JOBS=${MAX_JOBS} + + # + # Env for Compilers + # + + if [ "$fjenv_use_fcc" = "true" ]; then +- export CC="fcc -Nclang -Knolargepage" +- export CXX="FCC -Nclang -Knolargepage" ++ export CC="clang" ++ export CXX="clang++" ++ export FC="flang" ++ export F77="flang" ++ export F90="flang" + export LC_ALL=C + fi + +diff --git a/scripts/fujitsu/horovod.patch b/scripts/fujitsu/horovod.patch +index ace3ba5866..c0b1f5b63b 100644 +--- a/scripts/fujitsu/horovod.patch ++++ b/scripts/fujitsu/horovod.patch +@@ -1,6 +1,3 @@ +-# +-# patch for v0.26.1 (Oct-14 2022, 34604870eabd9dc670c222deb1da9acc6b9d7c03) +-# + diff --git a/examples/pytorch/pytorch_synthetic_benchmark.py b/examples/pytorch/pytorch_synthetic_benchmark.py + index d645a20..a3c838f 100644 + --- a/examples/pytorch/pytorch_synthetic_benchmark.py +@@ -60,6 +57,24 @@ index d645a20..a3c838f 100644 + loss.backward() + optimizer.step() + ++diff --git a/horovod/torch/CMakeLists.txt b/horovod/torch/CMakeLists.txt ++index eecd198..04816b5 100644 ++--- a/horovod/torch/CMakeLists.txt +++++ b/horovod/torch/CMakeLists.txt ++@@ -63,9 +63,12 @@ endif() ++ parse_version(${Pytorch_VERSION} VERSION_DEC) ++ add_definitions(-DPYTORCH_VERSION=${VERSION_DEC} -DTORCH_API_INCLUDE_EXTENSION_H=1) ++ set(Pytorch_CXX11 ${Pytorch_CXX11} PARENT_SCOPE) ++-if(NOT Pytorch_VERSION VERSION_LESS "1.5.0") +++if(Pytorch_VERSION VERSION_GREATER_EQUAL "1.5.0" AND Pytorch_VERSION VERSION_LESS "2.0.0") ++ set(CMAKE_CXX_STANDARD 14) ++ endif() +++if(Pytorch_VERSION VERSION_GREATER_EQUAL "2.0.0") +++ set(CMAKE_CXX_STANDARD 17) +++endif() ++ ++ # PyTorch SOURCES ++ # Later versions of PyTorch that use ROCm's hipify step will rename files. 
+ diff --git a/horovod/torch/mpi_ops.py b/horovod/torch/mpi_ops.py + index ab764c5..b78a108 100644 + --- a/horovod/torch/mpi_ops.py +diff --git a/scripts/fujitsu/vision.patch b/scripts/fujitsu/vision.patch +index 3a1b5da138..79c0525627 100644 +--- a/scripts/fujitsu/vision.patch ++++ b/scripts/fujitsu/vision.patch +@@ -1,498 +1,42 @@ +-# +-# patch for v0.14.1 (Dec-8 2022, 5e8e2f125f140d1e908cf424a6a85cacad758125) +-# + diff --git a/setup.py b/setup.py +-index 9519890..4a09c3f 100644 ++index c3ba164..57ad652 100644 + --- a/setup.py + +++ b/setup.py +-@@ -209,6 +209,17 @@ def get_extensions(): +- define_macros += [("USE_PYTHON", None)] +- extra_compile_args["cxx"].append("/MP") ++@@ -14,6 +14,7 @@ import torch ++ from pkg_resources import DistributionNotFound, get_distribution, parse_version ++ from setuptools import find_packages, setup ++ from torch.utils.cpp_extension import BuildExtension, CppExtension, CUDA_HOME, CUDAExtension, ROCM_HOME +++import re ++ ++ FORCE_CUDA = os.getenv("FORCE_CUDA", "0") == "1" ++ FORCE_MPS = os.getenv("FORCE_MPS", "0") == "1" ++@@ -140,6 +141,29 @@ def get_macros_and_flags(): ++ if sysconfig.get_config_var("Py_GIL_DISABLED"): ++ extra_compile_args["cxx"].append("-DPy_GIL_DISABLED") + + + # As long as torch is utilizing OpenMP, + + # FCC requires -fopenmp for all submodules even though it doesn't use OpenMP. 
+ + if torch.has_openmp: + + if sys.platform == 'linux': +-+ try: +-+ extra_compile_args['cxx'].append('-fopenmp') +-+ except KeyError: +-+ extra_compile_args = { +-+ 'cxx': ['-fopenmp'] +-+ } +-+ +- if debug_mode: +- print("Compiling in debug mode") +++ config_output = torch.__config__.show() +++ cxx_flags_match = re.search(r'CXX_FLAGS=(.*)', config_output) +++ if cxx_flags_match: +++ cxx_flags = cxx_flags_match.group(1) +++ if re.search(r'-fopenmp', cxx_flags): +++ try: +++ extra_compile_args['cxx'].append('-fopenmp') +++ except KeyError: +++ extra_compile_args = { +++ 'cxx': ['-fopenmp'] +++ } +++ if re.search(r'-Kopenmp', cxx_flags): +++ try: +++ extra_compile_args['cxx'].append('-Kopenmp') +++ except KeyError: +++ extra_compile_args = { +++ 'cxx': ['-Kopenmp'] +++ } +++ ++ if DEBUG: + extra_compile_args["cxx"].append("-g") +-diff --git a/torchvision/csrc/ops/cpu/nms_kernel.cpp b/torchvision/csrc/ops/cpu/nms_kernel.cpp +-index c54d1f0..369b6a9 100644 +---- a/torchvision/csrc/ops/cpu/nms_kernel.cpp +-+++ b/torchvision/csrc/ops/cpu/nms_kernel.cpp +-@@ -20,13 +20,6 @@ at::Tensor nms_kernel_impl( +- if (dets.numel() == 0) +- return at::empty({0}, dets.options().dtype(at::kLong)); +- +-- auto x1_t = dets.select(1, 0).contiguous(); +-- auto y1_t = dets.select(1, 1).contiguous(); +-- auto x2_t = dets.select(1, 2).contiguous(); +-- auto y2_t = dets.select(1, 3).contiguous(); +-- +-- at::Tensor areas_t = (x2_t - x1_t) * (y2_t - y1_t); +-- +- auto order_t = std::get<1>( +- scores.sort(/*stable=*/true, /*dim=*/0, /* descending=*/true)); +- +-@@ -34,6 +27,15 @@ at::Tensor nms_kernel_impl( +- at::Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte)); +- at::Tensor keep_t = at::zeros({ndets}, dets.options().dtype(at::kLong)); +- +-+ auto dets_sorted = dets.index_select(0, order_t); +-+ +-+ auto x1_t = dets_sorted.select(1, 0).contiguous(); +-+ auto y1_t = dets_sorted.select(1, 1).contiguous(); +-+ auto x2_t = dets_sorted.select(1, 2).contiguous(); +-+ 
auto y2_t = dets_sorted.select(1, 3).contiguous(); +-+ +-+ at::Tensor areas_t = (x2_t - x1_t) * (y2_t - y1_t); +-+ +- auto suppressed = suppressed_t.data_ptr(); +- auto keep = keep_t.data_ptr(); +- auto order = order_t.data_ptr(); +-@@ -45,19 +47,16 @@ at::Tensor nms_kernel_impl( +- +- int64_t num_to_keep = 0; +- +-- for (int64_t _i = 0; _i < ndets; _i++) { +-- auto i = order[_i]; +-+ for (int64_t i = 0; i < ndets; i++) { +- if (suppressed[i] == 1) +- continue; +-- keep[num_to_keep++] = i; +- auto ix1 = x1[i]; +- auto iy1 = y1[i]; +- auto ix2 = x2[i]; +- auto iy2 = y2[i]; +- auto iarea = areas[i]; +- +-- for (int64_t _j = _i + 1; _j < ndets; _j++) { +-- auto j = order[_j]; +-+ for (int64_t j = i + 1; j < ndets; j++) { +- if (suppressed[j] == 1) +- continue; +- auto xx1 = std::max(ix1, x1[j]); +-@@ -73,6 +72,11 @@ at::Tensor nms_kernel_impl( +- suppressed[j] = 1; +- } +- } +-+ for (int64_t i = 0; i < ndets; i++) { +-+ if (suppressed[i] == 1) +-+ continue; +-+ keep[num_to_keep++] = order[i]; +-+ } +- return keep_t.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep); +- } +- +-diff --git a/torchvision/csrc/ops/cpu/roi_align_kernel.cpp b/torchvision/csrc/ops/cpu/roi_align_kernel.cpp +-index e6684e9..d4c6b0e 100644 +---- a/torchvision/csrc/ops/cpu/roi_align_kernel.cpp +-+++ b/torchvision/csrc/ops/cpu/roi_align_kernel.cpp +-@@ -1,4 +1,5 @@ +- #include +-+#include +- #include +- +- #include "./roi_align_common.h" +-@@ -24,87 +25,89 @@ void roi_align_forward_kernel_impl( +- T* output) { +- // (n, c, ph, pw) is an element in the pooled output +- // can be parallelized using omp +-- // #pragma omp parallel for num_threads(32) +-- for (int n = 0; n < n_rois; n++) { +-- int index_n = n * channels * pooled_width * pooled_height; +-- +-- const T* offset_rois = rois + n * 5; +-- int roi_batch_ind = offset_rois[0]; +-- +-- // Do not using rounding; this implementation detail is critical +-- T offset = aligned ? 
(T)0.5 : (T)0.0; +-- T roi_start_w = offset_rois[1] * spatial_scale - offset; +-- T roi_start_h = offset_rois[2] * spatial_scale - offset; +-- T roi_end_w = offset_rois[3] * spatial_scale - offset; +-- T roi_end_h = offset_rois[4] * spatial_scale - offset; +-- +-- T roi_width = roi_end_w - roi_start_w; +-- T roi_height = roi_end_h - roi_start_h; +-- if (!aligned) { +-- // Force malformed ROIs to be 1x1 +-- roi_width = std::max(roi_width, (T)1.); +-- roi_height = std::max(roi_height, (T)1.); +-- } +-- +-- T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); +-- T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); +-- +-- // We use roi_bin_grid to sample the grid and mimic integral +-- int roi_bin_grid_h = (sampling_ratio > 0) +-- ? sampling_ratio +-- : ceil(roi_height / pooled_height); // e.g., = 2 +-- int roi_bin_grid_w = +-- (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); +-- +-- // We do average (integral) pooling inside a bin +-- // When the grid is empty, output zeros. +-- const T count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. 
= 4 +-- +-- // we want to precalculate indices and weights shared by all chanels, +-- // this is the key point of optimization +-- std::vector> pre_calc( +-- roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height); +-- detail::pre_calc_for_bilinear_interpolate( +-- height, +-- width, +-- pooled_height, +-- pooled_width, +-- roi_start_h, +-- roi_start_w, +-- bin_size_h, +-- bin_size_w, +-- roi_bin_grid_h, +-- roi_bin_grid_w, +-- pre_calc); +-- +-- for (int c = 0; c < channels; c++) { +-- int index_n_c = index_n + c * pooled_width * pooled_height; +-- const T* offset_input = +-- input + (roi_batch_ind * channels + c) * height * width; +-- int pre_calc_index = 0; +-- +-- for (int ph = 0; ph < pooled_height; ph++) { +-- for (int pw = 0; pw < pooled_width; pw++) { +-- int index = index_n_c + ph * pooled_width + pw; +-- +-- T output_val = 0.; +-- for (int iy = 0; iy < roi_bin_grid_h; iy++) { +-- for (int ix = 0; ix < roi_bin_grid_w; ix++) { +-- detail::PreCalc pc = pre_calc[pre_calc_index]; +-- output_val += pc.w1 * offset_input[pc.pos1] + +-- pc.w2 * offset_input[pc.pos2] + +-- pc.w3 * offset_input[pc.pos3] + pc.w4 * offset_input[pc.pos4]; +-- +-- pre_calc_index += 1; +-- } +-- } +-- output_val /= count; // Average pooling +-- +-- output[index] = output_val; +-- } // for pw +-- } // for ph +-- } // for c +-- } // for n +-+ int grain_size = ceil(n_rois / at::get_num_threads()); +-+ at::parallel_for(0, n_rois, grain_size, [&](int64_t start, int64_t end) { +-+ for (int n = start; n < end; n++) { +-+ int index_n = n * channels * pooled_width * pooled_height; +-+ +-+ const T* offset_rois = rois + n * 5; +-+ int roi_batch_ind = offset_rois[0]; +-+ +-+ // Do not using rounding; this implementation detail is critical +-+ T offset = aligned ? 
(T)0.5 : (T)0.0; +-+ T roi_start_w = offset_rois[1] * spatial_scale - offset; +-+ T roi_start_h = offset_rois[2] * spatial_scale - offset; +-+ T roi_end_w = offset_rois[3] * spatial_scale - offset; +-+ T roi_end_h = offset_rois[4] * spatial_scale - offset; +-+ +-+ T roi_width = roi_end_w - roi_start_w; +-+ T roi_height = roi_end_h - roi_start_h; +-+ if (!aligned) { +-+ // Force malformed ROIs to be 1x1 +-+ roi_width = std::max(roi_width, (T)1.); +-+ roi_height = std::max(roi_height, (T)1.); +-+ } +-+ +-+ T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); +-+ T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); +-+ +-+ // We use roi_bin_grid to sample the grid and mimic integral +-+ int roi_bin_grid_h = (sampling_ratio > 0) +-+ ? sampling_ratio +-+ : ceil(roi_height / pooled_height); // e.g., = 2 +-+ int roi_bin_grid_w = +-+ (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); +-+ +-+ // We do average (integral) pooling inside a bin +-+ // When the grid is empty, output zeros. +-+ const T count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. 
= 4 +-+ +-+ // we want to precalculate indices and weights shared by all chanels, +-+ // this is the key point of optimization +-+ std::vector> pre_calc( +-+ roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height); +-+ detail::pre_calc_for_bilinear_interpolate( +-+ height, +-+ width, +-+ pooled_height, +-+ pooled_width, +-+ roi_start_h, +-+ roi_start_w, +-+ bin_size_h, +-+ bin_size_w, +-+ roi_bin_grid_h, +-+ roi_bin_grid_w, +-+ pre_calc); +-+ +-+ for (int c = 0; c < channels; c++) { +-+ int index_n_c = index_n + c * pooled_width * pooled_height; +-+ const T* offset_input = +-+ input + (roi_batch_ind * channels + c) * height * width; +-+ int pre_calc_index = 0; +-+ +-+ for (int ph = 0; ph < pooled_height; ph++) { +-+ for (int pw = 0; pw < pooled_width; pw++) { +-+ int index = index_n_c + ph * pooled_width + pw; +-+ +-+ T output_val = 0.; +-+ for (int iy = 0; iy < roi_bin_grid_h; iy++) { +-+ for (int ix = 0; ix < roi_bin_grid_w; ix++) { +-+ detail::PreCalc pc = pre_calc[pre_calc_index]; +-+ output_val += pc.w1 * offset_input[pc.pos1] + +-+ pc.w2 * offset_input[pc.pos2] + +-+ pc.w3 * offset_input[pc.pos3] + pc.w4 * offset_input[pc.pos4]; +-+ +-+ pre_calc_index += 1; +-+ } +-+ } +-+ output_val /= count; // Average pooling +-+ +-+ output[index] = output_val; +-+ } // for pw +-+ } // for ph +-+ } // for c +-+ } // for n +-+ }); +- } +- +- template +-@@ -183,100 +186,105 @@ void roi_align_backward_kernel_impl( +- int pooled_width, +- int sampling_ratio, +- bool aligned, +-- T* grad_input, +-+ const int64_t grad_input_size, +-+ T* grad_input_buffer, +- const T* rois, +- int n_stride, +- int c_stride, +- int h_stride, +- int w_stride) { +-- for (int index = 0; index < nthreads; index++) { +-- // (n, c, ph, pw) is an element in the pooled output +-- int pw = index % pooled_width; +-- int ph = (index / pooled_width) % pooled_height; +-- int c = (index / pooled_width / pooled_height) % channels; +-- int n = index / pooled_width / pooled_height / channels; +-- +-- const 
T* offset_rois = rois + n * 5; +-- int roi_batch_ind = offset_rois[0]; +-- +-- // Do not using rounding; this implementation detail is critical +-- T offset = aligned ? (T)0.5 : (T)0.0; +-- T roi_start_w = offset_rois[1] * spatial_scale - offset; +-- T roi_start_h = offset_rois[2] * spatial_scale - offset; +-- T roi_end_w = offset_rois[3] * spatial_scale - offset; +-- T roi_end_h = offset_rois[4] * spatial_scale - offset; +-- +-- T roi_width = roi_end_w - roi_start_w; +-- T roi_height = roi_end_h - roi_start_h; +-- if (!aligned) { +-- // Force malformed ROIs to be 1x1 +-- roi_width = std::max(roi_width, (T)1.); +-- roi_height = std::max(roi_height, (T)1.); +-- } +-- +-- T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); +-- T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); +-- +-- T* offset_grad_input = +-- grad_input + ((roi_batch_ind * channels + c) * height * width); +-- +-- int output_offset = n * n_stride + c * c_stride; +-- const T* offset_grad_output = grad_output + output_offset; +-- const T grad_output_this_bin = +-- offset_grad_output[ph * h_stride + pw * w_stride]; +-- +-- // We use roi_bin_grid to sample the grid and mimic integral +-- int roi_bin_grid_h = (sampling_ratio > 0) +-- ? sampling_ratio +-- : ceil(roi_height / pooled_height); // e.g., = 2 +-- int roi_bin_grid_w = +-- (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); +-- +-- // We do average (integral) pooling inside a bin +-- const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. 
= 4 +-- +-- for (int iy = 0; iy < roi_bin_grid_h; iy++) { +-- const T y = roi_start_h + ph * bin_size_h + +-- static_cast(iy + .5f) * bin_size_h / +-- static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 +-- for (int ix = 0; ix < roi_bin_grid_w; ix++) { +-- const T x = roi_start_w + pw * bin_size_w + +-- static_cast(ix + .5f) * bin_size_w / +-- static_cast(roi_bin_grid_w); +-- +-- T w1, w2, w3, w4; +-- int x_low, x_high, y_low, y_high; +-- +-- bilinear_interpolate_gradient( +-- height, +-- width, +-- y, +-- x, +-- w1, +-- w2, +-- w3, +-- w4, +-- x_low, +-- x_high, +-- y_low, +-- y_high, +-- index); +-- +-- T g1 = grad_output_this_bin * w1 / count; +-- T g2 = grad_output_this_bin * w2 / count; +-- T g3 = grad_output_this_bin * w3 / count; +-- T g4 = grad_output_this_bin * w4 / count; +-- +-- if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { +-- // atomic add is not needed for now since it is single threaded +-- add(offset_grad_input + y_low * width + x_low, static_cast(g1)); +-- add(offset_grad_input + y_low * width + x_high, static_cast(g2)); +-- add(offset_grad_input + y_high * width + x_low, static_cast(g3)); +-- add(offset_grad_input + y_high * width + x_high, static_cast(g4)); +-- } // if +-- } // ix +-- } // iy +-- } // for +-+ int grain_size = ceil(nthreads / at::get_num_threads()); +-+ at::parallel_for(0, nthreads, grain_size, [&](int64_t start, int64_t end) { +-+ for (int index = start; index < end; index++) { +-+ int thread_no = at::get_thread_num(); +-+ // (n, c, ph, pw) is an element in the pooled output +-+ int pw = index % pooled_width; +-+ int ph = (index / pooled_width) % pooled_height; +-+ int c = (index / pooled_width / pooled_height) % channels; +-+ int n = index / pooled_width / pooled_height / channels; +-+ +-+ const T* offset_rois = rois + n * 5; +-+ int roi_batch_ind = offset_rois[0]; +-+ +-+ // Do not using rounding; this implementation detail is critical +-+ T offset = aligned ? 
(T)0.5 : (T)0.0; +-+ T roi_start_w = offset_rois[1] * spatial_scale - offset; +-+ T roi_start_h = offset_rois[2] * spatial_scale - offset; +-+ T roi_end_w = offset_rois[3] * spatial_scale - offset; +-+ T roi_end_h = offset_rois[4] * spatial_scale - offset; +-+ +-+ T roi_width = roi_end_w - roi_start_w; +-+ T roi_height = roi_end_h - roi_start_h; +-+ if (!aligned) { +-+ // Force malformed ROIs to be 1x1 +-+ roi_width = std::max(roi_width, (T)1.); +-+ roi_height = std::max(roi_height, (T)1.); +-+ } +-+ +-+ T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); +-+ T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); +-+ +-+ T* offset_grad_input = grad_input_buffer + (thread_no * grad_input_size) + +-+ ((roi_batch_ind * channels + c) * height * width); +-+ +-+ int output_offset = n * n_stride + c * c_stride; +-+ const T* offset_grad_output = grad_output + output_offset; +-+ const T grad_output_this_bin = +-+ offset_grad_output[ph * h_stride + pw * w_stride]; +-+ +-+ // We use roi_bin_grid to sample the grid and mimic integral +-+ int roi_bin_grid_h = (sampling_ratio > 0) +-+ ? sampling_ratio +-+ : ceil(roi_height / pooled_height); // e.g., = 2 +-+ int roi_bin_grid_w = +-+ (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); +-+ +-+ // We do average (integral) pooling inside a bin +-+ const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. 
= 4 +-+ +-+ for (int iy = 0; iy < roi_bin_grid_h; iy++) { +-+ const T y = roi_start_h + ph * bin_size_h + +-+ static_cast(iy + .5f) * bin_size_h / +-+ static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 +-+ for (int ix = 0; ix < roi_bin_grid_w; ix++) { +-+ const T x = roi_start_w + pw * bin_size_w + +-+ static_cast(ix + .5f) * bin_size_w / +-+ static_cast(roi_bin_grid_w); +-+ +-+ T w1, w2, w3, w4; +-+ int x_low, x_high, y_low, y_high; +-+ +-+ bilinear_interpolate_gradient( +-+ height, +-+ width, +-+ y, +-+ x, +-+ w1, +-+ w2, +-+ w3, +-+ w4, +-+ x_low, +-+ x_high, +-+ y_low, +-+ y_high, +-+ index); +-+ +-+ T g1 = grad_output_this_bin * w1 / count; +-+ T g2 = grad_output_this_bin * w2 / count; +-+ T g3 = grad_output_this_bin * w3 / count; +-+ T g4 = grad_output_this_bin * w4 / count; +-+ +-+ if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { +-+ // atomic add is not needed for now since it is single threaded +-+ add(offset_grad_input + y_low * width + x_low, static_cast(g1)); +-+ add(offset_grad_input + y_low * width + x_high, static_cast(g2)); +-+ add(offset_grad_input + y_high * width + x_low, static_cast(g3)); +-+ add(offset_grad_input + y_high * width + x_high, static_cast(g4)); +-+ } // if +-+ } // ix +-+ } // iy +-+ } // for +-+ }); +- } +- +- at::Tensor roi_align_forward_kernel( +-@@ -355,6 +363,9 @@ at::Tensor roi_align_backward_kernel( +- return grad_input; +- } +- +-+ const int num_threads = at::get_num_threads(); +-+ at::Tensor grad_input_buffer = +-+ at::zeros({num_threads, batch_size, channels, height, width}, grad.options()); +- // get stride values to ensure indexing into gradients is correct. 
+- int n_stride = grad.stride(0); +- int c_stride = grad.stride(1); +-@@ -375,13 +386,17 @@ at::Tensor roi_align_backward_kernel( +- pooled_width, +- sampling_ratio, +- aligned, +-- grad_input.data_ptr(), +-+ grad_input.numel(), +-+ grad_input_buffer.data_ptr(), +- rois_.data_ptr(), +- n_stride, +- c_stride, +- h_stride, +- w_stride); +- }); +-+ for (int64_t i = 0; i < num_threads; ++i) { +-+ grad_input.add_(grad_input_buffer.select(0, i)); +-+ } +- return grad_input; +- } +- ++ extra_compile_args["cxx"].append("-O0") +diff --git a/test/cpp/c10d/CMakeLists.txt b/test/cpp/c10d/CMakeLists.txt +index 285a5dd2a7..9b440e217f 100644 +--- a/test/cpp/c10d/CMakeLists.txt ++++ b/test/cpp/c10d/CMakeLists.txt +@@ -20,6 +20,7 @@ function(c10d_add_test test_src) + $ + $ + ) ++ target_include_directories(${test_name} PRIVATE ${MPI_CXX_INCLUDE_PATH}) + target_link_libraries(${test_name} PRIVATE + fmt::fmt-header-only + ${ARG_LINK_LIBRARIES} +@@ -83,7 +84,7 @@ if(USE_MPI AND USE_C10D_MPI) + # private headers of libtorch, which in turn include MPI. As a hacky + # alternative to making MPI a public dependency of libtorch, we make it + # a private dependency of the tests as well. +- c10d_add_test(ProcessGroupMPITest.cpp LINK_LIBRARIES torch_cpu MPI::MPI_CXX INSTALL_TEST ${INSTALL_TEST}) ++ c10d_add_test(ProcessGroupMPITest.cpp LINK_LIBRARIES torch_cpu ${MPI_CXX_LIBRARIES} INSTALL_TEST ${INSTALL_TEST}) + endif() + + if(LINUX AND USE_GLOO AND USE_C10D_GLOO) +diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt +index 1632147f02..3fa6ec87fd 100644 +--- a/torch/CMakeLists.txt ++++ b/torch/CMakeLists.txt +@@ -294,7 +294,8 @@ if(USE_DISTRIBUTED) + endif() + # Same for MPI. 
+ if(USE_MPI) +- list(APPEND TORCH_PYTHON_LINK_LIBRARIES MPI::MPI_CXX) ++ list(APPEND TORCH_PYTHON_LINK_LIBRARIES ${MPI_CXX_LIBRARIES}) ++ list(APPEND TORCH_PYTHON_INCLUDE_DIRECTORIES ${MPI_CXX_INCLUDE_PATH}) + endif() + list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_C10D) + diff --git a/llvm19.1.4/patch/tensorpipe.patch b/llvm19.1.4/patch/tensorpipe.patch new file mode 100644 index 0000000..87a2ba5 --- /dev/null +++ b/llvm19.1.4/patch/tensorpipe.patch @@ -0,0 +1,13 @@ +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 77df76d..bba7a14 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -10,6 +10,8 @@ project(tensorpipe LANGUAGES C CXX) + + set(CMAKE_CXX_STANDARD 17) + ++set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-missing-template-arg-list-after-template-kw") ++ + list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake") + + # Expose build options. diff --git a/llvm21.1.0/patch/numpy.patch b/llvm21.1.0/patch/numpy.patch new file mode 100644 index 0000000..7a5ce4e --- /dev/null +++ b/llvm21.1.0/patch/numpy.patch @@ -0,0 +1,102 @@ +diff --git a/numpy/distutils/fcompiler/__init__.py b/numpy/distutils/fcompiler/__init__.py +index d8dcfa8..ebe0647 100644 +--- a/numpy/distutils/fcompiler/__init__.py ++++ b/numpy/distutils/fcompiler/__init__.py +@@ -745,7 +745,7 @@ def wrap_unlinkable_objects(self, objects, output_dir, extra_dll_dir): + ('cygwin.*', ('gnu', 'intelv', 'absoft', 'compaqv', 'intelev', 'gnu95', 'g95')), + ('linux.*', ('arm', 'gnu95', 'intel', 'lahey', 'pg', 'nv', 'absoft', 'nag', + 'vast', 'compaq', 'intele', 'intelem', 'gnu', 'g95', +- 'pathf95', 'nagfor', 'fujitsu')), ++ 'pathf95', 'nagfor', 'fujitsu', 'llvm')), + ('darwin.*', ('gnu95', 'nag', 'nagfor', 'absoft', 'ibm', 'intel', 'gnu', + 'g95', 'pg')), + ('sunos.*', ('sun', 'gnu', 'gnu95', 'g95')), +diff --git a/numpy/distutils/fcompiler/llvm.py b/numpy/distutils/fcompiler/llvm.py +new file mode 100644 +index 0000000..f3db492 +--- /dev/null ++++ b/numpy/distutils/fcompiler/llvm.py +@@ -0,0 +1,71 
@@ ++from __future__ import division, absolute_import, print_function ++ ++import sys ++ ++from numpy.distutils.fcompiler import FCompiler, dummy_fortran_file ++from sys import platform ++from os.path import join, dirname, normpath ++ ++compilers = ['LlvmFlangFCompiler'] ++ ++import functools ++ ++class LlvmFlangFCompiler(FCompiler): ++ compiler_type = 'llvm' ++ description = 'LLVM Fortran Compiler' ++ version_pattern = r'\s*flang.*version (?P[\d.-]+).*' ++ ++ possible_executables = ['flang'] ++ ++ executables = { ++ 'version_cmd': ["", "--version"], ++ 'compiler_f77': ["flang", "-fPIC"], ++ 'compiler_fix': ["flang", "-fPIC", "-ffixed-form"], ++ 'compiler_f90': ["flang", "-fPIC"], ++ 'linker_so': ["flang", "-fPIC", "-shared"], ++ 'archiver': ["ar", "-cr"], ++ 'ranlib': None ++ } ++ ++ pic_flags = ["-fPIC", "-DPIC"] ++ c_compiler = 'clang' ++ module_dir_switch = '-module ' # Don't remove ending space! ++ ++ def get_libraries(self): ++ opt = FCompiler.get_libraries(self) ++ return opt ++ ++ @functools.lru_cache(maxsize=128) ++ def get_library_dirs(self): ++ """List of compiler library directories.""" ++ opt = FCompiler.get_library_dirs(self) ++ flang_dir = dirname(self.executables['compiler_f77'][0]) ++ opt.append(normpath(join(flang_dir, '..', 'lib'))) ++ ++ return opt ++ ++ def get_flags(self): ++ return [] ++ ++ def get_flags_free(self): ++ return [] ++ ++ def get_flags_debug(self): ++ return ['-g'] ++ ++ def get_flags_opt(self): ++ return ['-O3'] ++ ++ def get_flags_arch(self): ++ return [] ++ ++ def runtime_library_dir_option(self, dir): ++ return '-Wl,-rpath=%s' % dir ++ ++ ++if __name__ == '__main__': ++ from distutils import log ++ log.set_verbosity(2) ++ from numpy.distutils import customized_fcompiler ++ print(customized_fcompiler(compiler='llvm').get_version()) ++ +diff --git a/numpy/tests/test_public_api.py b/numpy/tests/test_public_api.py +index bb15e10..9369424 100644 +--- a/numpy/tests/test_public_api.py ++++ b/numpy/tests/test_public_api.py +@@ -233,6 
+233,7 @@ def test_NPY_NO_EXPORT(): + "distutils.fcompiler.sun", + "distutils.fcompiler.vast", + "distutils.fcompiler.fujitsu", ++ "distutils.fcompiler.llvm", + "distutils.from_template", + "distutils.intelccompiler", + "distutils.lib2def", diff --git a/llvm21.1.0/patch/pytorch.patch b/llvm21.1.0/patch/pytorch.patch new file mode 100644 index 0000000..7287b1f --- /dev/null +++ b/llvm21.1.0/patch/pytorch.patch @@ -0,0 +1,997 @@ +diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt +index 6ab41b6c84..61a0cd2f9e 100644 +--- a/caffe2/CMakeLists.txt ++++ b/caffe2/CMakeLists.txt +@@ -1549,7 +1549,8 @@ target_link_libraries(torch_cpu PUBLIC ${Caffe2_PUBLIC_DEPENDENCY_LIBS}) + target_link_libraries(torch_cpu PRIVATE ${Caffe2_DEPENDENCY_LIBS}) + target_link_libraries(torch_cpu PRIVATE ${Caffe2_DEPENDENCY_WHOLE_LINK_LIBS}) + if(USE_MPI) +- target_link_libraries(torch_cpu PRIVATE MPI::MPI_CXX) ++ target_link_libraries(torch_cpu PRIVATE ${MPI_CXX_LIBRARIES}) ++ target_include_directories(torch_cpu PRIVATE ${MPI_CXX_INCLUDE_PATH}) + endif() + target_include_directories(torch_cpu INTERFACE $) + target_include_directories(torch_cpu PRIVATE ${Caffe2_CPU_INCLUDE}) +@@ -1727,7 +1728,8 @@ if(BUILD_SHARED_LIBS) + endif() + set_target_properties(torch_global_deps PROPERTIES LINKER_LANGUAGE C) + if(USE_MPI) +- target_link_libraries(torch_global_deps MPI::MPI_CXX) ++ target_link_libraries(torch_global_deps ${MPI_CXX_LIBRARIES}) ++ target_include_directories(torch_global_deps PUBLIC ${MPI_CXX_INCLUDE_PATH}) + endif() + if(CAFFE2_USE_MKL) + target_link_libraries(torch_global_deps caffe2::mkl) +diff --git a/cmake/Modules/FindARM.cmake b/cmake/Modules/FindARM.cmake +index 903025c5c2..a419c1aeed 100644 +--- a/cmake/Modules/FindARM.cmake ++++ b/cmake/Modules/FindARM.cmake +@@ -153,7 +153,7 @@ IF(CMAKE_SYSTEM_NAME MATCHES "Linux") + + # Check for SVE256 vector length + CHECK_COMPILES(CXX "SVE256" "-march=armv8.2-a+sve -msve-vector-bits=256" "${SVE_CODE}") +- CHECK_COMPILES(CXX "ARM_BF16" 
"-march=armv8.2-a+sve+bf16 -msve-vector-bits=256" "${ARM_BF16_CODE}") ++ #CHECK_COMPILES(CXX "ARM_BF16" "-march=armv8.2-a+sve+bf16 -msve-vector-bits=256" "${ARM_BF16_CODE}") + + # If SVE256 support is not found, set CXX_SVE_FOUND to FALSE and notify the user + if(NOT CXX_SVE256_FOUND) +diff --git a/cmake/Modules/FindMPI.cmake b/cmake/Modules/FindMPI.cmake +new file mode 100644 +index 0000000000..cd77062a48 +--- /dev/null ++++ b/cmake/Modules/FindMPI.cmake +@@ -0,0 +1,53 @@ ++if(CMAKE_C_COMPILER MATCHES ".*/clang$" AND ++ CMAKE_CXX_COMPILER MATCHES ".*/clang\\+\\+$") ++ if(DEFINED ENV{MPI_HOME}) ++ set(TCSMPI_EXEC_PATH "$ENV{MPI_HOME}/bin") ++ else() ++ string(REGEX REPLACE "/clang\\+\\+$" "" CMAKE_CXX_COMPILER_DIR "${CMAKE_CXX_COMPILER}") ++ set(TCSMPI_EXEC_PATH "${CMAKE_CXX_COMPILER_DIR}") ++ endif() ++ execute_process(COMMAND "${TCSMPI_EXEC_PATH}/mpiclang++" "--show" ++ RESULT_VARIABLE MPIFCC_EXEC_RESULT ++ OUTPUT_QUIET ++ ERROR_QUIET) ++ if(MPIFCC_EXEC_RESULT EQUAL 0) ++ message(STATUS "TCS-MPI ENABLED") ++ set(MPI_CXX_FOUND ON) ++ execute_process(COMMAND "${TCSMPI_EXEC_PATH}/mpiclang++" "--showme:compile" ++ OUTPUT_VARIABLE MPI_CXX_COMPILE_FLAGS) ++ string(REPLACE "\n" "" MPI_CXX_COMPILE_FLAGS "${MPI_CXX_COMPILE_FLAGS}") ++ execute_process(COMMAND "${TCSMPI_EXEC_PATH}/mpiclang++" "--showme:incdirs" ++ OUTPUT_VARIABLE MPI_CXX_INCLUDE_PATH) ++ string(REPLACE "\n" ";" MPI_CXX_INCLUDE_PATH "${MPI_CXX_INCLUDE_PATH}") ++ execute_process(COMMAND "${TCSMPI_EXEC_PATH}/mpiclang++" "--showme:link" ++ OUTPUT_VARIABLE MPI_CXX_LINK_FLAGS) ++ string(REPLACE "\n" "" MPI_CXX_LINK_FLAGS "${MPI_CXX_LINK_FLAGS}") ++ execute_process(COMMAND "${TCSMPI_EXEC_PATH}/mpiclang++" "--showme:libdirs" ++ OUTPUT_VARIABLE MPI_CXX_LIBRARY_DIRS) ++ string(REPLACE "\n" "" MPI_CXX_LIBRARY_DIRS "${MPI_CXX_LIBRARY_DIRS}") ++ string(REPLACE " " ";" MPI_CXX_LIBRARY_DIRS "${MPI_CXX_LIBRARY_DIRS}") ++ foreach(dir IN LISTS MPI_CXX_LIBRARY_DIRS) ++ set(MPI_CXX_LIBRARIES "${dir}/libmpi.so") ++ endforeach() 
++ set(MPI_FOUND ON) ++ set(MPI_C_FOUND ON) ++ set(MPIEXEC "${TCSMPI_EXEC_PATH}/mpiexec") ++ set(MPI_COMPILE_FLAGS ${MPI_CXX_COMPILE_FLAGS}) ++ set(MPI_C_COMPILE_FLAGS ${MPI_CXX_COMPILE_FLAGS}) ++ set(MPI_INCLUDE_PATH ${MPI_CXX_INCLUDE_PATH}) ++ set(MPI_C_INCLUDE_PATH ${MPI_CXX_INCLUDE_PATH}) ++ set(MPI_LINK_FLAGS ${MPI_CXX_LINK_FLAGS}) ++ set(MPI_C_LINK_FLAGS ${MPI_CXX_LINK_FLAGS}) ++ set(MPI_LIBRARIES ${MPI_CXX_LIBRARIES}) ++ set(MPI_C_LIBRARIES ${MPI_CXX_LIBRARIES}) ++ else() ++ message(STATUS "TCS-MPI DISABLED") ++ endif() ++endif() ++if(NOT MPI_FOUND) ++ set(CMAKE_MODULE_PATH_TMP "${CMAKE_MODULE_PATH}") ++ unset(CMAKE_MODULE_PATH) ++ find_package(MPI) ++ set(CMAKE_MODULE_PATH "${CMAKE_MODULE_PATH_TMP}") ++ unset(CMAKE_MODULE_PATH_TMP) ++endif() +diff --git a/scripts/fujitsu/1_python.sh b/scripts/fujitsu/1_python.sh +index 90790698f9..98a02f52c6 100755 +--- a/scripts/fujitsu/1_python.sh ++++ b/scripts/fujitsu/1_python.sh +@@ -39,7 +39,7 @@ source $script_basedir/env.src + + if [ -v fjenv_debug ]; then set -x; fi + +-PYTHON_VER=3.9 ++PYTHON_VER=3.10 + PYTHON_DIR=cpython + + # +@@ -83,7 +83,7 @@ if [ "$fjenv_use_fcc" = "true" ]; then + # TODO: $ORIGIN sometimes parsed as 'RIGIN'. + # perhaps more backslashs are needed to protect $ORIGIN from parsing in shell. + # export LDFLAGS="-Wl,-rpath,\$ORIGIN/../lib" +- export LDFLAGS="-Wl,-rpath,${PREFIX}/lib -Wl,-rpath,${TCSDS_PATH}/lib64 -lpthread" ++ export LDFLAGS="-Wl,-rpath,${PREFIX}/lib -L/usr/lib64" + else + # Ditto. + #export LDFLAGS="-Wl,-rpath,\$ORIGIN/../lib" +@@ -105,7 +105,7 @@ if [ "${fjenv_use_fcc}" = "true" ]; then + # We used to link with '--linkfortran', which turned out to be unnecessary. + # It was used to link with the module solery written by Fortran. + # ${CXX} --linkfortran -SSL2 -Kopenmp -Nlibomp -o python Programs/python.o -L. -lpython$PYTHON_VER $LDFLAGS +- ${CXX} -Kopenmp -Nlibomp -SSL2BLAMP -lfjlapackexsve -o python Programs/python.o -L. 
-lpython$PYTHON_VER $LDFLAGS ++ ${CXX} -fopenmp -L${OpenBLAS_HOME}/lib -lopenblas -lflang_rt.runtime -o python Programs/python.o -L. -lpython$PYTHON_VER $LDFLAGS + fi + + make install +@@ -119,7 +119,7 @@ hash -r + # Note that python 3.9 buildles setuptools 58.1. + #pip3 uninstall -y setuptools + +-pip3 install --upgrade ${PIP3_OPTIONS} 'setuptools>60.6.0' # or setuptools<59.6.0 ++pip3 install --upgrade ${PIP3_OPTIONS} 'setuptools==73.0.1' # or setuptools<59.6.0 + + # Show configuration + +diff --git a/scripts/fujitsu/3_venv.sh b/scripts/fujitsu/3_venv.sh +index b271fdc535..4169a4930d 100755 +--- a/scripts/fujitsu/3_venv.sh ++++ b/scripts/fujitsu/3_venv.sh +@@ -81,9 +81,9 @@ fi + # Workaround is found in: + # See https://stackoverflow.com/questions/70520120/attributeerror-module-setuptools-distutils-has-no-attribute-version + +-pip3 install --upgrade ${PIP3_OPTIONS} 'setuptools>60.6.0' # or setuptools<59.6.0 ++pip3 install --upgrade ${PIP3_OPTIONS} 'setuptools==73.0.1' # or setuptools<59.6.0 + +-pip3 install --upgrade ${PIP3_OPTIONS} pip future six wheel ++pip3 install --upgrade ${PIP3_OPTIONS} pip==25.3 + + pip3 list | tee $script_basedir/pip3_list.txt + +diff --git a/scripts/fujitsu/4_numpy_scipy.sh b/scripts/fujitsu/4_numpy_scipy.sh +index fe07872ab3..02e52f2c7a 100755 +--- a/scripts/fujitsu/4_numpy_scipy.sh ++++ b/scripts/fujitsu/4_numpy_scipy.sh +@@ -41,8 +41,9 @@ if [ -v fjenv_debug ]; then set -x; fi + + NUMPY_VER=v1.22.4 + NUMPY_DIR=numpy +-SCIPY_VER=v1.7.3 ++SCIPY_VER=v1.10.1 + SCIPY_DIR=scipy ++SCIPY_CHERRY_PICK=ab7d08c6148286059f6498ab5c3070268d13cbd9 + + # + # Clean up +@@ -61,17 +62,20 @@ fi + [ -d ${DOWNLOAD_PATH} ] || mkdir -p ${DOWNLOAD_PATH} + cd ${DOWNLOAD_PATH} + +-[ -d $NUMPY_DIR ] || ++if [ ! -d $NUMPY_DIR ]; then + git clone ${GIT_OPTIONS} \ + -b $NUMPY_VER \ + --depth 1 \ + https://github.com/numpy/numpy.git $NUMPY_DIR ++ (cd $NUMPY_DIR && patch -p 1 < $script_basedir/numpy.patch) ++fi + +-[ -d $SCIPY_DIR ] || ++if [ ! 
-d $SCIPY_DIR ]; then
+	git clone ${GIT_OPTIONS} --recursive \
+	    -b $SCIPY_VER \
+-	    --depth 1 \
+	    https://github.com/scipy/scipy.git $SCIPY_DIR
++	(cd $SCIPY_DIR && git cherry-pick ${SCIPY_CHERRY_PICK})
++fi
+ 
+ [ -v fjenv_download ] && fjenv_safe_exit 0
+ 
+@@ -94,7 +98,7 @@ fi
+ 
+ # NumPy maintenance/1.22.x requires Cythone >= 0.29.21
+ # NumPy maintenance/1.22.2 requires Cythone >= 0.29.30
+-pip3 install ${PIP3_OPTIONS} 'Cython>=0.29.30' ||
++pip3 install ${PIP3_OPTIONS} 'Cython>=0.29.33,<3.0' ||
+ pip3 install ${PIP3_OPTIONS} $PIP_PACKAGE_PATH/Cython*.whl
+ 
+ cd $DOWNLOAD_PATH/$NUMPY_DIR
+@@ -107,20 +111,18 @@ fi
+ if [ "$fjenv_use_fcc" = "true" -a ! -f site.cfg ]; then
+ cat <site.cfg
+ [openblas]
+-libraries = fjlapackexsve
+-library_dirs = $TCSDS_PATH/lib64
+-include_dirs = $TCSDS_PATH/include
+-extra_link_args = -SSL2BLAMP
++library_dirs = ${OpenBLAS_HOME}/lib
++include_dirs = ${OpenBLAS_HOME}/include
++extra_link_args = -lopenblas -lflang_rt.runtime
+ 
+ [lapack]
+-lapack_libs = fjlapackexsve
+-library_dirs = $TCSDS_PATH/lib64
+-extra_link_args = -SSL2BLAMP
++library_dirs = ${OpenBLAS_HOME}/lib
++extra_link_args = -lopenblas -lflang_rt.runtime
+ EOF
+ fi
+ 
+ NPY_NUM_BUILD_JOBS=$MAX_JOBS \
+-	python3 setup.py build -j $MAX_JOBS install
++	python3 setup.py build -j $MAX_JOBS config_fc --fcompiler=llvm install
+ 
+ #
+ # Build SciPy
+@@ -130,7 +132,7 @@ NPY_NUM_BUILD_JOBS=$MAX_JOBS \
+ # older than what NumPy is requiring, but running for reference purpose,
+ # such as in case of using older NumPy.
+ +-pip3 install ${PIP3_OPTIONS} 'Cython>=0.29.18' ++pip3 install ${PIP3_OPTIONS} 'Cython>=0.29.33,<3.0' + pip3 install ${PIP3_OPTIONS} pybind11 pythran + + cd $DOWNLOAD_PATH/$SCIPY_DIR +@@ -140,7 +142,7 @@ if [ -v fjenv_rebuild ]; then + fi + + SCIPY_NUM_CYTHONIZE_JOBS=$MAX_JOBS \ +- python3 setup.py build -j $MAX_JOBS --fcompiler=fujitsu install ++ python3 setup.py build -j $MAX_JOBS config_fc --fcompiler=llvm install + + pip3 list | tee $script_basedir/pip3_list.txt + +diff --git a/scripts/fujitsu/5_pytorch.sh b/scripts/fujitsu/5_pytorch.sh +index 8128e584c8..0942df8b4e 100755 +--- a/scripts/fujitsu/5_pytorch.sh ++++ b/scripts/fujitsu/5_pytorch.sh +@@ -40,7 +40,8 @@ source $script_basedir/env.src + + if [ -v fjenv_debug ]; then set -x; fi + +-ONEDNN_VER=v2.7 ++ONEDNN_VER=v3.7.1 ++GOOGLETEST_CHERRY_PICK=fa8438ae6b70c57010177de47a9f13d7041a6328 + + # + # Clean up +@@ -59,8 +60,10 @@ fi + if [ ! -d $PYTORCH_TOP/third_party/ideep/mkl-dnn ]; then + cd $PYTORCH_TOP + git submodule update --init --recursive $GIT_OPTIONS ++ cd $PYTORCH_TOP/third_party/googletest && git cherry-pick ${GOOGLETEST_CHERRY_PICK} ++ cd $PYTORCH_TOP/third_party/tensorpipe && patch -p 1 < $script_basedir/tensorpipe.patch + fi +-cd $PYTORCH_TOP/third_party/ideep/mkl-dnn/third_party/oneDNN ++cd $PYTORCH_TOP/third_party/ideep/mkl-dnn + git checkout $GIT_OPTIONS $ONEDNN_VER + + [ -v fjenv_download ] && fjenv_safe_exit 0 +@@ -96,7 +99,7 @@ fi + + # 'setup.py' in PyTorch ensures that CFLAGS is used for both C and C++ compiler, + # but just in case... 
+-CFLAGS=-O3 CXXFLAGS=-O3 python3 setup.py build -j $MAX_JOBS install ++CFLAGS='-O3 -fopenmp' CXXFLAGS="-O3 -fopenmp" LDFLAGS="-lflang_rt.runtime" python3 setup.py build -j $MAX_JOBS install + + pip3 list | tee $script_basedir/pip3_list.txt + +diff --git a/scripts/fujitsu/6_vision.sh b/scripts/fujitsu/6_vision.sh +index 68034079e4..2b8aa5bc5c 100755 +--- a/scripts/fujitsu/6_vision.sh ++++ b/scripts/fujitsu/6_vision.sh +@@ -42,9 +42,9 @@ if [ -v fjenv_debug ]; then set -x; fi + + JPEG_ARCHIVE_NAME=jpegsrc.v9d + JPEG_DIR=jpeg-9d +-PILLOW_VER=7.2.0 ++PILLOW_VER=8.4.0 + PILLOW_DIR=Pillow +-TORCHVISION_VER=v0.14.1 ++TORCHVISION_VER=v0.24.1 + TORCHVISION_DIR=vision + + # +@@ -89,7 +89,6 @@ if [ ! -d $TORCHVISION_DIR ]; then + git clone ${GIT_OPTIONS} -b $TORCHVISION_VER \ + --depth 1 \ + https://github.com/pytorch/vision.git +- (cd vision; patch -p 1 < $script_basedir/vision.patch) + fi + + [ -v fjenv_download ] && fjenv_safe_exit 0 +@@ -132,7 +131,7 @@ export LDFLAGS="-Wl,-rpath,${PREFIX}/lib" + if [ -v fjenv_rebuild ]; then + python3 setup.py clean + fi +-python3 setup.py install ++pip3 install . --verbose --no-build-isolation + + # + # Install torchvision +@@ -145,7 +144,7 @@ fi + + export TORCHVISION_INCLUDE=$PREFIX/include + export TORCHVISION_LIBRARY=$PREFIX/lib +-CFLAGS="-Kfast" python3 setup.py build -j $MAX_JOBS install ++pip3 install . --verbose --no-build-isolation + + pip3 list | tee $script_basedir/pip3_list.txt + +diff --git a/scripts/fujitsu/7_horovod.sh b/scripts/fujitsu/7_horovod.sh +index 7aa6f302b9..4e0138da0d 100755 +--- a/scripts/fujitsu/7_horovod.sh ++++ b/scripts/fujitsu/7_horovod.sh +@@ -41,6 +41,7 @@ if [ -v fjenv_debug ]; then set -x; fi + + HOROVOD_VER=v0.26.1 + HOROVOD_DIR=horovod ++FLATBUFFERS_CHERRY_PICK=20aad0c41e1252b04c72111c3eb221280a9c2009 + + # + # Clean up +@@ -62,6 +63,7 @@ if [ ! 
-d horovod ]; then + -b $HOROVOD_VER \ + --depth 1 \ + https://github.com/horovod/horovod.git ++ (cd horovod/third_party/flatbuffers && git cherry-pick ${FLATBUFFERS_CHERRY_PICK}) + (cd horovod; patch -p 1 < $script_basedir/horovod.patch) + cp -p horovod/examples/pytorch/pytorch_synthetic_benchmark.py $script_basedir + fi +@@ -81,7 +83,7 @@ fi + # + + if [ "${fjenv_use_fcc}" != "true" ]; then +- echo "$0 works for FCC only for now" ++ echo "$0 works for clang++ only for now" + exit 1 + fi + +diff --git a/scripts/fujitsu/env.src b/scripts/fujitsu/env.src +index 7dd81f6d6c..aee93aaff4 100644 +--- a/scripts/fujitsu/env.src ++++ b/scripts/fujitsu/env.src +@@ -43,10 +43,21 @@ fjenv_src_sourced="Y" + ######################################################################## + ######################################################################## + +-#TCSDS_PATH=/opt/FJSVxtclanga/tcsds-1.2.34 # TCS (FX1000) +-TCSDS_PATH=/opt/FJSVstclanga/cp-1.0.21.01 # CP (FX700) +-VENV_PATH=~/venv +-PREFIX=~/prefix ++module purge ++. /vol0004/apps/oss/spack/share/spack/setup-env.sh ++spack load gcc@12.2.0 arch=linux-rhel8-a64fx ++module load lang/tcsds-1.2.42 ++module load LLVM/llvmorg-21.1.0 ++export OpenBLAS_HOME=/vol0004/apps/r/OSS_CN/llvm/openblas-omp ++export GCC_INSTALL_DIR=/vol0004/apps/oss/spack-v1.0.1/opt/spack/linux-a64fx/gcc-12.2.0-f57uyl2rzc74cow54td7bdy77xajibir/lib/gcc/aarch64-unknown-linux-gnu/12.2.0 ++ ++TCSDS_PATH=/opt/FJSVxtclanga/tcsds-1.2.42 # TCS (FX1000) ++#TCSDS_PATH=/opt/FJSVstclanga/cp-1.0.21.01 # CP (FX700) ++#VENV_PATH=~/venv ++#PREFIX=~/prefix ++ROOT_DIR=$(cd $(dirname ${BASH_SOURCE:-$0})/../../..; pwd) ++VENV_PATH=$ROOT_DIR/venv ++PREFIX=$ROOT_DIR/prefix + + ######################################################################## + ######################################################################## +@@ -63,20 +74,24 @@ PIP_PACKAGE_PATH=${DOWNLOAD_PATH}/pip_packages + # MAX_JOBS should be 40 or less. 
(Note: TCS set this to 50 or 52) + : ${MAX_JOBS:=40} + if [ $MAX_JOBS -gt 40 ]; then MAX_JOBS=40; fi ++export MAX_JOBS=${MAX_JOBS} + + # + # Env for Compilers + # + + if [ "$fjenv_use_fcc" = "true" ]; then +- export CC="fcc -Nclang -Knolargepage" +- export CXX="FCC -Nclang -Knolargepage" ++ export CC="clang --gcc-install-dir=${GCC_INSTALL_DIR}" ++ export CXX="clang++ --gcc-install-dir=${GCC_INSTALL_DIR}" ++ export FC="flang" ++ export F77="flang" ++ export F90="flang" + export LC_ALL=C + fi + + if [ ! -v fjenv_clean -a ! -v fjenv_download ]; then + if [ "$fjenv_use_fcc" = "true" ]; then +- export LD_LIBRARY_PATH=${TCSDS_PATH}/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH} ++ export LD_LIBRARY_PATH=${OpenBLAS_HOME}/lib:${TCSDS_PATH}/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH} + fi + if [ ! -z "$PREFIX" ]; then + PATH=${TCSDS_PATH}/bin:${PREFIX}/bin:${PATH} +diff --git a/scripts/fujitsu/horovod.patch b/scripts/fujitsu/horovod.patch +index ace3ba5866..c0b1f5b63b 100644 +--- a/scripts/fujitsu/horovod.patch ++++ b/scripts/fujitsu/horovod.patch +@@ -1,6 +1,3 @@ +-# +-# patch for v0.26.1 (Oct-14 2022, 34604870eabd9dc670c222deb1da9acc6b9d7c03) +-# + diff --git a/examples/pytorch/pytorch_synthetic_benchmark.py b/examples/pytorch/pytorch_synthetic_benchmark.py + index d645a20..a3c838f 100644 + --- a/examples/pytorch/pytorch_synthetic_benchmark.py +@@ -60,6 +57,24 @@ index d645a20..a3c838f 100644 + loss.backward() + optimizer.step() + ++diff --git a/horovod/torch/CMakeLists.txt b/horovod/torch/CMakeLists.txt ++index eecd198..04816b5 100644 ++--- a/horovod/torch/CMakeLists.txt +++++ b/horovod/torch/CMakeLists.txt ++@@ -63,9 +63,12 @@ endif() ++ parse_version(${Pytorch_VERSION} VERSION_DEC) ++ add_definitions(-DPYTORCH_VERSION=${VERSION_DEC} -DTORCH_API_INCLUDE_EXTENSION_H=1) ++ set(Pytorch_CXX11 ${Pytorch_CXX11} PARENT_SCOPE) ++-if(NOT Pytorch_VERSION VERSION_LESS "1.5.0") +++if(Pytorch_VERSION VERSION_GREATER_EQUAL "1.5.0" AND Pytorch_VERSION VERSION_LESS "2.0.0") ++ 
set(CMAKE_CXX_STANDARD 14) ++ endif() +++if(Pytorch_VERSION VERSION_GREATER_EQUAL "2.0.0") +++ set(CMAKE_CXX_STANDARD 17) +++endif() ++ ++ # PyTorch SOURCES ++ # Later versions of PyTorch that use ROCm's hipify step will rename files. + diff --git a/horovod/torch/mpi_ops.py b/horovod/torch/mpi_ops.py + index ab764c5..b78a108 100644 + --- a/horovod/torch/mpi_ops.py +diff --git a/scripts/fujitsu/vision.patch b/scripts/fujitsu/vision.patch +deleted file mode 100644 +index 3a1b5da138..0000000000 +--- a/scripts/fujitsu/vision.patch ++++ /dev/null +@@ -1,498 +0,0 @@ +-# +-# patch for v0.14.1 (Dec-8 2022, 5e8e2f125f140d1e908cf424a6a85cacad758125) +-# +-diff --git a/setup.py b/setup.py +-index 9519890..4a09c3f 100644 +---- a/setup.py +-+++ b/setup.py +-@@ -209,6 +209,17 @@ def get_extensions(): +- define_macros += [("USE_PYTHON", None)] +- extra_compile_args["cxx"].append("/MP") +- +-+ # As long as torch is utilizing OpenMP, +-+ # FCC requires -fopenmp for all submodules even though it doesn't use OpenMP. 
+-+ if torch.has_openmp: +-+ if sys.platform == 'linux': +-+ try: +-+ extra_compile_args['cxx'].append('-fopenmp') +-+ except KeyError: +-+ extra_compile_args = { +-+ 'cxx': ['-fopenmp'] +-+ } +-+ +- if debug_mode: +- print("Compiling in debug mode") +- extra_compile_args["cxx"].append("-g") +-diff --git a/torchvision/csrc/ops/cpu/nms_kernel.cpp b/torchvision/csrc/ops/cpu/nms_kernel.cpp +-index c54d1f0..369b6a9 100644 +---- a/torchvision/csrc/ops/cpu/nms_kernel.cpp +-+++ b/torchvision/csrc/ops/cpu/nms_kernel.cpp +-@@ -20,13 +20,6 @@ at::Tensor nms_kernel_impl( +- if (dets.numel() == 0) +- return at::empty({0}, dets.options().dtype(at::kLong)); +- +-- auto x1_t = dets.select(1, 0).contiguous(); +-- auto y1_t = dets.select(1, 1).contiguous(); +-- auto x2_t = dets.select(1, 2).contiguous(); +-- auto y2_t = dets.select(1, 3).contiguous(); +-- +-- at::Tensor areas_t = (x2_t - x1_t) * (y2_t - y1_t); +-- +- auto order_t = std::get<1>( +- scores.sort(/*stable=*/true, /*dim=*/0, /* descending=*/true)); +- +-@@ -34,6 +27,15 @@ at::Tensor nms_kernel_impl( +- at::Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte)); +- at::Tensor keep_t = at::zeros({ndets}, dets.options().dtype(at::kLong)); +- +-+ auto dets_sorted = dets.index_select(0, order_t); +-+ +-+ auto x1_t = dets_sorted.select(1, 0).contiguous(); +-+ auto y1_t = dets_sorted.select(1, 1).contiguous(); +-+ auto x2_t = dets_sorted.select(1, 2).contiguous(); +-+ auto y2_t = dets_sorted.select(1, 3).contiguous(); +-+ +-+ at::Tensor areas_t = (x2_t - x1_t) * (y2_t - y1_t); +-+ +- auto suppressed = suppressed_t.data_ptr(); +- auto keep = keep_t.data_ptr(); +- auto order = order_t.data_ptr(); +-@@ -45,19 +47,16 @@ at::Tensor nms_kernel_impl( +- +- int64_t num_to_keep = 0; +- +-- for (int64_t _i = 0; _i < ndets; _i++) { +-- auto i = order[_i]; +-+ for (int64_t i = 0; i < ndets; i++) { +- if (suppressed[i] == 1) +- continue; +-- keep[num_to_keep++] = i; +- auto ix1 = x1[i]; +- auto iy1 = y1[i]; +- auto ix2 = 
x2[i]; +- auto iy2 = y2[i]; +- auto iarea = areas[i]; +- +-- for (int64_t _j = _i + 1; _j < ndets; _j++) { +-- auto j = order[_j]; +-+ for (int64_t j = i + 1; j < ndets; j++) { +- if (suppressed[j] == 1) +- continue; +- auto xx1 = std::max(ix1, x1[j]); +-@@ -73,6 +72,11 @@ at::Tensor nms_kernel_impl( +- suppressed[j] = 1; +- } +- } +-+ for (int64_t i = 0; i < ndets; i++) { +-+ if (suppressed[i] == 1) +-+ continue; +-+ keep[num_to_keep++] = order[i]; +-+ } +- return keep_t.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep); +- } +- +-diff --git a/torchvision/csrc/ops/cpu/roi_align_kernel.cpp b/torchvision/csrc/ops/cpu/roi_align_kernel.cpp +-index e6684e9..d4c6b0e 100644 +---- a/torchvision/csrc/ops/cpu/roi_align_kernel.cpp +-+++ b/torchvision/csrc/ops/cpu/roi_align_kernel.cpp +-@@ -1,4 +1,5 @@ +- #include +-+#include +- #include +- +- #include "./roi_align_common.h" +-@@ -24,87 +25,89 @@ void roi_align_forward_kernel_impl( +- T* output) { +- // (n, c, ph, pw) is an element in the pooled output +- // can be parallelized using omp +-- // #pragma omp parallel for num_threads(32) +-- for (int n = 0; n < n_rois; n++) { +-- int index_n = n * channels * pooled_width * pooled_height; +-- +-- const T* offset_rois = rois + n * 5; +-- int roi_batch_ind = offset_rois[0]; +-- +-- // Do not using rounding; this implementation detail is critical +-- T offset = aligned ? 
(T)0.5 : (T)0.0; +-- T roi_start_w = offset_rois[1] * spatial_scale - offset; +-- T roi_start_h = offset_rois[2] * spatial_scale - offset; +-- T roi_end_w = offset_rois[3] * spatial_scale - offset; +-- T roi_end_h = offset_rois[4] * spatial_scale - offset; +-- +-- T roi_width = roi_end_w - roi_start_w; +-- T roi_height = roi_end_h - roi_start_h; +-- if (!aligned) { +-- // Force malformed ROIs to be 1x1 +-- roi_width = std::max(roi_width, (T)1.); +-- roi_height = std::max(roi_height, (T)1.); +-- } +-- +-- T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); +-- T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); +-- +-- // We use roi_bin_grid to sample the grid and mimic integral +-- int roi_bin_grid_h = (sampling_ratio > 0) +-- ? sampling_ratio +-- : ceil(roi_height / pooled_height); // e.g., = 2 +-- int roi_bin_grid_w = +-- (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); +-- +-- // We do average (integral) pooling inside a bin +-- // When the grid is empty, output zeros. +-- const T count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. 
= 4 +-- +-- // we want to precalculate indices and weights shared by all chanels, +-- // this is the key point of optimization +-- std::vector> pre_calc( +-- roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height); +-- detail::pre_calc_for_bilinear_interpolate( +-- height, +-- width, +-- pooled_height, +-- pooled_width, +-- roi_start_h, +-- roi_start_w, +-- bin_size_h, +-- bin_size_w, +-- roi_bin_grid_h, +-- roi_bin_grid_w, +-- pre_calc); +-- +-- for (int c = 0; c < channels; c++) { +-- int index_n_c = index_n + c * pooled_width * pooled_height; +-- const T* offset_input = +-- input + (roi_batch_ind * channels + c) * height * width; +-- int pre_calc_index = 0; +-- +-- for (int ph = 0; ph < pooled_height; ph++) { +-- for (int pw = 0; pw < pooled_width; pw++) { +-- int index = index_n_c + ph * pooled_width + pw; +-- +-- T output_val = 0.; +-- for (int iy = 0; iy < roi_bin_grid_h; iy++) { +-- for (int ix = 0; ix < roi_bin_grid_w; ix++) { +-- detail::PreCalc pc = pre_calc[pre_calc_index]; +-- output_val += pc.w1 * offset_input[pc.pos1] + +-- pc.w2 * offset_input[pc.pos2] + +-- pc.w3 * offset_input[pc.pos3] + pc.w4 * offset_input[pc.pos4]; +-- +-- pre_calc_index += 1; +-- } +-- } +-- output_val /= count; // Average pooling +-- +-- output[index] = output_val; +-- } // for pw +-- } // for ph +-- } // for c +-- } // for n +-+ int grain_size = ceil(n_rois / at::get_num_threads()); +-+ at::parallel_for(0, n_rois, grain_size, [&](int64_t start, int64_t end) { +-+ for (int n = start; n < end; n++) { +-+ int index_n = n * channels * pooled_width * pooled_height; +-+ +-+ const T* offset_rois = rois + n * 5; +-+ int roi_batch_ind = offset_rois[0]; +-+ +-+ // Do not using rounding; this implementation detail is critical +-+ T offset = aligned ? 
(T)0.5 : (T)0.0; +-+ T roi_start_w = offset_rois[1] * spatial_scale - offset; +-+ T roi_start_h = offset_rois[2] * spatial_scale - offset; +-+ T roi_end_w = offset_rois[3] * spatial_scale - offset; +-+ T roi_end_h = offset_rois[4] * spatial_scale - offset; +-+ +-+ T roi_width = roi_end_w - roi_start_w; +-+ T roi_height = roi_end_h - roi_start_h; +-+ if (!aligned) { +-+ // Force malformed ROIs to be 1x1 +-+ roi_width = std::max(roi_width, (T)1.); +-+ roi_height = std::max(roi_height, (T)1.); +-+ } +-+ +-+ T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); +-+ T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); +-+ +-+ // We use roi_bin_grid to sample the grid and mimic integral +-+ int roi_bin_grid_h = (sampling_ratio > 0) +-+ ? sampling_ratio +-+ : ceil(roi_height / pooled_height); // e.g., = 2 +-+ int roi_bin_grid_w = +-+ (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); +-+ +-+ // We do average (integral) pooling inside a bin +-+ // When the grid is empty, output zeros. +-+ const T count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. 
= 4 +-+ +-+ // we want to precalculate indices and weights shared by all chanels, +-+ // this is the key point of optimization +-+ std::vector> pre_calc( +-+ roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height); +-+ detail::pre_calc_for_bilinear_interpolate( +-+ height, +-+ width, +-+ pooled_height, +-+ pooled_width, +-+ roi_start_h, +-+ roi_start_w, +-+ bin_size_h, +-+ bin_size_w, +-+ roi_bin_grid_h, +-+ roi_bin_grid_w, +-+ pre_calc); +-+ +-+ for (int c = 0; c < channels; c++) { +-+ int index_n_c = index_n + c * pooled_width * pooled_height; +-+ const T* offset_input = +-+ input + (roi_batch_ind * channels + c) * height * width; +-+ int pre_calc_index = 0; +-+ +-+ for (int ph = 0; ph < pooled_height; ph++) { +-+ for (int pw = 0; pw < pooled_width; pw++) { +-+ int index = index_n_c + ph * pooled_width + pw; +-+ +-+ T output_val = 0.; +-+ for (int iy = 0; iy < roi_bin_grid_h; iy++) { +-+ for (int ix = 0; ix < roi_bin_grid_w; ix++) { +-+ detail::PreCalc pc = pre_calc[pre_calc_index]; +-+ output_val += pc.w1 * offset_input[pc.pos1] + +-+ pc.w2 * offset_input[pc.pos2] + +-+ pc.w3 * offset_input[pc.pos3] + pc.w4 * offset_input[pc.pos4]; +-+ +-+ pre_calc_index += 1; +-+ } +-+ } +-+ output_val /= count; // Average pooling +-+ +-+ output[index] = output_val; +-+ } // for pw +-+ } // for ph +-+ } // for c +-+ } // for n +-+ }); +- } +- +- template +-@@ -183,100 +186,105 @@ void roi_align_backward_kernel_impl( +- int pooled_width, +- int sampling_ratio, +- bool aligned, +-- T* grad_input, +-+ const int64_t grad_input_size, +-+ T* grad_input_buffer, +- const T* rois, +- int n_stride, +- int c_stride, +- int h_stride, +- int w_stride) { +-- for (int index = 0; index < nthreads; index++) { +-- // (n, c, ph, pw) is an element in the pooled output +-- int pw = index % pooled_width; +-- int ph = (index / pooled_width) % pooled_height; +-- int c = (index / pooled_width / pooled_height) % channels; +-- int n = index / pooled_width / pooled_height / channels; +-- +-- const 
T* offset_rois = rois + n * 5; +-- int roi_batch_ind = offset_rois[0]; +-- +-- // Do not using rounding; this implementation detail is critical +-- T offset = aligned ? (T)0.5 : (T)0.0; +-- T roi_start_w = offset_rois[1] * spatial_scale - offset; +-- T roi_start_h = offset_rois[2] * spatial_scale - offset; +-- T roi_end_w = offset_rois[3] * spatial_scale - offset; +-- T roi_end_h = offset_rois[4] * spatial_scale - offset; +-- +-- T roi_width = roi_end_w - roi_start_w; +-- T roi_height = roi_end_h - roi_start_h; +-- if (!aligned) { +-- // Force malformed ROIs to be 1x1 +-- roi_width = std::max(roi_width, (T)1.); +-- roi_height = std::max(roi_height, (T)1.); +-- } +-- +-- T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); +-- T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); +-- +-- T* offset_grad_input = +-- grad_input + ((roi_batch_ind * channels + c) * height * width); +-- +-- int output_offset = n * n_stride + c * c_stride; +-- const T* offset_grad_output = grad_output + output_offset; +-- const T grad_output_this_bin = +-- offset_grad_output[ph * h_stride + pw * w_stride]; +-- +-- // We use roi_bin_grid to sample the grid and mimic integral +-- int roi_bin_grid_h = (sampling_ratio > 0) +-- ? sampling_ratio +-- : ceil(roi_height / pooled_height); // e.g., = 2 +-- int roi_bin_grid_w = +-- (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); +-- +-- // We do average (integral) pooling inside a bin +-- const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. 
= 4 +-- +-- for (int iy = 0; iy < roi_bin_grid_h; iy++) { +-- const T y = roi_start_h + ph * bin_size_h + +-- static_cast(iy + .5f) * bin_size_h / +-- static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 +-- for (int ix = 0; ix < roi_bin_grid_w; ix++) { +-- const T x = roi_start_w + pw * bin_size_w + +-- static_cast(ix + .5f) * bin_size_w / +-- static_cast(roi_bin_grid_w); +-- +-- T w1, w2, w3, w4; +-- int x_low, x_high, y_low, y_high; +-- +-- bilinear_interpolate_gradient( +-- height, +-- width, +-- y, +-- x, +-- w1, +-- w2, +-- w3, +-- w4, +-- x_low, +-- x_high, +-- y_low, +-- y_high, +-- index); +-- +-- T g1 = grad_output_this_bin * w1 / count; +-- T g2 = grad_output_this_bin * w2 / count; +-- T g3 = grad_output_this_bin * w3 / count; +-- T g4 = grad_output_this_bin * w4 / count; +-- +-- if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { +-- // atomic add is not needed for now since it is single threaded +-- add(offset_grad_input + y_low * width + x_low, static_cast(g1)); +-- add(offset_grad_input + y_low * width + x_high, static_cast(g2)); +-- add(offset_grad_input + y_high * width + x_low, static_cast(g3)); +-- add(offset_grad_input + y_high * width + x_high, static_cast(g4)); +-- } // if +-- } // ix +-- } // iy +-- } // for +-+ int grain_size = ceil(nthreads / at::get_num_threads()); +-+ at::parallel_for(0, nthreads, grain_size, [&](int64_t start, int64_t end) { +-+ for (int index = start; index < end; index++) { +-+ int thread_no = at::get_thread_num(); +-+ // (n, c, ph, pw) is an element in the pooled output +-+ int pw = index % pooled_width; +-+ int ph = (index / pooled_width) % pooled_height; +-+ int c = (index / pooled_width / pooled_height) % channels; +-+ int n = index / pooled_width / pooled_height / channels; +-+ +-+ const T* offset_rois = rois + n * 5; +-+ int roi_batch_ind = offset_rois[0]; +-+ +-+ // Do not using rounding; this implementation detail is critical +-+ T offset = aligned ? 
(T)0.5 : (T)0.0; +-+ T roi_start_w = offset_rois[1] * spatial_scale - offset; +-+ T roi_start_h = offset_rois[2] * spatial_scale - offset; +-+ T roi_end_w = offset_rois[3] * spatial_scale - offset; +-+ T roi_end_h = offset_rois[4] * spatial_scale - offset; +-+ +-+ T roi_width = roi_end_w - roi_start_w; +-+ T roi_height = roi_end_h - roi_start_h; +-+ if (!aligned) { +-+ // Force malformed ROIs to be 1x1 +-+ roi_width = std::max(roi_width, (T)1.); +-+ roi_height = std::max(roi_height, (T)1.); +-+ } +-+ +-+ T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); +-+ T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); +-+ +-+ T* offset_grad_input = grad_input_buffer + (thread_no * grad_input_size) + +-+ ((roi_batch_ind * channels + c) * height * width); +-+ +-+ int output_offset = n * n_stride + c * c_stride; +-+ const T* offset_grad_output = grad_output + output_offset; +-+ const T grad_output_this_bin = +-+ offset_grad_output[ph * h_stride + pw * w_stride]; +-+ +-+ // We use roi_bin_grid to sample the grid and mimic integral +-+ int roi_bin_grid_h = (sampling_ratio > 0) +-+ ? sampling_ratio +-+ : ceil(roi_height / pooled_height); // e.g., = 2 +-+ int roi_bin_grid_w = +-+ (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); +-+ +-+ // We do average (integral) pooling inside a bin +-+ const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. 
= 4 +-+ +-+ for (int iy = 0; iy < roi_bin_grid_h; iy++) { +-+ const T y = roi_start_h + ph * bin_size_h + +-+ static_cast(iy + .5f) * bin_size_h / +-+ static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 +-+ for (int ix = 0; ix < roi_bin_grid_w; ix++) { +-+ const T x = roi_start_w + pw * bin_size_w + +-+ static_cast(ix + .5f) * bin_size_w / +-+ static_cast(roi_bin_grid_w); +-+ +-+ T w1, w2, w3, w4; +-+ int x_low, x_high, y_low, y_high; +-+ +-+ bilinear_interpolate_gradient( +-+ height, +-+ width, +-+ y, +-+ x, +-+ w1, +-+ w2, +-+ w3, +-+ w4, +-+ x_low, +-+ x_high, +-+ y_low, +-+ y_high, +-+ index); +-+ +-+ T g1 = grad_output_this_bin * w1 / count; +-+ T g2 = grad_output_this_bin * w2 / count; +-+ T g3 = grad_output_this_bin * w3 / count; +-+ T g4 = grad_output_this_bin * w4 / count; +-+ +-+ if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { +-+ // atomic add is not needed for now since it is single threaded +-+ add(offset_grad_input + y_low * width + x_low, static_cast(g1)); +-+ add(offset_grad_input + y_low * width + x_high, static_cast(g2)); +-+ add(offset_grad_input + y_high * width + x_low, static_cast(g3)); +-+ add(offset_grad_input + y_high * width + x_high, static_cast(g4)); +-+ } // if +-+ } // ix +-+ } // iy +-+ } // for +-+ }); +- } +- +- at::Tensor roi_align_forward_kernel( +-@@ -355,6 +363,9 @@ at::Tensor roi_align_backward_kernel( +- return grad_input; +- } +- +-+ const int num_threads = at::get_num_threads(); +-+ at::Tensor grad_input_buffer = +-+ at::zeros({num_threads, batch_size, channels, height, width}, grad.options()); +- // get stride values to ensure indexing into gradients is correct. 
+- int n_stride = grad.stride(0); +- int c_stride = grad.stride(1); +-@@ -375,13 +386,17 @@ at::Tensor roi_align_backward_kernel( +- pooled_width, +- sampling_ratio, +- aligned, +-- grad_input.data_ptr(), +-+ grad_input.numel(), +-+ grad_input_buffer.data_ptr(), +- rois_.data_ptr(), +- n_stride, +- c_stride, +- h_stride, +- w_stride); +- }); +-+ for (int64_t i = 0; i < num_threads; ++i) { +-+ grad_input.add_(grad_input_buffer.select(0, i)); +-+ } +- return grad_input; +- } +- +diff --git a/test/cpp/c10d/CMakeLists.txt b/test/cpp/c10d/CMakeLists.txt +index 285a5dd2a7..387d54835a 100644 +--- a/test/cpp/c10d/CMakeLists.txt ++++ b/test/cpp/c10d/CMakeLists.txt +@@ -20,6 +20,8 @@ function(c10d_add_test test_src) + $ + $ + ) ++ target_include_directories(${test_name} PRIVATE ${MPI_CXX_INCLUDE_PATH}) ++ target_link_libraries(${test_name} PRIVATE ${MPI_CXX_LIBRARIES}) + target_link_libraries(${test_name} PRIVATE + fmt::fmt-header-only + ${ARG_LINK_LIBRARIES} +@@ -83,7 +85,7 @@ if(USE_MPI AND USE_C10D_MPI) + # private headers of libtorch, which in turn include MPI. As a hacky + # alternative to making MPI a public dependency of libtorch, we make it + # a private dependency of the tests as well. 
+- c10d_add_test(ProcessGroupMPITest.cpp LINK_LIBRARIES torch_cpu MPI::MPI_CXX INSTALL_TEST ${INSTALL_TEST}) ++ c10d_add_test(ProcessGroupMPITest.cpp LINK_LIBRARIES torch_cpu ${MPI_CXX_LIBRARIES} INSTALL_TEST ${INSTALL_TEST}) + endif() + + if(LINUX AND USE_GLOO AND USE_C10D_GLOO) +diff --git a/test/cpp/nativert/CMakeLists.txt b/test/cpp/nativert/CMakeLists.txt +index 1b4752ed90..39a0f187bc 100644 +--- a/test/cpp/nativert/CMakeLists.txt ++++ b/test/cpp/nativert/CMakeLists.txt +@@ -62,6 +62,7 @@ set(NATIVERT_TEST_DEPENDENCIES torch gtest_main) + + target_link_libraries(test_nativert PRIVATE ${NATIVERT_TEST_DEPENDENCIES}) + target_link_libraries(test_nativert PRIVATE fmt::fmt-header-only) ++target_link_libraries(test_nativert PRIVATE -ldl) + target_include_directories(test_nativert PRIVATE ${ATen_CPU_INCLUDE}) + + if(USE_CUDA) +diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt +index 1632147f02..3fa6ec87fd 100644 +--- a/torch/CMakeLists.txt ++++ b/torch/CMakeLists.txt +@@ -294,7 +294,8 @@ if(USE_DISTRIBUTED) + endif() + # Same for MPI. + if(USE_MPI) +- list(APPEND TORCH_PYTHON_LINK_LIBRARIES MPI::MPI_CXX) ++ list(APPEND TORCH_PYTHON_LINK_LIBRARIES ${MPI_CXX_LIBRARIES}) ++ list(APPEND TORCH_PYTHON_INCLUDE_DIRECTORIES ${MPI_CXX_INCLUDE_PATH}) + endif() + list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_C10D) + diff --git a/llvm21.1.0/patch/tensorpipe.patch b/llvm21.1.0/patch/tensorpipe.patch new file mode 100644 index 0000000..87a2ba5 --- /dev/null +++ b/llvm21.1.0/patch/tensorpipe.patch @@ -0,0 +1,13 @@ +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 77df76d..bba7a14 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -10,6 +10,8 @@ project(tensorpipe LANGUAGES C CXX) + + set(CMAKE_CXX_STANDARD 17) + ++set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-missing-template-arg-list-after-template-kw") ++ + list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake") + + # Expose build options. 
diff --git a/run/mnist.py b/run/mnist.py new file mode 100644 index 0000000..29d81d6 --- /dev/null +++ b/run/mnist.py @@ -0,0 +1,145 @@ +from __future__ import print_function +import argparse +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +from torchvision import datasets, transforms +from torch.optim.lr_scheduler import StepLR + + +class Net(nn.Module): + def __init__(self): + super(Net, self).__init__() + self.conv1 = nn.Conv2d(1, 32, 3, 1) + self.conv2 = nn.Conv2d(32, 64, 3, 1) + self.dropout1 = nn.Dropout(0.25) + self.dropout2 = nn.Dropout(0.5) + self.fc1 = nn.Linear(9216, 128) + self.fc2 = nn.Linear(128, 10) + + def forward(self, x): + x = self.conv1(x) + x = F.relu(x) + x = self.conv2(x) + x = F.relu(x) + x = F.max_pool2d(x, 2) + x = self.dropout1(x) + x = torch.flatten(x, 1) + x = self.fc1(x) + x = F.relu(x) + x = self.dropout2(x) + x = self.fc2(x) + output = F.log_softmax(x, dim=1) + return output + + +def train(args, model, device, train_loader, optimizer, epoch): + model.train() + for batch_idx, (data, target) in enumerate(train_loader): + data, target = data.to(device), target.to(device) + optimizer.zero_grad() + output = model(data) + loss = F.nll_loss(output, target) + loss.backward() + optimizer.step() + if batch_idx % args.log_interval == 0: + print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( + epoch, batch_idx * len(data), len(train_loader.dataset), + 100. 
* batch_idx / len(train_loader), loss.item())) + if args.dry_run: + break + + +def test(model, device, test_loader): + model.eval() + test_loss = 0 + correct = 0 + with torch.no_grad(): + for data, target in test_loader: + data, target = data.to(device), target.to(device) + output = model(data) + test_loss += F.nll_loss(output, target, reduction='sum').item() # sum up batch loss + pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability + correct += pred.eq(target.view_as(pred)).sum().item() + + test_loss /= len(test_loader.dataset) + + print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format( + test_loss, correct, len(test_loader.dataset), + 100. * correct / len(test_loader.dataset))) + + +def main(): + # Training settings + parser = argparse.ArgumentParser(description='PyTorch MNIST Example') + parser.add_argument('--batch-size', type=int, default=64, metavar='N', + help='input batch size for training (default: 64)') + parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N', + help='input batch size for testing (default: 1000)') + parser.add_argument('--epochs', type=int, default=14, metavar='N', + help='number of epochs to train (default: 14)') + parser.add_argument('--lr', type=float, default=1.0, metavar='LR', + help='learning rate (default: 1.0)') + parser.add_argument('--gamma', type=float, default=0.7, metavar='M', + help='Learning rate step gamma (default: 0.7)') + parser.add_argument('--no-cuda', action='store_true', default=False, + help='disables CUDA training') + parser.add_argument('--no-mps', action='store_true', default=False, + help='disables macOS GPU training') + parser.add_argument('--dry-run', action='store_true', default=False, + help='quickly check a single pass') + parser.add_argument('--seed', type=int, default=1, metavar='S', + help='random seed (default: 1)') + parser.add_argument('--log-interval', type=int, default=10, metavar='N', + help='how many batches to wait 
before logging training status') + parser.add_argument('--save-model', action='store_true', default=False, + help='For Saving the current Model') + args = parser.parse_args() + use_cuda = not args.no_cuda and torch.cuda.is_available() + use_mps = not args.no_mps and torch.backends.mps.is_available() + + torch.manual_seed(args.seed) + + if use_cuda: + device = torch.device("cuda") + elif use_mps: + device = torch.device("mps") + else: + device = torch.device("cpu") + + train_kwargs = {'batch_size': args.batch_size} + test_kwargs = {'batch_size': args.test_batch_size} + if use_cuda: + cuda_kwargs = {'num_workers': 1, + 'pin_memory': True, + 'shuffle': True} + train_kwargs.update(cuda_kwargs) + test_kwargs.update(cuda_kwargs) + + transform=transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.1307,), (0.3081,)) + ]) + dataset1 = datasets.MNIST('../data', train=True, download=True, + transform=transform) + dataset2 = datasets.MNIST('../data', train=False, + transform=transform) + train_loader = torch.utils.data.DataLoader(dataset1,**train_kwargs) + test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs) + + model = Net().to(device) + optimizer = optim.Adadelta(model.parameters(), lr=args.lr) + + scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma) + for epoch in range(1, args.epochs + 1): + train(args, model, device, train_loader, optimizer, epoch) + test(model, device, test_loader) + scheduler.step() + + if args.save_model: + torch.save(model.state_dict(), "mnist_cnn.pt") + + +if __name__ == '__main__': + main() diff --git a/run/run1proc_mnist.sh b/run/run1proc_mnist.sh new file mode 100644 index 0000000..3ff138e --- /dev/null +++ b/run/run1proc_mnist.sh @@ -0,0 +1,36 @@ +#! 
/bin/bash
+#PJM -L "rscunit=rscunit_ft01,rscgrp=small"
+#PJM -L elapse=01:00:00
+#PJM -L "node=1"
+#PJM -x PJM_LLIO_GFSCACHE=/vol0004
+#PJM -j
+#PJM -S
+
+module purge
+
+set -euo pipefail
+
+script_basedir=$(cd $(dirname $0); pwd)
+source $script_basedir/env.src
+[ -v VENV_PATH ] && source $VENV_PATH/bin/activate
+
+set -x
+
+#export OMP_PROC_BIND=false
+export OMP_NUM_THREADS=48
+
+# For oneDNN debug
+# Output debug message (CSV) to stdout.
+# The message begins with 'dnnl_verbose,' which is the first entry in CSV.
+#export DNNL_VERBOSE=1 # 0: (no output), 1: (exec), 2: (1 + cache hit/miss)
+#export DNNL_VERBOSE_TIMESTAMP=1
+
+ulimit -s 8192
+
+if [ ${PMIX_RANK:-0} -eq 0 ]; then
+    env
+    pip3 list
+    KMP_SETTINGS=1 python3 -c "import torch; print(torch.__version__); print(torch.__config__.show()); print(torch.__config__.parallel_info())"
+fi
+
+LD_PRELOAD=$PREFIX/lib/libtcmalloc.so python3 -u mnist.py --epochs 2 --no-cuda --no-mps