From 6db751e8add3e118bbf58a0bdb289f89e8a84acc Mon Sep 17 00:00:00 2001 From: David Seddon Date: Thu, 24 Apr 2025 13:42:53 +0100 Subject: [PATCH 1/2] Add test for multiprocessing chunking --- tests/adaptors/modulefinder.py | 16 +++++++++ tests/unit/application/test_usecases.py | 44 ++++++++++++++++++++++++- 2 files changed, 59 insertions(+), 1 deletion(-) create mode 100644 tests/adaptors/modulefinder.py diff --git a/tests/adaptors/modulefinder.py b/tests/adaptors/modulefinder.py new file mode 100644 index 00000000..2a29cff0 --- /dev/null +++ b/tests/adaptors/modulefinder.py @@ -0,0 +1,16 @@ +from grimp.application.ports.modulefinder import AbstractModuleFinder, FoundPackage, ModuleFile +from grimp.application.ports.filesystem import AbstractFileSystem +from typing import FrozenSet, Dict + + +class BaseFakeModuleFinder(AbstractModuleFinder): + module_files_by_package_name: Dict[str, FrozenSet[ModuleFile]] = {} + + def find_package( + self, package_name: str, package_directory: str, file_system: AbstractFileSystem + ) -> FoundPackage: + return FoundPackage( + name=package_name, + directory=package_directory, + module_files=self.module_files_by_package_name[package_name], + ) diff --git a/tests/unit/application/test_usecases.py b/tests/unit/application/test_usecases.py index 35d07206..d421560e 100644 --- a/tests/unit/application/test_usecases.py +++ b/tests/unit/application/test_usecases.py @@ -1,6 +1,7 @@ from typing import Dict, Optional, Set -from unittest.mock import sentinel +from unittest.mock import sentinel, patch +import joblib # type: ignore import pytest # type: ignore from grimp.application import usecases @@ -9,6 +10,7 @@ from grimp.domain.valueobjects import DirectImport, Module from tests.adaptors.filesystem import FakeFileSystem from tests.adaptors.packagefinder import BaseFakePackageFinder +from tests.adaptors.modulefinder import BaseFakeModuleFinder from tests.config import override_settings @@ -131,3 +133,43 @@ def write( if supplied_cache_dir is not sentinel.not_supplied: kwargs["cache_dir"] = supplied_cache_dir usecases.build_graph("mypackage", **kwargs) + + @patch.object(usecases, "_scan_chunks", return_value={}) + @patch.object(joblib, "cpu_count", return_value=8) + @pytest.mark.parametrize( + "number_of_modules, expected_number_of_chunks", + [ + (49, 1), # Below threshold - just use one. + (50, 8), # At threshold - use number of CPUs. + (1000, 8), # Above threshold - use number of CPUs. + ], + ) + def test_scanning_multiprocessing_respects_min_number_of_modules( + self, mock_cpu_count, mock_scan_chunks, number_of_modules, expected_number_of_chunks + ): + class FakePackageFinder(BaseFakePackageFinder): + directory_map = {"mypackage": "/path/to/mypackage"} + + class FakeModuleFinder(BaseFakeModuleFinder): + module_files_by_package_name = { + "mypackage": frozenset( + { + ModuleFile( + module=Module(f"mypackage.mod_{i}"), + mtime=999, + ) + for i in range(number_of_modules) + } + ) + } + + with override_settings( + FILE_SYSTEM=FakeFileSystem(), + PACKAGE_FINDER=FakePackageFinder(), + MODULE_FINDER=FakeModuleFinder(), + ): + usecases.build_graph("mypackage", cache_dir=None) + + [call] = mock_scan_chunks.call_args_list + chunks = call.args[0] + assert len(chunks) == expected_number_of_chunks From f54b9a11d40ee23fbaca4c75390370bec5bba4f6 Mon Sep 17 00:00:00 2001 From: David Seddon Date: Thu, 24 Apr 2025 13:59:02 +0100 Subject: [PATCH 2/2] Allow control of multiprocessing using env var This provides the ability to adjust the arbitrary cut off, or turn off multiprocessing altogether. --- CHANGELOG.rst | 6 +++ docs/usage.rst | 7 ++- src/grimp/application/usecases.py | 15 +++++- tests/functional/test_build_and_use_graph.py | 3 +- tests/unit/application/test_usecases.py | 49 +++++++++++++++++--- 5 files changed, 69 insertions(+), 11 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 4761e7e9..345eb3c9 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -2,6 +2,12 @@ Changelog ========= +latest +------ + +* Provide more control of multiprocessing via ``GRIMP_MIN_MULTIPROCESSING_MODULES`` + environment variable. + 3.8.1 (2025-04-23) ------------------ diff --git a/docs/usage.rst b/docs/usage.rst index ac463b2a..c342f06d 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -82,7 +82,12 @@ Building the graph :param str, optional cache_dir: The directory to use for caching the graph. Defaults to ``.grimp_cache``. To disable caching, pass ``None``. See :doc:`caching`. :return: An import graph that you can use to analyse the package. - :rtype: ImportGraph + :rtype: ``ImportGraph`` + + This method uses multiple operating system processes to build the graph, if the number of modules to scan (not + including modules in the cache) is 50 or more. This threshold can be adjusted by setting the ``GRIMP_MIN_MULTIPROCESSING_MODULES`` + environment variable to a different number. To disable multiprocessing altogether, set it to a large number (more than + the number of modules in the codebase being analyzed). .. _typing module documentation: https://docs.python.org/3/library/typing.html#typing.TYPE_CHECKING diff --git a/src/grimp/application/usecases.py b/src/grimp/application/usecases.py index 606befd1..63f70a05 100644 --- a/src/grimp/application/usecases.py +++ b/src/grimp/application/usecases.py @@ -15,14 +15,19 @@ from ..application.ports.packagefinder import AbstractPackageFinder from ..domain.valueobjects import DirectImport, Module from .config import settings +import os class NotSupplied: pass +# Calling code can set this environment variable if it wants to tune when to switch to +# multiprocessing, or set it to a large number to disable it altogether. +MIN_NUMBER_OF_MODULES_TO_SCAN_USING_MULTIPROCESSING_ENV_NAME = "GRIMP_MIN_MULTIPROCESSING_MODULES" # This is an arbitrary number, but setting it too low slows down our functional tests considerably. -MIN_NUMBER_OF_MODULES_TO_SCAN_USING_MULTIPROCESSING = 50 +# If you change this, update docs/usage.rst too! +DEFAULT_MIN_NUMBER_OF_MODULES_TO_SCAN_USING_MULTIPROCESSING = 50 def build_graph( @@ -238,7 +243,13 @@ def _create_chunks(module_files: Collection[ModuleFile]) -> tuple[tuple[ModuleFi def _decide_number_of_processes(number_of_module_files: int) -> int: - if number_of_module_files < MIN_NUMBER_OF_MODULES_TO_SCAN_USING_MULTIPROCESSING: + min_number_of_modules = int( + os.environ.get( + MIN_NUMBER_OF_MODULES_TO_SCAN_USING_MULTIPROCESSING_ENV_NAME, + DEFAULT_MIN_NUMBER_OF_MODULES_TO_SCAN_USING_MULTIPROCESSING, + ) + ) + if number_of_module_files < min_number_of_modules: # Don't incur the overhead of multiple processes. return 1 return min(joblib.cpu_count(), number_of_module_files) diff --git a/tests/functional/test_build_and_use_graph.py b/tests/functional/test_build_and_use_graph.py index 76b728e7..e061a82e 100644 --- a/tests/functional/test_build_and_use_graph.py +++ b/tests/functional/test_build_and_use_graph.py @@ -4,6 +4,7 @@ from unittest.mock import patch from grimp.application import usecases + """ For ease of reference, these are the imports of all the files: @@ -55,7 +56,7 @@ def test_modules(): } -@patch.object(usecases, "MIN_NUMBER_OF_MODULES_TO_SCAN_USING_MULTIPROCESSING", 0) +@patch.object(usecases, "DEFAULT_MIN_NUMBER_OF_MODULES_TO_SCAN_USING_MULTIPROCESSING", 0) def test_modules_multiprocessing(): """ This test runs relatively slowly, but it's important we cover the multiprocessing code. diff --git a/tests/unit/application/test_usecases.py b/tests/unit/application/test_usecases.py index d421560e..a8f14207 100644 --- a/tests/unit/application/test_usecases.py +++ b/tests/unit/application/test_usecases.py @@ -1,3 +1,4 @@ +import os from typing import Dict, Optional, Set from unittest.mock import sentinel, patch @@ -13,6 +14,8 @@ from tests.adaptors.modulefinder import BaseFakeModuleFinder from tests.config import override_settings +SOME_CPU_COUNT = 8 + class TestBuildGraph: @pytest.mark.parametrize("include_external_packages", (True, False)) @@ -135,17 +138,49 @@ def write( usecases.build_graph("mypackage", **kwargs) @patch.object(usecases, "_scan_chunks", return_value={}) - @patch.object(joblib, "cpu_count", return_value=8) + @patch.object(joblib, "cpu_count", return_value=SOME_CPU_COUNT) @pytest.mark.parametrize( - "number_of_modules, expected_number_of_chunks", + "number_of_modules, fake_environ, expected_number_of_chunks", [ - (49, 1), # Below threshold - just use one. - (50, 8), # At threshold - use number of CPUs. - (1000, 8), # Above threshold - use number of CPUs. + ( + usecases.DEFAULT_MIN_NUMBER_OF_MODULES_TO_SCAN_USING_MULTIPROCESSING - 1, + {}, + 1, + ), + ( + usecases.DEFAULT_MIN_NUMBER_OF_MODULES_TO_SCAN_USING_MULTIPROCESSING, + {}, + SOME_CPU_COUNT, + ), + ( + usecases.DEFAULT_MIN_NUMBER_OF_MODULES_TO_SCAN_USING_MULTIPROCESSING + 1, + {}, + SOME_CPU_COUNT, + ), + ( + 149, + {usecases.MIN_NUMBER_OF_MODULES_TO_SCAN_USING_MULTIPROCESSING_ENV_NAME: 150}, + 1, + ), + ( + 150, + {usecases.MIN_NUMBER_OF_MODULES_TO_SCAN_USING_MULTIPROCESSING_ENV_NAME: 150}, + SOME_CPU_COUNT, + ), + ( + 151, + {usecases.MIN_NUMBER_OF_MODULES_TO_SCAN_USING_MULTIPROCESSING_ENV_NAME: 150}, + SOME_CPU_COUNT, + ), ], ) def test_scanning_multiprocessing_respects_min_number_of_modules( - self, mock_cpu_count, mock_scan_chunks, number_of_modules, expected_number_of_chunks + self, + mock_cpu_count, + mock_scan_chunks, + number_of_modules, + fake_environ, + expected_number_of_chunks, ): class FakePackageFinder(BaseFakePackageFinder): directory_map = {"mypackage": "/path/to/mypackage"} @@ -167,7 +202,7 @@ class FakeModuleFinder(BaseFakeModuleFinder): FILE_SYSTEM=FakeFileSystem(), PACKAGE_FINDER=FakePackageFinder(), MODULE_FINDER=FakeModuleFinder(), - ): + ), patch.object(os, "environ", fake_environ): usecases.build_graph("mypackage", cache_dir=None) [call] = mock_scan_chunks.call_args_list