diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 4761e7e9..345eb3c9 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -2,6 +2,12 @@ Changelog ========= +latest +------ + +* Provide more control of multiprocessing via ``GRIMP_MIN_MULTIPROCESSING_MODULES`` + environment variable. + 3.8.1 (2025-04-23) ------------------ diff --git a/docs/usage.rst b/docs/usage.rst index ac463b2a..c342f06d 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -82,7 +82,12 @@ Building the graph :param str, optional cache_dir: The directory to use for caching the graph. Defaults to ``.grimp_cache``. To disable caching, pass ``None``. See :doc:`caching`. :return: An import graph that you can use to analyse the package. - :rtype: ImportGraph + :rtype: ``ImportGraph`` + + This method uses multiple operating system processes to build the graph, if the number of modules to scan (not + including modules in the cache) is 50 or more. This threshold can be adjusted by setting the ``GRIMP_MIN_MULTIPROCESSING_MODULES`` + environment variable to a different number. To disable multiprocessing altogether, set it to a large number (more than + the number of modules in the codebase being analyzed). .. _typing module documentation: https://docs.python.org/3/library/typing.html#typing.TYPE_CHECKING diff --git a/src/grimp/application/usecases.py b/src/grimp/application/usecases.py index 606befd1..63f70a05 100644 --- a/src/grimp/application/usecases.py +++ b/src/grimp/application/usecases.py @@ -15,14 +15,19 @@ from ..application.ports.packagefinder import AbstractPackageFinder from ..domain.valueobjects import DirectImport, Module from .config import settings +import os class NotSupplied: pass +# Calling code can set this environment variable if it wants to tune when to switch to +# multiprocessing, or set it to a large number to disable it altogether. +MIN_NUMBER_OF_MODULES_TO_SCAN_USING_MULTIPROCESSING_ENV_NAME = "GRIMP_MIN_MULTIPROCESSING_MODULES" # This is an arbitrary number, but setting it too low slows down our functional tests considerably. -MIN_NUMBER_OF_MODULES_TO_SCAN_USING_MULTIPROCESSING = 50 +# If you change this, update docs/usage.rst too! +DEFAULT_MIN_NUMBER_OF_MODULES_TO_SCAN_USING_MULTIPROCESSING = 50 def build_graph( @@ -238,7 +243,13 @@ def _create_chunks(module_files: Collection[ModuleFile]) -> tuple[tuple[ModuleFi def _decide_number_of_processes(number_of_module_files: int) -> int: - if number_of_module_files < MIN_NUMBER_OF_MODULES_TO_SCAN_USING_MULTIPROCESSING: + min_number_of_modules = int( + os.environ.get( + MIN_NUMBER_OF_MODULES_TO_SCAN_USING_MULTIPROCESSING_ENV_NAME, + DEFAULT_MIN_NUMBER_OF_MODULES_TO_SCAN_USING_MULTIPROCESSING, + ) + ) + if number_of_module_files < min_number_of_modules: # Don't incur the overhead of multiple processes. return 1 return min(joblib.cpu_count(), number_of_module_files) diff --git a/tests/adaptors/modulefinder.py b/tests/adaptors/modulefinder.py new file mode 100644 index 00000000..2a29cff0 --- /dev/null +++ b/tests/adaptors/modulefinder.py @@ -0,0 +1,16 @@ +from grimp.application.ports.modulefinder import AbstractModuleFinder, FoundPackage, ModuleFile +from grimp.application.ports.filesystem import AbstractFileSystem +from typing import FrozenSet, Dict + + +class BaseFakeModuleFinder(AbstractModuleFinder): + module_files_by_package_name: Dict[str, FrozenSet[ModuleFile]] = {} + + def find_package( + self, package_name: str, package_directory: str, file_system: AbstractFileSystem + ) -> FoundPackage: + return FoundPackage( + name=package_name, + directory=package_directory, + module_files=self.module_files_by_package_name[package_name], + ) diff --git a/tests/functional/test_build_and_use_graph.py b/tests/functional/test_build_and_use_graph.py index 76b728e7..e061a82e 100644 --- a/tests/functional/test_build_and_use_graph.py +++ b/tests/functional/test_build_and_use_graph.py @@ -4,6 +4,7 @@ from unittest.mock import patch from grimp.application import usecases + """ For ease of reference, these are the imports of all the files: @@ -55,7 +56,7 @@ def test_modules(): } -@patch.object(usecases, "MIN_NUMBER_OF_MODULES_TO_SCAN_USING_MULTIPROCESSING", 0) +@patch.object(usecases, "DEFAULT_MIN_NUMBER_OF_MODULES_TO_SCAN_USING_MULTIPROCESSING", 0) def test_modules_multiprocessing(): """ This test runs relatively slowly, but it's important we cover the multiprocessing code. diff --git a/tests/unit/application/test_usecases.py b/tests/unit/application/test_usecases.py index 35d07206..a8f14207 100644 --- a/tests/unit/application/test_usecases.py +++ b/tests/unit/application/test_usecases.py @@ -1,6 +1,8 @@ +import os from typing import Dict, Optional, Set -from unittest.mock import sentinel +from unittest.mock import sentinel, patch +import joblib # type: ignore import pytest # type: ignore from grimp.application import usecases @@ -9,8 +11,11 @@ from grimp.domain.valueobjects import DirectImport, Module from tests.adaptors.filesystem import FakeFileSystem from tests.adaptors.packagefinder import BaseFakePackageFinder +from tests.adaptors.modulefinder import BaseFakeModuleFinder from tests.config import override_settings +SOME_CPU_COUNT = 8 + class TestBuildGraph: @pytest.mark.parametrize("include_external_packages", (True, False)) @@ -131,3 +136,75 @@ def write( if supplied_cache_dir is not sentinel.not_supplied: kwargs["cache_dir"] = supplied_cache_dir usecases.build_graph("mypackage", **kwargs) + + @patch.object(usecases, "_scan_chunks", return_value={}) + @patch.object(joblib, "cpu_count", return_value=SOME_CPU_COUNT) + @pytest.mark.parametrize( + "number_of_modules, fake_environ, expected_number_of_chunks", + [ + ( + usecases.DEFAULT_MIN_NUMBER_OF_MODULES_TO_SCAN_USING_MULTIPROCESSING - 1, + {}, + 1, + ), + ( + usecases.DEFAULT_MIN_NUMBER_OF_MODULES_TO_SCAN_USING_MULTIPROCESSING, + {}, + SOME_CPU_COUNT, + ), + ( + usecases.DEFAULT_MIN_NUMBER_OF_MODULES_TO_SCAN_USING_MULTIPROCESSING + 1, + {}, + SOME_CPU_COUNT, + ), + ( + 149, + {usecases.MIN_NUMBER_OF_MODULES_TO_SCAN_USING_MULTIPROCESSING_ENV_NAME: 150}, + 1, + ), + ( + 150, + {usecases.MIN_NUMBER_OF_MODULES_TO_SCAN_USING_MULTIPROCESSING_ENV_NAME: 150}, + SOME_CPU_COUNT, + ), + ( + 151, + {usecases.MIN_NUMBER_OF_MODULES_TO_SCAN_USING_MULTIPROCESSING_ENV_NAME: 150}, + SOME_CPU_COUNT, + ), + ], + ) + def test_scanning_multiprocessing_respects_min_number_of_modules( + self, + mock_cpu_count, + mock_scan_chunks, + number_of_modules, + fake_environ, + expected_number_of_chunks, + ): + class FakePackageFinder(BaseFakePackageFinder): + directory_map = {"mypackage": "/path/to/mypackage"} + + class FakeModuleFinder(BaseFakeModuleFinder): + module_files_by_package_name = { + "mypackage": frozenset( + { + ModuleFile( + module=Module(f"mypackage.mod_{i}"), + mtime=999, + ) + for i in range(number_of_modules) + } + ) + } + + with override_settings( + FILE_SYSTEM=FakeFileSystem(), + PACKAGE_FINDER=FakePackageFinder(), + MODULE_FINDER=FakeModuleFinder(), + ), patch.object(os, "environ", fake_environ): + usecases.build_graph("mypackage", cache_dir=None) + + [call] = mock_scan_chunks.call_args_list + chunks = call.args[0] + assert len(chunks) == expected_number_of_chunks