python-grimp · seddonym · Apr 24, 2025 · Apr 24, 2025 · Apr 24, 2025
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -2,6 +2,12 @@
 Changelog
 =========
 
+latest
+------
+
+* Provide more control of multiprocessing via ``GRIMP_MIN_MULTIPROCESSING_MODULES``
+  environment variable.
+
 3.8.1 (2025-04-23)
 ------------------
 

diff --git a/docs/usage.rst b/docs/usage.rst
@@ -82,7 +82,12 @@ Building the graph
     :param str, optional cache_dir: The directory to use for caching the graph. Defaults to ``.grimp_cache``. To disable caching,
         pass ``None``. See :doc:`caching`.
     :return: An import graph that you can use to analyse the package.
-    :rtype: ImportGraph
+    :rtype: ``ImportGraph``
+
+    This method uses multiple operating system processes to build the graph, if the number of modules to scan (not
+    including modules in the cache) is 50 or more. This threshold can be adjusted by setting the ``GRIMP_MIN_MULTIPROCESSING_MODULES``
+    environment variable to a different number. To disable multiprocessing altogether, set it to a large number (more than
+    the number of modules in the codebase being analyzed).
 
 .. _typing module documentation: https://docs.python.org/3/library/typing.html#typing.TYPE_CHECKING
 

diff --git a/src/grimp/application/usecases.py b/src/grimp/application/usecases.py
@@ -15,14 +15,19 @@
 from ..application.ports.packagefinder import AbstractPackageFinder
 from ..domain.valueobjects import DirectImport, Module
 from .config import settings
+import os
 
 
 class NotSupplied:
     pass
 
 
+# Calling code can set this environment variable if it wants to tune when to switch to
+# multiprocessing, or set it to a large number to disable it altogether.
+MIN_NUMBER_OF_MODULES_TO_SCAN_USING_MULTIPROCESSING_ENV_NAME = "GRIMP_MIN_MULTIPROCESSING_MODULES"
 # This is an arbitrary number, but setting it too low slows down our functional tests considerably.
-MIN_NUMBER_OF_MODULES_TO_SCAN_USING_MULTIPROCESSING = 50
+# If you change this, update docs/usage.rst too!
+DEFAULT_MIN_NUMBER_OF_MODULES_TO_SCAN_USING_MULTIPROCESSING = 50
 
 
 def build_graph(
@@ -238,7 +243,13 @@ def _create_chunks(module_files: Collection[ModuleFile]) -> tuple[tuple[ModuleFi
 
 
 def _decide_number_of_processes(number_of_module_files: int) -> int:
-    if number_of_module_files < MIN_NUMBER_OF_MODULES_TO_SCAN_USING_MULTIPROCESSING:
+    min_number_of_modules = int(
+        os.environ.get(
+            MIN_NUMBER_OF_MODULES_TO_SCAN_USING_MULTIPROCESSING_ENV_NAME,
+            DEFAULT_MIN_NUMBER_OF_MODULES_TO_SCAN_USING_MULTIPROCESSING,
+        )
+    )
+    if number_of_module_files < min_number_of_modules:
         # Don't incur the overhead of multiple processes.
         return 1
     return min(joblib.cpu_count(), number_of_module_files)

diff --git a/tests/adaptors/modulefinder.py b/tests/adaptors/modulefinder.py
@@ -0,0 +1,16 @@
+from grimp.application.ports.modulefinder import AbstractModuleFinder, FoundPackage, ModuleFile
+from grimp.application.ports.filesystem import AbstractFileSystem
+from typing import FrozenSet, Dict
+
+
+class BaseFakeModuleFinder(AbstractModuleFinder):
+    module_files_by_package_name: Dict[str, FrozenSet[ModuleFile]] = {}
+
+    def find_package(
+        self, package_name: str, package_directory: str, file_system: AbstractFileSystem
+    ) -> FoundPackage:
+        return FoundPackage(
+            name=package_name,
+            directory=package_directory,
+            module_files=self.module_files_by_package_name[package_name],
+        )
diff --git a/tests/functional/test_build_and_use_graph.py b/tests/functional/test_build_and_use_graph.py
@@ -4,6 +4,7 @@
 from unittest.mock import patch
 from grimp.application import usecases
 
+
 """
 For ease of reference, these are the imports of all the files:
 
@@ -55,7 +56,7 @@ def test_modules():
     }
 
 
-@patch.object(usecases, "MIN_NUMBER_OF_MODULES_TO_SCAN_USING_MULTIPROCESSING", 0)
+@patch.object(usecases, "DEFAULT_MIN_NUMBER_OF_MODULES_TO_SCAN_USING_MULTIPROCESSING", 0)
 def test_modules_multiprocessing():
     """
     This test runs relatively slowly, but it's important we cover the multiprocessing code.

diff --git a/tests/unit/application/test_usecases.py b/tests/unit/application/test_usecases.py
@@ -1,6 +1,8 @@
+import os
 from typing import Dict, Optional, Set
-from unittest.mock import sentinel
+from unittest.mock import sentinel, patch
 
+import joblib  # type: ignore
 import pytest  # type: ignore
 
 from grimp.application import usecases
@@ -9,8 +11,11 @@
 from grimp.domain.valueobjects import DirectImport, Module
 from tests.adaptors.filesystem import FakeFileSystem
 from tests.adaptors.packagefinder import BaseFakePackageFinder
+from tests.adaptors.modulefinder import BaseFakeModuleFinder
 from tests.config import override_settings
 
+SOME_CPU_COUNT = 8
+
 
 class TestBuildGraph:
     @pytest.mark.parametrize("include_external_packages", (True, False))
@@ -131,3 +136,75 @@ def write(
             if supplied_cache_dir is not sentinel.not_supplied:
                 kwargs["cache_dir"] = supplied_cache_dir
             usecases.build_graph("mypackage", **kwargs)
+
+    @patch.object(usecases, "_scan_chunks", return_value={})
+    @patch.object(joblib, "cpu_count", return_value=SOME_CPU_COUNT)
+    @pytest.mark.parametrize(
+        "number_of_modules, fake_environ, expected_number_of_chunks",
+        [
+            (
+                usecases.DEFAULT_MIN_NUMBER_OF_MODULES_TO_SCAN_USING_MULTIPROCESSING - 1,
+                {},
+                1,
+            ),
+            (
+                usecases.DEFAULT_MIN_NUMBER_OF_MODULES_TO_SCAN_USING_MULTIPROCESSING,
+                {},
+                SOME_CPU_COUNT,
+            ),
+            (
+                usecases.DEFAULT_MIN_NUMBER_OF_MODULES_TO_SCAN_USING_MULTIPROCESSING + 1,
+                {},
+                SOME_CPU_COUNT,
+            ),
+            (
+                149,
+                {usecases.MIN_NUMBER_OF_MODULES_TO_SCAN_USING_MULTIPROCESSING_ENV_NAME: 150},
+                1,
+            ),
+            (
+                150,
+                {usecases.MIN_NUMBER_OF_MODULES_TO_SCAN_USING_MULTIPROCESSING_ENV_NAME: 150},
+                SOME_CPU_COUNT,
+            ),
+            (
+                151,
+                {usecases.MIN_NUMBER_OF_MODULES_TO_SCAN_USING_MULTIPROCESSING_ENV_NAME: 150},
+                SOME_CPU_COUNT,
+            ),
+        ],
+    )
+    def test_scanning_multiprocessing_respects_min_number_of_modules(
+        self,
+        mock_cpu_count,
+        mock_scan_chunks,
+        number_of_modules,
+        fake_environ,
+        expected_number_of_chunks,
+    ):
+        class FakePackageFinder(BaseFakePackageFinder):
+            directory_map = {"mypackage": "/path/to/mypackage"}
+
+        class FakeModuleFinder(BaseFakeModuleFinder):
+            module_files_by_package_name = {
+                "mypackage": frozenset(
+                    {
+                        ModuleFile(
+                            module=Module(f"mypackage.mod_{i}"),
+                            mtime=999,
+                        )
+                        for i in range(number_of_modules)
+                    }
+                )
+            }
+
+        with override_settings(
+            FILE_SYSTEM=FakeFileSystem(),
+            PACKAGE_FINDER=FakePackageFinder(),
+            MODULE_FINDER=FakeModuleFinder(),
+        ), patch.object(os, "environ", fake_environ):
+            usecases.build_graph("mypackage", cache_dir=None)
+
+        [call] = mock_scan_chunks.call_args_list
+        chunks = call.args[0]
+        assert len(chunks) == expected_number_of_chunks