2 changes: 1 addition & 1 deletion .flake8
@@ -1,3 +1,3 @@
[flake8]
max-line-length = 100
ignore = F401,S101
ignore = F401,S101,W503
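For context: W503 is pycodestyle's "line break before binary operator" warning. The new multi-line boolean in `is_valid_key` (in the hash_store.py diff below) starts its continuation lines with `and`, which W503 would flag, hence the new ignore. A minimal illustration of the style in question:

```python
# The continuation line starts with the binary operator; W503 flags this,
# even though current PEP 8 recommends exactly this style.
def both_positive(a: int, b: int) -> bool:
    return (a > 0
            and b > 0)
```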
9 changes: 9 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,14 @@
# Changelog

## [0.11.11] - 2025-11-16

### Fixed
- Fixed issue #19: Hash cache corruption when files are replaced at the same path. The hash
database now uses composite keys (path, modification time, file size) to detect when a file
has been replaced or modified, forcing recalculation of its hash. This prevents dangerous
false positives in duplicate detection that, combined with delete actions, could cause
accidental data loss. Old hash databases are automatically migrated to the new format.

## [0.11.10] - 2025-11-04

### Added
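To make the changelog entry concrete, here is a minimal sketch of why a path-only cache returns stale hashes after a file is replaced, and how the composite key avoids it. This is not the project's actual code; `compute_hash` is a hypothetical stand-in for the imagehash call:

```python
import os
from pathlib import Path
from typing import Callable, Dict, Tuple

CacheKey = Tuple[Path, float, int]  # (path, mtime, size), as in the diff below


def cached_hash(
    cache: Dict[CacheKey, str], path: Path, compute_hash: Callable[[Path], str]
) -> str:
    stat = os.stat(path)
    key = (path.resolve(), stat.st_mtime, stat.st_size)
    if key not in cache:
        # Replacing the file changes its mtime (and usually its size), so the
        # old cache entry no longer matches and the hash is recomputed.
        # A plain Dict[Path, str] cache would keep returning the stale hash.
        cache[key] = compute_hash(path)
    return cache[key]
```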
3 changes: 2 additions & 1 deletion duplicate_images/function_types.py
@@ -19,7 +19,8 @@
ResultsGenerator = Generator[List[Path], None, None]
ResultsGrouper = Callable[[ResultsGenerator], Results]
CacheEntry = Tuple[Path, Optional[Hash]]
Cache = Dict[Path, Hash]
CacheKey = Tuple[Path, float, int] # (path, mtime, size)
Cache = Dict[CacheKey, Hash]


def is_hash(x: Any) -> bool:
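The `CacheKey` fields mirror `os.stat_result`: `st_mtime` is a float (seconds since the epoch) and `st_size` an int (bytes), hence `Tuple[Path, float, int]`. For example, with a hypothetical path:

```python
from pathlib import Path

path = Path('some_image.jpg')  # hypothetical; any existing file works
stat = path.stat()
key = (path.resolve(), stat.st_mtime, stat.st_size)  # a valid CacheKey
```

Note that mtime resolution is filesystem-dependent and can be as coarse as a second, which is presumably why the regression test at the end of this diff also changes the file's size rather than relying on mtime alone.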
87 changes: 67 additions & 20 deletions duplicate_images/hash_store.py
Expand Up @@ -12,7 +12,7 @@
from imagehash import hex_to_hash

from duplicate_images.common import log_execution_time
from duplicate_images.function_types import Cache, Hash, is_hash
from duplicate_images.function_types import Cache, CacheKey, Hash, is_hash
from typing import Union  # needed by the new Dict[Union[Path, CacheKey], Hash] annotations below


class NullHashStore:
@@ -89,19 +89,32 @@ def __exit__(self, exc_type: Any, _: Any, __: Any) -> None:
self.dump()

def add(self, file: Path, image_hash: Hash) -> None:
self.values[file] = image_hash
stat = file.stat()
resolved_path = file.resolve()

# Remove any existing entries for this path to prevent cache bloat
self.values = {k: v for k, v in self.values.items() if k[0] != resolved_path}

# Add new entry with composite key (path, mtime, size)
new_key = (resolved_path, stat.st_mtime, stat.st_size)
self.values[new_key] = image_hash
self.dirty = True

def get(self, file: Path) -> Optional[Hash]:
return self.values.get(file)
stat = file.stat()
key = (file.resolve(), stat.st_mtime, stat.st_size)
return self.values.get(key)

def metadata(self) -> Dict:
return {'algorithm': self.algorithm, **self.hash_size_kwargs}

def values_with_metadata(self) -> Tuple[Dict, Dict]:
return self.values, self.metadata()

def checked_load(self, file: IO, load: Callable[[IO], Tuple[Cache, Dict]]) -> None:
def checked_load(
self, file: IO,
load: Callable[[IO], Tuple[Dict[Union[Path, CacheKey], Hash], Dict]]
) -> None:
try:
values, metadata = load(file) # nosec
except IndexError as error:
@@ -112,10 +125,34 @@ def checked_load(self, file: IO, load: Callable[[IO], Tuple[Cache, Dict]]) -> None:
raise ValueError('Metadata empty')
if not isinstance(metadata, dict):
raise ValueError(f'Metadata not a dict: {metadata}')
bad_keys = [key for key in values.keys() if not isinstance(key, Path)]

# Detect and migrate old format: Dict[Path, Hash] -> Dict[(Path, mtime, size), Hash]
migrated_values: Cache = {}
if values and not isinstance(next(iter(values.keys())), tuple):
logging.info('Migrating old hash database format to new format with file metadata')
# Old format: keys are Path objects, convert to composite keys with dummy metadata
# Files with changed mtime/size will be recalculated on first access
for path, hash_val in values.items():
if isinstance(path, Path):
migrated_values[(path, 0.0, 0)] = hash_val
else:
# New format or already migrated - just cast to the right type
for k, v in values.items():
if isinstance(k, tuple):
migrated_values[k] = v

# Validate keys are tuples of (Path, float, int)
def is_valid_key(key: Any) -> bool:
return (isinstance(key, tuple) and len(key) == 3
and isinstance(key[0], Path)
and isinstance(key[1], (int, float))
and isinstance(key[2], int))

bad_keys = [key for key in migrated_values.keys() if not is_valid_key(key)]
if bad_keys:
raise ValueError(f'Not a Path: {bad_keys}')
bad_values = [value for value in values.values() if not is_hash(value)]
raise ValueError(f'Invalid cache key format: {bad_keys}')

bad_values = [value for value in migrated_values.values() if not is_hash(value)]
if bad_values:
raise ValueError(f'Not an image hash: {bad_values}')
if metadata['algorithm'] != self.algorithm:
@@ -124,7 +161,7 @@ def checked_load(self, file: IO, load: Callable[[IO], Tuple[Cache, Dict]]) -> None:
raise ValueError(f'Metadata mismatch: {metadata} != {self.metadata()}')
if metadata != self.metadata():
raise ValueError(f'Metadata mismatch: {metadata} != {self.metadata()}')
self.values = values
self.values = migrated_values

def load(self) -> None:
raise NotImplementedError()
@@ -150,7 +187,7 @@ def dump(self) -> None:
pickle.dump(self.values_with_metadata(), file) # nosec


def load_values_and_metadata(file: IO) -> Tuple[Cache, Dict]:
def load_values_and_metadata(file: IO) -> Tuple[Dict[Union[Path, CacheKey], Hash], Dict]:
try:
valds = json.load(file)
except json.JSONDecodeError as error:
@@ -161,7 +198,22 @@ def load_values_and_metadata(file: IO) -> Tuple[Cache, Dict]:
raise ValueError(f'Not a dict: {valds[0]}')
if not isinstance(valds[1], dict):
raise ValueError(f'Metadata not a dict: {valds[1]}')
return {Path(k).resolve(): hex_to_hash(str(v)) for k, v in valds[0].items()}, valds[1]

# Parse keys: can be either "path" (old format) or "path|mtime|size" (new format)
parsed_values: Dict[Union[Path, CacheKey], Hash] = {}
for k, v in valds[0].items():
if '|' in k:
# New format: "path|mtime|size"
parts = k.rsplit('|', 2)
path = Path(parts[0]).resolve()
mtime = float(parts[1])
size = int(parts[2])
parsed_values[(path, mtime, size)] = hex_to_hash(str(v))
else:
# Old format: just path (will be migrated in checked_load)
parsed_values[Path(k).resolve()] = hex_to_hash(str(v))

return parsed_values, valds[1]


class JSONHashStore(FileHashStore):
@@ -170,15 +222,6 @@ class JSONHashStore(FileHashStore):
image hashes in JSON format
"""

def add(self, file: Path, image_hash: Hash) -> None:
# Resolve path to ensure consistent key format
self.values[file.resolve()] = image_hash
self.dirty = True

def get(self, file: Path) -> Optional[Hash]:
# Resolve path to match how paths are stored
return self.values.get(file.resolve())

@log_execution_time()
def load(self) -> None:
with self.store_path.open('r') as file:
@@ -187,7 +230,11 @@ def load(self) -> None:
# see https://bugs.python.org/issue18820 for why this pain is necessary (Python does not
# allow automatic conversion of dict keys for JSON export)
def converted_values(self):
return {str(k.resolve()): str(v) for k, v in self.values.items()}
# Serialize composite keys as "path|mtime|size"
return {
f'{k[0].resolve()}|{k[1]}|{k[2]}': str(v)
for k, v in self.values.items()
}

@log_execution_time()
def dump(self) -> None:
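A round-trip sketch of the `"path|mtime|size"` JSON key encoding introduced above. This mirrors, but is not, the module's code; `hex_to_hash` and the hash values are omitted:

```python
from pathlib import Path
from typing import Tuple

CacheKey = Tuple[Path, float, int]


def encode_key(key: CacheKey) -> str:
    path, mtime, size = key
    return f'{path}|{mtime}|{size}'


def decode_key(encoded: str) -> CacheKey:
    # rsplit from the right, so a '|' inside the path itself cannot be
    # mistaken for one of the two trailing field separators
    path_str, mtime_str, size_str = encoded.rsplit('|', 2)
    return Path(path_str), float(mtime_str), int(size_str)


key = (Path('/photos/a.jpg'), 1731750000.5, 2048)
assert decode_key(encode_key(key)) == key
```

Two caveats are visible in the diff itself: an old-format key whose path happens to contain `'|'` would take the new-format branch of `load_values_and_metadata` and fail to parse, and migrated old-format entries get dummy metadata `(path, 0.0, 0)`, so they never match a live file's stat; the first access recalculates the hash, after which `add()` purges the stale entry by path.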
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "duplicate_images"
version = "0.11.10"
version = "0.11.11"
description = "Finds equal or similar images in a directory containing (many) image files"
authors = ["Lene Preuss <lene.preuss@gmail.com>"]
repository = "https://github.com/lene/DuplicateImages.git"
46 changes: 46 additions & 0 deletions tests/unit/test_persistent_storage.py
@@ -199,3 +199,49 @@ def check_correct_results(finder: ImagePairFinder, images: List[Path]) -> None:
for pair in expected_pairs:
assert pair in pairs or (pair[1], pair[0]) in pairs, \
f'{pair[0].name}, {pair[1].name} not in {expected_pairs_string}'


@pytest.mark.parametrize('file_type', ['pickle', 'json'])
def test_file_replacement_recalculates_hash(
top_directory: TemporaryDirectory, hash_store_path: Path,
reset_call_count # pylint: disable=unused-argument
) -> None:
"""Test that replacing a file at the same path forces hash recalculation (issue #19)"""
from time import sleep
from .conftest import create_image, IMAGE_WIDTH

# Create initial file and populate cache
file_path = Path(top_directory.name) / 'test_image.jpg'
create_image(file_path, IMAGE_WIDTH)
initial_mtime = file_path.stat().st_mtime

with FileHashStore.create(hash_store_path, DEFAULT_ALGORITHM, DEFAULT_HASH_SIZE) as hash_store:
finder = ImagePairFinder.create(
[file_path], mock_algorithm, options=PairFinderOptions(slow=True),
hash_store=hash_store
)
finder.get_equal_groups()

initial_call_count = mock_algorithm.call_count
assert initial_call_count > 0, 'Hash should be calculated for new file'

# Sleep to ensure different mtime
sleep(0.01)

# Replace file with different content (different size)
file_path.unlink()
create_image(file_path, IMAGE_WIDTH * 2) # Different size
new_mtime = file_path.stat().st_mtime
assert new_mtime != initial_mtime, 'File replacement should change mtime'

# Scan again with same cache - should recalculate because file changed
mock_algorithm.call_count = 0
with FileHashStore.create(hash_store_path, DEFAULT_ALGORITHM, DEFAULT_HASH_SIZE) as hash_store:
finder = ImagePairFinder.create(
[file_path], mock_algorithm, options=PairFinderOptions(slow=True),
hash_store=hash_store
)
finder.get_equal_groups()

assert mock_algorithm.call_count > 0, \
'Hash should be recalculated when file is replaced (issue #19 fix)'
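To exercise just this regression test, something like `pytest tests/unit/test_persistent_storage.py -k file_replacement` should work, assuming the project's standard pytest setup; the parametrization runs it once each for the pickle and JSON stores.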