diff --git a/.flake8 b/.flake8
index a73c2fb..6785138 100644
--- a/.flake8
+++ b/.flake8
@@ -1,3 +1,3 @@
 [flake8]
 max-line-length = 100
-ignore = F401,S101
+ignore = F401,S101,W503
diff --git a/CHANGELOG.md b/CHANGELOG.md
index c53d18a..8c5b363 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,14 @@
 # Changelog
 
+## [0.11.11] - 2025-11-16
+
+### Fixed
+- Fixed issue #19: Hash cache corruption when files are replaced at the same path. The hash
+  database now uses composite keys (path, modification time, file size) to detect when a file
+  has been replaced or modified, forcing recalculation of the hash. This prevents dangerous
+  false positives that could lead to incorrect duplicate detection and accidental data loss
+  with delete actions. Old hash databases are automatically migrated to the new format.
+
 ## [0.11.10] - 2025-11-04
 
 ### Added
diff --git a/duplicate_images/function_types.py b/duplicate_images/function_types.py
index 76a428d..56de097 100644
--- a/duplicate_images/function_types.py
+++ b/duplicate_images/function_types.py
@@ -19,7 +19,8 @@
 ResultsGenerator = Generator[List[Path], None, None]
 ResultsGrouper = Callable[[ResultsGenerator], Results]
 CacheEntry = Tuple[Path, Optional[Hash]]
-Cache = Dict[Path, Hash]
+CacheKey = Tuple[Path, float, int]  # (path, mtime, size)
+Cache = Dict[CacheKey, Hash]
 
 
 def is_hash(x: Any) -> bool:
diff --git a/duplicate_images/hash_store.py b/duplicate_images/hash_store.py
index 984026b..4089a39 100644
--- a/duplicate_images/hash_store.py
+++ b/duplicate_images/hash_store.py
@@ -12,7 +12,7 @@
 from imagehash import hex_to_hash
 
 from duplicate_images.common import log_execution_time
-from duplicate_images.function_types import Cache, Hash, is_hash
+from duplicate_images.function_types import Cache, CacheKey, Hash, is_hash
 
 
 class NullHashStore:
@@ -89,11 +89,21 @@ def __exit__(self, exc_type: Any, _: Any, __: Any) -> None:
         self.dump()
 
     def add(self, file: Path, image_hash: Hash) -> None:
-        self.values[file] = image_hash
+        stat = file.stat()
+        resolved_path = file.resolve()
+
+        # Remove any existing entries for this path to prevent cache bloat
+        self.values = {k: v for k, v in self.values.items() if k[0] != resolved_path}
+
+        # Add new entry with composite key (path, mtime, size)
+        new_key = (resolved_path, stat.st_mtime, stat.st_size)
+        self.values[new_key] = image_hash
         self.dirty = True
 
     def get(self, file: Path) -> Optional[Hash]:
-        return self.values.get(file)
+        stat = file.stat()
+        key = (file.resolve(), stat.st_mtime, stat.st_size)
+        return self.values.get(key)
 
     def metadata(self) -> Dict:
         return {'algorithm': self.algorithm, **self.hash_size_kwargs}
@@ -101,7 +111,10 @@ def metadata(self) -> Dict:
     def values_with_metadata(self) -> Tuple[Dict, Dict]:
         return self.values, self.metadata()
 
-    def checked_load(self, file: IO, load: Callable[[IO], Tuple[Cache, Dict]]) -> None:
+    def checked_load(
+            self, file: IO,
+            load: Callable[[IO], Tuple[Dict[Union[Path, CacheKey], Hash], Dict]]
+    ) -> None:
         try:
             values, metadata = load(file)  # nosec
         except IndexError as error:
@@ -112,10 +125,34 @@ def checked_load(self, file: IO, load: Callable[[IO], Tuple[Cache, Dict]]) -> No
             raise ValueError('Metadata empty')
         if not isinstance(metadata, dict):
             raise ValueError(f'Metadata not a dict: {metadata}')
-        bad_keys = [key for key in values.keys() if not isinstance(key, Path)]
+
+        # Detect and migrate old format: Dict[Path, Hash] -> Dict[(Path, mtime, size), Hash]
+        migrated_values: Cache = {}
+        if values and not isinstance(next(iter(values.keys())), tuple):
+            logging.info('Migrating old hash database format to new format with file metadata')
+            # Old format: keys are Path objects, convert to composite keys with dummy metadata
+            # Files with changed mtime/size will be recalculated on first access
+            for path, hash_val in values.items():
+                if isinstance(path, Path):
+                    migrated_values[(path, 0.0, 0)] = hash_val
+        else:
+            # New format or already migrated - just cast to the right type
+            for k, v in values.items():
+                if isinstance(k, tuple):
+                    migrated_values[k] = v
+
+        # Validate keys are tuples of (Path, float, int)
+        def is_valid_key(key):
+            return (isinstance(key, tuple) and len(key) == 3
+                    and isinstance(key[0], Path)
+                    and isinstance(key[1], (int, float))
+                    and isinstance(key[2], int))
+
+        bad_keys = [key for key in migrated_values.keys() if not is_valid_key(key)]
         if bad_keys:
-            raise ValueError(f'Not a Path: {bad_keys}')
-        bad_values = [value for value in values.values() if not is_hash(value)]
+            raise ValueError(f'Invalid cache key format: {bad_keys}')
+
+        bad_values = [value for value in migrated_values.values() if not is_hash(value)]
         if bad_values:
             raise ValueError(f'Not an image hash: {bad_values}')
         if metadata['algorithm'] != self.algorithm:
@@ -124,7 +161,7 @@ def checked_load(self, file: IO, load: Callable[[IO], Tuple[Cache, Dict]]) -> No
             raise ValueError(f'Metadata mismatch: {metadata} != {self.metadata()}')
         if metadata != self.metadata():
             raise ValueError(f'Metadata mismatch: {metadata} != {self.metadata()}')
-        self.values = values
+        self.values = migrated_values
 
     def load(self) -> None:
         raise NotImplementedError()
@@ -150,7 +187,7 @@ def dump(self) -> None:
             pickle.dump(self.values_with_metadata(), file)  # nosec
 
 
-def load_values_and_metadata(file: IO) -> Tuple[Cache, Dict]:
+def load_values_and_metadata(file: IO) -> Tuple[Dict[Union[Path, CacheKey], Hash], Dict]:
     try:
         valds = json.load(file)
     except json.JSONDecodeError as error:
@@ -161,7 +198,22 @@ def load_values_and_metadata(file: IO) -> Tuple[Cache, Dict]:
         raise ValueError(f'Not a dict: {valds[0]}')
     if not isinstance(valds[1], dict):
         raise ValueError(f'Metadata not a dict: {valds[1]}')
-    return {Path(k).resolve(): hex_to_hash(str(v)) for k, v in valds[0].items()}, valds[1]
+
+    # Parse keys: can be either "path" (old format) or "path|mtime|size" (new format)
+    parsed_values: Dict[Union[Path, CacheKey], Hash] = {}
+    for k, v in valds[0].items():
+        if '|' in k:
+            # New format: "path|mtime|size"
+            parts = k.rsplit('|', 2)
+            path = Path(parts[0]).resolve()
+            mtime = float(parts[1])
+            size = int(parts[2])
+            parsed_values[(path, mtime, size)] = hex_to_hash(str(v))
+        else:
+            # Old format: just path (will be migrated in checked_load)
+            parsed_values[Path(k).resolve()] = hex_to_hash(str(v))
+
+    return parsed_values, valds[1]
 
 
 class JSONHashStore(FileHashStore):
@@ -170,15 +222,6 @@ class JSONHashStore(FileHashStore):
     image hashes in JSON format
     """
 
-    def add(self, file: Path, image_hash: Hash) -> None:
-        # Resolve path to ensure consistent key format
-        self.values[file.resolve()] = image_hash
-        self.dirty = True
-
-    def get(self, file: Path) -> Optional[Hash]:
-        # Resolve path to match how paths are stored
-        return self.values.get(file.resolve())
-
     @log_execution_time()
     def load(self) -> None:
         with self.store_path.open('r') as file:
@@ -187,7 +230,11 @@ def load(self) -> None:
     # see https://bugs.python.org/issue18820 for why this pain is necessary (Python does not allow
     # to automatically convert dict keys for JSON export
     def converted_values(self):
-        return {str(k.resolve()): str(v) for k, v in self.values.items()}
+        # Serialize composite keys as "path|mtime|size"
+        return {
+            f'{k[0].resolve()}|{k[1]}|{k[2]}': str(v)
+            for k, v in self.values.items()
+        }
 
     @log_execution_time()
     def dump(self) -> None:
diff --git a/pyproject.toml b/pyproject.toml
index b0df44c..f6bc82e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "duplicate_images"
-version = "0.11.10"
+version = "0.11.11"
 description = "Finds equal or similar images in a directory containing (many) image files"
 authors = ["Lene Preuss "]
 repository = "https://github.com/lene/DuplicateImages.git"
diff --git a/tests/unit/test_persistent_storage.py b/tests/unit/test_persistent_storage.py
index 58fe975..8bb989c 100644
--- a/tests/unit/test_persistent_storage.py
+++ b/tests/unit/test_persistent_storage.py
@@ -199,3 +199,49 @@ def check_correct_results(finder: ImagePairFinder, images: List[Path]) -> None:
     for pair in expected_pairs:
         assert pair in pairs or (pair[1], pair[0]) in pairs, \
             f'{pair[0].name}, {pair[1].name} not in {expected_pairs_string}'
+
+
+@pytest.mark.parametrize('file_type', ['pickle', 'json'])
+def test_file_replacement_recalculates_hash(
+        top_directory: TemporaryDirectory, hash_store_path: Path,
+        reset_call_count  # pylint: disable=unused-argument
+) -> None:
+    """Test that replacing a file at the same path forces hash recalculation (issue #19)"""
+    from time import sleep
+    from .conftest import create_image, IMAGE_WIDTH
+
+    # Create initial file and populate cache
+    file_path = Path(top_directory.name) / 'test_image.jpg'
+    create_image(file_path, IMAGE_WIDTH)
+    initial_mtime = file_path.stat().st_mtime
+
+    with FileHashStore.create(hash_store_path, DEFAULT_ALGORITHM, DEFAULT_HASH_SIZE) as hash_store:
+        finder = ImagePairFinder.create(
+            [file_path], mock_algorithm, options=PairFinderOptions(slow=True),
+            hash_store=hash_store
+        )
+        finder.get_equal_groups()
+
+    initial_call_count = mock_algorithm.call_count
+    assert initial_call_count > 0, 'Hash should be calculated for new file'
+
+    # Sleep to ensure different mtime
+    sleep(0.01)
+
+    # Replace file with different content (different size)
+    file_path.unlink()
+    create_image(file_path, IMAGE_WIDTH * 2)  # Different size
+    new_mtime = file_path.stat().st_mtime
+    assert new_mtime != initial_mtime, 'File replacement should change mtime'
+
+    # Scan again with same cache - should recalculate because file changed
+    mock_algorithm.call_count = 0
+    with FileHashStore.create(hash_store_path, DEFAULT_ALGORITHM, DEFAULT_HASH_SIZE) as hash_store:
+        finder = ImagePairFinder.create(
+            [file_path], mock_algorithm, options=PairFinderOptions(slow=True),
+            hash_store=hash_store
+        )
+        finder.get_equal_groups()
+
+    assert mock_algorithm.call_count > 0, \
+        'Hash should be recalculated when file is replaced (issue #19 fix)'
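
The core idea of the patch, illustrated outside the project's classes: keying the hash cache on `(resolved path, mtime, size)` instead of the path alone means a file that is replaced in place no longer matches its cached entry. Below is a minimal standalone sketch of that behaviour; `TinyHashCache` and the string "hashes" are invented for illustration and are not part of `duplicate_images`, only the keying logic mirrors the `FileHashStore` changes above.

```python
# Minimal sketch of the composite-key scheme from hash_store.py; names are
# illustrative only -- the real store keeps imagehash values, not strings.
import tempfile
import time
from pathlib import Path
from typing import Dict, Optional, Tuple

CacheKey = Tuple[Path, float, int]  # (path, mtime, size), as in function_types.py


class TinyHashCache:
    def __init__(self) -> None:
        self.values: Dict[CacheKey, str] = {}

    def _key(self, file: Path) -> CacheKey:
        stat = file.stat()
        return file.resolve(), stat.st_mtime, stat.st_size

    def add(self, file: Path, image_hash: str) -> None:
        resolved = file.resolve()
        # Drop any older entry for the same path so the cache does not bloat.
        self.values = {k: v for k, v in self.values.items() if k[0] != resolved}
        self.values[self._key(file)] = image_hash

    def get(self, file: Path) -> Optional[str]:
        # A replaced file has a different mtime and/or size, so the lookup misses.
        return self.values.get(self._key(file))


with tempfile.TemporaryDirectory() as tmp:
    path = Path(tmp) / 'image.jpg'
    path.write_bytes(b'original image bytes')

    cache = TinyHashCache()
    cache.add(path, 'hash-of-original')
    assert cache.get(path) == 'hash-of-original'       # unchanged file: cache hit

    time.sleep(0.01)                                   # let the mtime change
    path.write_bytes(b'different image, same path!!')  # replace the file in place
    assert cache.get(path) is None                     # miss -> hash is recalculated
```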
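When an existing database is loaded, `checked_load` first has to tell the two key formats apart before validating anything. The sketch below reproduces just that detection step, with plain strings standing in for image hashes and made-up sample paths; note that migrated entries carry the dummy metadata `(0.0, 0)`, which no real file reports, so their hashes are effectively refreshed the next time each file is hashed.

```python
# Sketch of the old-format detection in FileHashStore.checked_load(); the string
# hash values and the sample paths are made up for illustration.
from pathlib import Path
from typing import Dict, Tuple, Union

CacheKey = Tuple[Path, float, int]


def migrate(values: Dict[Union[Path, CacheKey], str]) -> Dict[CacheKey, str]:
    migrated: Dict[CacheKey, str] = {}
    if values and not isinstance(next(iter(values.keys())), tuple):
        # Old format: bare Path keys get dummy (mtime, size) metadata. The dummy
        # values never match a live file, so these hashes are recalculated once.
        for path, hash_val in values.items():
            if isinstance(path, Path):
                migrated[(path, 0.0, 0)] = hash_val
    else:
        # New format: keep only well-formed composite keys.
        migrated = {k: v for k, v in values.items() if isinstance(k, tuple)}
    return migrated


old_db = {Path('/photos/a.jpg'): 'ffe081c3c3810f1e', Path('/photos/b.jpg'): '00ff7e1c1c7eff00'}
assert all(key[1:] == (0.0, 0) for key in migrate(old_db))
```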
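On disk, the JSON store flattens each composite key into a single `"path|mtime|size"` string, and the loader accepts both that form and bare old-format paths. A small round-trip sketch follows; `serialize_key` and `parse_key` are helpers invented here, not part of the library. Splitting with `rsplit('|', 2)` keeps the scheme unambiguous even if the path itself contains `|`.

```python
# Round trip of the key encoding used by JSONHashStore.converted_values() and
# load_values_and_metadata(); the helper names are illustrative only.
from pathlib import Path
from typing import Tuple, Union

CacheKey = Tuple[Path, float, int]


def serialize_key(key: CacheKey) -> str:
    path, mtime, size = key
    return f'{path.resolve()}|{mtime}|{size}'


def parse_key(raw: str) -> Union[Path, CacheKey]:
    if '|' not in raw:
        return Path(raw).resolve()          # old format: just a path
    path_str, mtime_str, size_str = raw.rsplit('|', 2)
    return Path(path_str).resolve(), float(mtime_str), int(size_str)


key = (Path('/photos/odd|name/cat.jpg').resolve(), 1731772800.25, 204800)
assert parse_key(serialize_key(key)) == key             # '|' in the path is harmless
assert isinstance(parse_key('/photos/cat.jpg'), Path)   # legacy keys still load
```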