2 changes: 1 addition & 1 deletion .flake8
@@ -1,3 +1,3 @@
[flake8]
max-line-length = 100
ignore = F401,S101
ignore = F401,S101,W503
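For context: W503 is pycodestyle's "line break before binary operator" warning. The new multi-line boolean in `is_valid_key` (in the hash_store.py diff below) starts its continuation lines with `and`, which W503 would flag, hence the new ignore. A minimal illustration of the style in question:

```python
# The continuation line starts with the binary operator; W503 flags this,
# even though current PEP 8 recommends exactly this style.
def both_positive(a: int, b: int) -> bool:
    return (a > 0
            and b > 0)
```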
9 changes: 9 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,14 @@
# Changelog

## [0.11.11] - 2025-11-16

### Fixed
- Fixed issue #19: Hash cache corruption when files are replaced at the same path. The hash
database now uses composite keys (path, modification time, file size) to detect when a file
has been replaced or modified, forcing recalculation of its hash. This prevents dangerous
false positives in duplicate detection that, combined with delete actions, could cause
accidental data loss. Old hash databases are automatically migrated to the new format.

## [0.11.10] - 2025-11-04

### Added
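To make the changelog entry concrete, here is a minimal sketch of why a path-only cache returns stale hashes after a file is replaced, and how the composite key avoids it. This is not the project's actual code; `compute_hash` is a hypothetical stand-in for the imagehash call:

```python
import os
from pathlib import Path
from typing import Callable, Dict, Tuple

CacheKey = Tuple[Path, float, int]  # (path, mtime, size), as in the diff below


def cached_hash(
    cache: Dict[CacheKey, str], path: Path, compute_hash: Callable[[Path], str]
) -> str:
    stat = os.stat(path)
    key = (path.resolve(), stat.st_mtime, stat.st_size)
    if key not in cache:
        # Replacing the file changes its mtime (and usually its size), so the
        # old cache entry no longer matches and the hash is recomputed.
        # A plain Dict[Path, str] cache would keep returning the stale hash.
        cache[key] = compute_hash(path)
    return cache[key]
```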
3 changes: 2 additions & 1 deletion duplicate_images/function_types.py
@@ -19,7 +19,8 @@
ResultsGenerator = Generator[List[Path], None, None]
ResultsGrouper = Callable[[ResultsGenerator], Results]
CacheEntry = Tuple[Path, Optional[Hash]]
Cache = Dict[Path, Hash]
CacheKey = Tuple[Path, float, int] # (path, mtime, size)
Cache = Dict[CacheKey, Hash]


def is_hash(x: Any) -> bool:
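The `CacheKey` fields mirror `os.stat_result`: `st_mtime` is a float (seconds since the epoch) and `st_size` an int (bytes), hence `Tuple[Path, float, int]`. For example, with a hypothetical path:

```python
from pathlib import Path

path = Path('some_image.jpg')  # hypothetical; any existing file works
stat = path.stat()
key = (path.resolve(), stat.st_mtime, stat.st_size)  # a valid CacheKey
```

Note that mtime resolution is filesystem-dependent and can be as coarse as a second, which is presumably why the regression test at the end of this diff also changes the file's size rather than relying on mtime alone.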
87 changes: 67 additions & 20 deletions duplicate_images/hash_store.py
Expand Up @@ -12,7 +12,7 @@
from imagehash import hex_to_hash

from duplicate_images.common import log_execution_time
from duplicate_images.function_types import Cache, Hash, is_hash
from duplicate_images.function_types import Cache, CacheKey, Hash, is_hash
from typing import Union  # needed by the new Dict[Union[Path, CacheKey], Hash] annotations below


class NullHashStore:
@@ -89,19 +89,32 @@ def __exit__(self, exc_type: Any, _: Any, __: Any) -> None:
self.dump()

def add(self, file: Path, image_hash: Hash) -> None:
self.values[file] = image_hash
stat = file.stat()
resolved_path = file.resolve()

# Remove any existing entries for this path to prevent cache bloat
self.values = {k: v for k, v in self.values.items() if k[0] != resolved_path}

# Add new entry with composite key (path, mtime, size)
new_key = (resolved_path, stat.st_mtime, stat.st_size)
self.values[new_key] = image_hash
self.dirty = True

def get(self, file: Path) -> Optional[Hash]:
return self.values.get(file)
stat = file.stat()
key = (file.resolve(), stat.st_mtime, stat.st_size)
return self.values.get(key)

def metadata(self) -> Dict:
return {'algorithm': self.algorithm, **self.hash_size_kwargs}

def values_with_metadata(self) -> Tuple[Dict, Dict]:
return self.values, self.metadata()

def checked_load(self, file: IO, load: Callable[[IO], Tuple[Cache, Dict]]) -> None:
def checked_load(
self, file: IO,
load: Callable[[IO], Tuple[Dict[Union[Path, CacheKey], Hash], Dict]]
) -> None:
try:
values, metadata = load(file) # nosec
except IndexError as error:
@@ -112,10 +125,34 @@ def checked_load(self, file: IO, load: Callable[[IO], Tuple[Cache, Dict]]) -> None:
raise ValueError('Metadata empty')
if not isinstance(metadata, dict):
raise ValueError(f'Metadata not a dict: {metadata}')
bad_keys = [key for key in values.keys() if not isinstance(key, Path)]

# Detect and migrate old format: Dict[Path, Hash] -> Dict[(Path, mtime, size), Hash]
migrated_values: Cache = {}
if values and not isinstance(next(iter(values.keys())), tuple):
logging.info('Migrating old hash database format to new format with file metadata')
# Old format: keys are Path objects, convert to composite keys with dummy metadata
# Files with changed mtime/size will be recalculated on first access
for path, hash_val in values.items():
if isinstance(path, Path):
migrated_values[(path, 0.0, 0)] = hash_val
else:
# New format or already migrated - just cast to the right type
for k, v in values.items():
if isinstance(k, tuple):
migrated_values[k] = v

# Validate keys are tuples of (Path, float, int)
def is_valid_key(key: Any) -> bool:
return (isinstance(key, tuple) and len(key) == 3
and isinstance(key[0], Path)
and isinstance(key[1], (int, float))
and isinstance(key[2], int))

bad_keys = [key for key in migrated_values.keys() if not is_valid_key(key)]
if bad_keys:
raise ValueError(f'Not a Path: {bad_keys}')
bad_values = [value for value in values.values() if not is_hash(value)]
raise ValueError(f'Invalid cache key format: {bad_keys}')

bad_values = [value for value in migrated_values.values() if not is_hash(value)]
if bad_values:
raise ValueError(f'Not an image hash: {bad_values}')
if metadata['algorithm'] != self.algorithm:
@@ -124,7 +161,7 @@ def checked_load(self, file: IO, load: Callable[[IO], Tuple[Cache, Dict]]) -> None:
raise ValueError(f'Metadata mismatch: {metadata} != {self.metadata()}')
if metadata != self.metadata():
raise ValueError(f'Metadata mismatch: {metadata} != {self.metadata()}')
self.values = values
self.values = migrated_values

def load(self) -> None:
raise NotImplementedError()
@@ -150,7 +187,7 @@ def dump(self) -> None:
pickle.dump(self.values_with_metadata(), file) # nosec


def load_values_and_metadata(file: IO) -> Tuple[Cache, Dict]:
def load_values_and_metadata(file: IO) -> Tuple[Dict[Union[Path, CacheKey], Hash], Dict]:
try:
valds = json.load(file)
except json.JSONDecodeError as error:
@@ -161,7 +198,22 @@ def load_values_and_metadata(file: IO) -> Tuple[Cache, Dict]:
raise ValueError(f'Not a dict: {valds[0]}')
if not isinstance(valds[1], dict):
raise ValueError(f'Metadata not a dict: {valds[1]}')
return {Path(k).resolve(): hex_to_hash(str(v)) for k, v in valds[0].items()}, valds[1]

# Parse keys: can be either "path" (old format) or "path|mtime|size" (new format)
parsed_values: Dict[Union[Path, CacheKey], Hash] = {}
for k, v in valds[0].items():
if '|' in k:
# New format: "path|mtime|size"
parts = k.rsplit('|', 2)
path = Path(parts[0]).resolve()
mtime = float(parts[1])
size = int(parts[2])
parsed_values[(path, mtime, size)] = hex_to_hash(str(v))
else:
# Old format: just path (will be migrated in checked_load)
parsed_values[Path(k).resolve()] = hex_to_hash(str(v))

return parsed_values, valds[1]


class JSONHashStore(FileHashStore):
@@ -170,15 +222,6 @@ class JSONHashStore(FileHashStore):
image hashes in JSON format
"""

def add(self, file: Path, image_hash: Hash) -> None:
# Resolve path to ensure consistent key format
self.values[file.resolve()] = image_hash
self.dirty = True

def get(self, file: Path) -> Optional[Hash]:
# Resolve path to match how paths are stored
return self.values.get(file.resolve())

@log_execution_time()
def load(self) -> None:
with self.store_path.open('r') as file:
@@ -187,7 +230,11 @@ def load(self) -> None:
# see https://bugs.python.org/issue18820 for why this pain is necessary (Python does not
# allow automatic conversion of dict keys for JSON export)
def converted_values(self):
return {str(k.resolve()): str(v) for k, v in self.values.items()}
# Serialize composite keys as "path|mtime|size"
return {
f'{k[0].resolve()}|{k[1]}|{k[2]}': str(v)
for k, v in self.values.items()
}

@log_execution_time()
def dump(self) -> None:
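A round-trip sketch of the `"path|mtime|size"` JSON key encoding introduced above. This mirrors, but is not, the module's code; `hex_to_hash` and the hash values are omitted:

```python
from pathlib import Path
from typing import Tuple

CacheKey = Tuple[Path, float, int]


def encode_key(key: CacheKey) -> str:
    path, mtime, size = key
    return f'{path}|{mtime}|{size}'


def decode_key(encoded: str) -> CacheKey:
    # rsplit from the right, so a '|' inside the path itself cannot be
    # mistaken for one of the two trailing field separators
    path_str, mtime_str, size_str = encoded.rsplit('|', 2)
    return Path(path_str), float(mtime_str), int(size_str)


key = (Path('/photos/a.jpg'), 1731750000.5, 2048)
assert decode_key(encode_key(key)) == key
```

Two caveats are visible in the diff itself: an old-format key whose path happens to contain `'|'` would take the new-format branch of `load_values_and_metadata` and fail to parse, and migrated old-format entries get dummy metadata `(path, 0.0, 0)`, so they never match a live file's stat; the first access recalculates the hash, after which `add()` purges the stale entry by path.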
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "duplicate_images"
version = "0.11.10"
version = "0.11.11"
description = "Finds equal or similar images in a directory containing (many) image files"
authors = ["Lene Preuss <lene.preuss@gmail.com>"]
repository = "https://github.com/lene/DuplicateImages.git"
46 changes: 46 additions & 0 deletions tests/unit/test_persistent_storage.py
@@ -199,3 +199,49 @@ def check_correct_results(finder: ImagePairFinder, images: List[Path]) -> None:
for pair in expected_pairs:
assert pair in pairs or (pair[1], pair[0]) in pairs, \
f'{pair[0].name}, {pair[1].name} not in {expected_pairs_string}'


@pytest.mark.parametrize('file_type', ['pickle', 'json'])
def test_file_replacement_recalculates_hash(
top_directory: TemporaryDirectory, hash_store_path: Path,
reset_call_count # pylint: disable=unused-argument
) -> None:
"""Test that replacing a file at the same path forces hash recalculation (issue #19)"""
from time import sleep
from .conftest import create_image, IMAGE_WIDTH

# Create initial file and populate cache
file_path = Path(top_directory.name) / 'test_image.jpg'
create_image(file_path, IMAGE_WIDTH)
initial_mtime = file_path.stat().st_mtime

with FileHashStore.create(hash_store_path, DEFAULT_ALGORITHM, DEFAULT_HASH_SIZE) as hash_store:
finder = ImagePairFinder.create(
[file_path], mock_algorithm, options=PairFinderOptions(slow=True),
hash_store=hash_store
)
finder.get_equal_groups()

initial_call_count = mock_algorithm.call_count
assert initial_call_count > 0, 'Hash should be calculated for new file'

# Sleep to ensure different mtime
sleep(0.01)

# Replace file with different content (different size)
file_path.unlink()
create_image(file_path, IMAGE_WIDTH * 2) # Different size
new_mtime = file_path.stat().st_mtime
assert new_mtime != initial_mtime, 'File replacement should change mtime'

# Scan again with same cache - should recalculate because file changed
mock_algorithm.call_count = 0
with FileHashStore.create(hash_store_path, DEFAULT_ALGORITHM, DEFAULT_HASH_SIZE) as hash_store:
finder = ImagePairFinder.create(
[file_path], mock_algorithm, options=PairFinderOptions(slow=True),
hash_store=hash_store
)
finder.get_equal_groups()

assert mock_algorithm.call_count > 0, \
'Hash should be recalculated when file is replaced (issue #19 fix)'
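To exercise just this regression test, something like `pytest tests/unit/test_persistent_storage.py -k file_replacement` should work, assuming the project's standard pytest setup; the parametrization runs it once each for the pickle and JSON stores.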