Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 47 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -113,3 +113,50 @@ These visualizations are useful for double-checking that the tiling or sampling
### Process summary

- **`process_list.csv`**: a summary file listing each processed slide, indicating whether processing was successful or failed. If a failure occurred, the traceback is provided to help diagnose the issue.

## Standalone tissue segmentator

For quick mask generation outside the full pipeline, use the standalone script:

```shell
python -m pip install tifffile # extra dependency required by this script

# Single slide
python scripts/generate_tissue_mask.py \
--wsi /path/to/slide.tif \
--output /path/to/tissue-mask-pyramid.tif \
--spacing 4.0 \
--tolerance 0.1

# Multiple slides
python scripts/generate_tissue_mask.py \
--wsi /path/to/slide_dir/*.tif \
--output-dir /path/to/output_dir \
--spacing 4.0 \
--tolerance 0.1
```

This script:
- reads the WSI with `wholeslidedata`
- computes a binary tissue mask using HSV thresholding (`0=background`, `1=tissue`)
- uses a coarse-to-fine ROI shortcut by default to avoid loading the full target-spacing WSI into memory
- writes a pyramidal TIFF mask at the desired `--spacing`, where each level is downsampled from the previous one
- prints a final recap of how many slides succeeded, were skipped, or failed

Useful options:
- `--backend` to switch the wholeslidedata backend (default: `asap`)
- `--output` for single-slide mode and `--output-dir` for multi-slide mode
- `--num-workers` to control parallelism
- `--no-cache` to disable cache-based skipping and force recomputation
- `--disable-coarse-roi-shortcut` to force legacy full-frame loading at target spacing
- `--coarse-spacing`, `--coarse-roi-margin-um`, and `--processing-tile-size` to tune coarse-to-fine ROI processing
- `--tolerance` to control how much a natural spacing can deviate from the target spacing when selecting the best level for reading the whole slide
- `--min-component-area-um2` to remove tiny tissue blobs
- `--min-hole-area-um2` to fill small holes inside tissue
- `--gaussian-sigma-um` to apply optional pre-threshold Gaussian smoothing
- `--open-radius-um` / `--close-radius-um` for spacing-aware morphological smoothing
- `--spacing-at-level-0` to override level-0 spacing when metadata is incorrect
- `--compression` and `--tile-size` to tune TIFF output

The summary file is saved as `summary.csv` in `--output-dir` (multi-slide mode) or next to `--output` (single-slide mode).
The cache manifest used to decide which slides can be skipped is saved as `cache_manifest.json` in the same directory.
8 changes: 8 additions & 0 deletions hs2p/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,9 @@
from hs2p.wsi.wsi import WholeSlideImage
from hs2p.wsi import (
extract_coordinates,
sample_coordinates,
overlay_mask_on_slide,
visualize_coordinates,
)

__version__ = "1.1.0"
96 changes: 60 additions & 36 deletions hs2p/sampling.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,15 @@
from collections.abc import Sequence

from hs2p.utils import setup, load_csv, fix_random_seeds
from hs2p.wsi import extract_coordinates, filter_coordinates, sample_coordinates, save_coordinates, visualize_coordinates, overlay_mask_on_slide, SamplingParameters
from hs2p.wsi import (
extract_coordinates,
filter_coordinates,
sample_coordinates,
save_coordinates,
visualize_coordinates,
overlay_mask_on_slide,
SamplingParameters,
)


def _validate_visualization_color_mapping(
Expand All @@ -36,7 +44,9 @@ def _validate_visualization_color_mapping(
raise ValueError(
f"color_mapping['{annotation}'] must be None or a length-3 RGB sequence"
)
if any((not isinstance(c, (int, np.integer)) or c < 0 or c > 255) for c in color):
if any(
(not isinstance(c, (int, np.integer)) or c < 0 or c > 255) for c in color
):
raise ValueError(
f"color_mapping['{annotation}'] must contain integers in [0, 255]"
)
Expand All @@ -60,7 +70,7 @@ def get_args_parser(add_help: bool = True):
)
parser.add_argument(
"opts",
help="Modify config options at the end of the command using \"path.key=value\".",
help='Modify config options at the end of the command using "path.key=value".',
default=None,
nargs=argparse.REMAINDER,
)
Expand Down Expand Up @@ -123,18 +133,22 @@ def process_slide(
if not cfg.tiling.sampling_params.independant_sampling:
tissue_mask_visu_path = None
if cfg.visualize and mask_visualize_dir is not None:
tissue_mask_visu_path = Path(mask_visualize_dir, f"{wsi_name}-tissue.png")
coordinates, contour_indices, tile_level, resize_factor, tile_size_lv0 = extract_coordinates(
wsi_path=wsi_path,
mask_path=mask_path,
backend=cfg.tiling.backend,
tiling_params=cfg.tiling.params,
segment_params=cfg.tiling.seg_params,
filter_params=cfg.tiling.filter_params,
sampling_params=sampling_params,
mask_visu_path=tissue_mask_visu_path,
disable_tqdm=disable_tqdm,
num_workers=num_workers,
tissue_mask_visu_path = Path(
mask_visualize_dir, f"{wsi_name}-tissue.png"
)
coordinates, contour_indices, tile_level, resize_factor, tile_size_lv0 = (
extract_coordinates(
wsi_path=wsi_path,
mask_path=mask_path,
backend=cfg.tiling.backend,
tiling_params=cfg.tiling.params,
segment_params=cfg.tiling.seg_params,
filter_params=cfg.tiling.filter_params,
sampling_params=sampling_params,
mask_visu_path=tissue_mask_visu_path,
disable_tqdm=disable_tqdm,
num_workers=num_workers,
)
)
filtered_coordinates, filtered_contour_indices = filter_coordinates(
wsi_path=wsi_path,
Expand All @@ -147,7 +161,7 @@ def process_slide(
tiling_params=cfg.tiling.params,
sampling_params=sampling_params,
disable_tqdm=disable_tqdm,
) # a dict mapping annotation -> coordinates
) # a dict mapping annotation -> coordinates
for annotation, coordinates in filtered_coordinates.items():
if len(coordinates) == 0:
continue
Expand Down Expand Up @@ -188,7 +202,13 @@ def process_slide(
annotation_mask_dir = mask_visualize_dir / annotation
annotation_mask_dir.mkdir(exist_ok=True, parents=True)
tissue_mask_visu_path = Path(annotation_mask_dir, f"{wsi_name}.jpg")
coordinates, contour_indices, tile_level, resize_factor, tile_size_lv0 = sample_coordinates(
(
coordinates,
contour_indices,
tile_level,
resize_factor,
tile_size_lv0,
) = sample_coordinates(
wsi_path=wsi_path,
mask_path=mask_path,
backend=cfg.tiling.backend,
Expand Down Expand Up @@ -270,7 +290,9 @@ def main(args):
if process_list.is_file() and cfg.resume:
process_df = pd.read_csv(process_list)
if "mask_path" not in process_df.columns:
process_df["mask_path"] = [str(p) if p is not None else p for p in mask_paths]
process_df["mask_path"] = [
str(p) if p is not None else p for p in mask_paths
]
else:
process_df["mask_path"] = process_df["mask_path"].apply(
lambda x: str(x) if pd.notna(x) else None
Expand All @@ -288,14 +310,20 @@ def main(args):

skip_sampling = process_df["sampling_status"].str.contains("success").all()

pixel_mapping = {k: v for e in cfg.tiling.sampling_params.pixel_mapping for k, v in e.items()}
tissue_percentage = {k: v for e in cfg.tiling.sampling_params.tissue_percentage for k, v in e.items()}
pixel_mapping = {
k: v for e in cfg.tiling.sampling_params.pixel_mapping for k, v in e.items()
}
tissue_percentage = {
k: v for e in cfg.tiling.sampling_params.tissue_percentage for k, v in e.items()
}
tissue_key_present = True
if "tissue" not in tissue_percentage:
tissue_key_present = False
tissue_percentage["tissue"] = cfg.tiling.params.min_tissue_percentage
if cfg.tiling.sampling_params.color_mapping is not None:
color_mapping = {k: v for e in cfg.tiling.sampling_params.color_mapping for k, v in e.items()}
color_mapping = {
k: v for e in cfg.tiling.sampling_params.color_mapping for k, v in e.items()
}
else:
color_mapping = None

Expand All @@ -311,9 +339,7 @@ def main(args):
process_stack = process_df[mask]
total = len(process_stack)

wsi_paths_to_process = [
Path(x) for x in process_stack.wsi_path.values.tolist()
]
wsi_paths_to_process = [Path(x) for x in process_stack.wsi_path.values.tolist()]
mask_paths_to_process = [
Path(x) if x is not None and not pd.isna(x) else x
for x in process_stack.mask_path.values.tolist()
Expand Down Expand Up @@ -344,9 +370,7 @@ def main(args):
"disable_tqdm": True,
"num_workers": parallel_workers,
}
for wsi_fp, mask_fp in zip(
wsi_paths_to_process, mask_paths_to_process
)
for wsi_fp, mask_fp in zip(wsi_paths_to_process, mask_paths_to_process)
]
results = list(
tqdm.tqdm(
Expand All @@ -361,16 +385,16 @@ def main(args):
sampling_updates[wsi_path] = status_info

for wsi_path, status_info in sampling_updates.items():
process_df.loc[
process_df["wsi_path"] == wsi_path, "sampling_status"
] = status_info["status"]
process_df.loc[process_df["wsi_path"] == wsi_path, "sampling_status"] = (
status_info["status"]
)
if "error" in status_info:
process_df.loc[
process_df["wsi_path"] == wsi_path, "error"
] = status_info["error"]
process_df.loc[
process_df["wsi_path"] == wsi_path, "traceback"
] = status_info["traceback"]
process_df.loc[process_df["wsi_path"] == wsi_path, "error"] = (
status_info["error"]
)
process_df.loc[process_df["wsi_path"] == wsi_path, "traceback"] = (
status_info["traceback"]
)
process_df.to_csv(process_list, index=False)

# summary logging
Expand Down
81 changes: 48 additions & 33 deletions hs2p/tiling.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,12 @@
from pathlib import Path

from hs2p.utils import setup, load_csv, fix_random_seeds
from hs2p.wsi import extract_coordinates, save_coordinates, visualize_coordinates, SamplingParameters
from hs2p.wsi import (
extract_coordinates,
save_coordinates,
visualize_coordinates,
SamplingParameters,
)


def get_args_parser(add_help: bool = True):
Expand All @@ -29,7 +34,7 @@ def get_args_parser(add_help: bool = True):
)
parser.add_argument(
"opts",
help="Modify config options at the end of the command using \"path.key=value\".",
help='Modify config options at the end of the command using "path.key=value".',
default=None,
nargs=argparse.REMAINDER,
)
Expand Down Expand Up @@ -57,7 +62,11 @@ def process_slide(
wsi_name = wsi_path.stem.replace(" ", "_")
if cfg.tiling.read_coordinates_from is not None:
coordinates_path = Path(cfg.tiling.read_coordinates_from, f"{wsi_name}.npy")
if coordinates_path.is_file() and cfg.visualize and tile_visualize_dir is not None:
if (
coordinates_path.is_file()
and cfg.visualize
and tile_visualize_dir is not None
):
coordinates_arr = np.load(coordinates_path, allow_pickle=True)
coordinates = list(zip(coordinates_arr["x"], coordinates_arr["y"]))
tile_size_lv0 = coordinates_arr["tile_size_lv0"][0]
Expand All @@ -74,17 +83,19 @@ def process_slide(
tissue_mask_visu_path = None
if cfg.visualize and mask_visualize_dir is not None:
tissue_mask_visu_path = Path(mask_visualize_dir, f"{wsi_name}.jpg")
coordinates, contour_indices, tile_level, resize_factor, tile_size_lv0 = extract_coordinates(
wsi_path=wsi_path,
mask_path=mask_path,
backend=cfg.tiling.backend,
tiling_params=cfg.tiling.params,
segment_params=cfg.tiling.seg_params,
filter_params=cfg.tiling.filter_params,
sampling_params=sampling_params,
mask_visu_path=tissue_mask_visu_path,
disable_tqdm=disable_tqdm,
num_workers=num_workers,
coordinates, contour_indices, tile_level, resize_factor, tile_size_lv0 = (
extract_coordinates(
wsi_path=wsi_path,
mask_path=mask_path,
backend=cfg.tiling.backend,
tiling_params=cfg.tiling.params,
segment_params=cfg.tiling.seg_params,
filter_params=cfg.tiling.filter_params,
sampling_params=sampling_params,
mask_visu_path=tissue_mask_visu_path,
disable_tqdm=disable_tqdm,
num_workers=num_workers,
)
)
coordinates_dir = Path(cfg.output_dir, "coordinates")
coordinates_path = Path(coordinates_dir, f"{wsi_name}.npy")
Expand Down Expand Up @@ -136,7 +147,9 @@ def main(args):
if process_list.is_file() and cfg.resume:
process_df = pd.read_csv(process_list)
if "mask_path" not in process_df.columns:
process_df["mask_path"] = [str(p) if p is not None else p for p in mask_paths]
process_df["mask_path"] = [
str(p) if p is not None else p for p in mask_paths
]
else:
process_df["mask_path"] = process_df["mask_path"].apply(
lambda x: str(x) if pd.notna(x) else None
Expand All @@ -154,12 +167,18 @@ def main(args):

skip_tiling = process_df["tiling_status"].str.contains("success").all()

pixel_mapping = {k: v for e in cfg.tiling.sampling_params.pixel_mapping for k, v in e.items()}
tissue_percentage = {k: v for e in cfg.tiling.sampling_params.tissue_percentage for k, v in e.items()}
pixel_mapping = {
k: v for e in cfg.tiling.sampling_params.pixel_mapping for k, v in e.items()
}
tissue_percentage = {
k: v for e in cfg.tiling.sampling_params.tissue_percentage for k, v in e.items()
}
if "tissue" not in tissue_percentage:
tissue_percentage["tissue"] = cfg.tiling.params.min_tissue_percentage
if cfg.tiling.sampling_params.color_mapping is not None:
color_mapping = {k: v for e in cfg.tiling.sampling_params.color_mapping for k, v in e.items()}
color_mapping = {
k: v for e in cfg.tiling.sampling_params.color_mapping for k, v in e.items()
}
else:
color_mapping = None

Expand All @@ -175,9 +194,7 @@ def main(args):
process_stack = process_df[mask]
total = len(process_stack)

wsi_paths_to_process = [
Path(x) for x in process_stack.wsi_path.values.tolist()
]
wsi_paths_to_process = [Path(x) for x in process_stack.wsi_path.values.tolist()]
mask_paths_to_process = [
Path(x) if x is not None and not pd.isna(x) else x
for x in process_stack.mask_path.values.tolist()
Expand Down Expand Up @@ -211,9 +228,7 @@ def main(args):
"disable_tqdm": True,
"num_workers": parallel_workers,
}
for wsi_fp, mask_fp in zip(
wsi_paths_to_process, mask_paths_to_process
)
for wsi_fp, mask_fp in zip(wsi_paths_to_process, mask_paths_to_process)
]
results = list(
tqdm.tqdm(
Expand All @@ -228,16 +243,16 @@ def main(args):
tiling_updates[wsi_path] = status_info

for wsi_path, status_info in tiling_updates.items():
process_df.loc[
process_df["wsi_path"] == wsi_path, "tiling_status"
] = status_info["status"]
process_df.loc[process_df["wsi_path"] == wsi_path, "tiling_status"] = (
status_info["status"]
)
if "error" in status_info:
process_df.loc[
process_df["wsi_path"] == wsi_path, "error"
] = status_info["error"]
process_df.loc[
process_df["wsi_path"] == wsi_path, "traceback"
] = status_info["traceback"]
process_df.loc[process_df["wsi_path"] == wsi_path, "error"] = (
status_info["error"]
)
process_df.loc[process_df["wsi_path"] == wsi_path, "traceback"] = (
status_info["traceback"]
)
process_df.to_csv(process_list, index=False)

# summary logging
Expand Down
2 changes: 1 addition & 1 deletion hs2p/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@
update_state_dict,
)
from .log_utils import setup_logging
from .config import setup
from .config import setup
Loading