From ffa55609ded450b288d7929158b6e059ad36c947 Mon Sep 17 00:00:00 2001 From: Cynthia Lo Date: Mon, 12 Jan 2026 03:53:19 -0800 Subject: [PATCH 1/6] Add support for the ct report type --- README.md | 7 +- configs/default.yaml | 2 + dev_environment.yaml | 1 - pyproject.toml | 1 - src/radfact/llm_utils/nli/processor.py | 36 +- .../nli/prompts/ct/few_shot_examples.json | 627 ++++++++++++++++++ .../ct/system_message_ev_singlephrase.txt | 1 + .../prompts/{ => cxr}/few_shot_examples.json | 0 .../system_message_ev_singlephrase.txt | 0 src/radfact/llm_utils/prompt_tasks.py | 47 ++ .../llm_utils/report_to_phrases/processor.py | 29 +- .../prompts/ct/few_shot_examples.json | 471 +++++++++++++ .../prompts/ct/system_message.txt | 13 + .../prompts/{ => cxr}/few_shot_examples.json | 0 .../prompts/{ => cxr}/system_message.txt | 0 tests/metric/test_radfact.py | 49 ++ 16 files changed, 1258 insertions(+), 26 deletions(-) create mode 100644 src/radfact/llm_utils/nli/prompts/ct/few_shot_examples.json create mode 100644 src/radfact/llm_utils/nli/prompts/ct/system_message_ev_singlephrase.txt rename src/radfact/llm_utils/nli/prompts/{ => cxr}/few_shot_examples.json (100%) rename src/radfact/llm_utils/nli/prompts/{ => cxr}/system_message_ev_singlephrase.txt (100%) create mode 100644 src/radfact/llm_utils/prompt_tasks.py create mode 100644 src/radfact/llm_utils/report_to_phrases/prompts/ct/few_shot_examples.json create mode 100644 src/radfact/llm_utils/report_to_phrases/prompts/ct/system_message.txt rename src/radfact/llm_utils/report_to_phrases/prompts/{ => cxr}/few_shot_examples.json (100%) rename src/radfact/llm_utils/report_to_phrases/prompts/{ => cxr}/system_message.txt (100%) diff --git a/README.md b/README.md index 3bb06da..05c248c 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ RadFact is a framework for the evaluation of model-generated radiology reports given a ground-truth report, **with or without grounding**. 
Leveraging the logical inference capabilities of large language models, RadFact is not a single number but a _suite_ of metrics, capturing aspects of precision and recall at text-only and text-and-grounding levels. -RadFact was introduced in [MAIRA-2: Grounded Radiology Report Generation](https://aka.ms/maira-2). Here we provide an open-source implementation of the metric to facilitate its use and development. +RadFact was introduced in [MAIRA-2: Grounded Radiology Report Generation](https://aka.ms/maira-2). Here we provide an open-source implementation of the metric to facilitate its use and development. The RadFact metric currently supports both `cxr` and `ct` report types. ## Table of Contents @@ -173,6 +173,11 @@ options: Refer to the example input files in the [`examples`](examples) directory for the expected format of the input files. The input files should be in the format of a CSV file for non-grounded reports [findings_generation_examples.csv](examples/findings_generation_examples.csv) and a JSON file for grounded reports [grounded_reporting_examples.json](examples/grounded_reporting_examples.json). +RadFact supports different report types through the `report_type` field in [`configs/default.yaml`](configs/default.yaml). Currently supported options are: + +- `cxr` - Chest X-ray reports (default) +- `ct` - CT scan reports + The script computes confidence intervals for the metrics using bootstrapping. The number of bootstrap samples can be controlled using the `--bootstrap_samples` argument. The default value is 500. To disable bootstrapping, set `--bootstrap_samples 0`. ⚠️**WARNING**: Some queries may fail due to the endpoint limitations (timeouts, rate limits, etc.). When the LLM performing entailment verification fails, we **set these examples as not-entailed by default**. If this occurs in a significant number of cases, the results will not be reliable. The final metrics dict contains the number of such skipped queries under the key `num_llm_failures`. 
The script will print the number of skipped queries at the end of the run, and store these in the `skipped` directroy under the run id folder. You will also see a warning message in the logs for each failed query. `WARNING: No response for example {query_id}. Setting as NOT ENTAILED`. diff --git a/configs/default.yaml b/configs/default.yaml index 28c4869..9878825 100644 --- a/configs/default.yaml +++ b/configs/default.yaml @@ -21,6 +21,8 @@ processing: end_index: null output_filename: "outputs.json" +report_type: "cxr" + # The type of cache that should be set for langchain. This can be either "redis" or "sqlite". # Sqlite cache is useful for local development, it will be written to ~/.langchain.db # Redis cache is useful to share state across many evaluation runs in AzureML diff --git a/dev_environment.yaml b/dev_environment.yaml index 7b39764..94ec7d8 100644 --- a/dev_environment.yaml +++ b/dev_environment.yaml @@ -311,7 +311,6 @@ dependencies: - pyparsing==3.2.0 - pysocks==1.7.1 - pytest==8.3.3 - - pytest-lazy-fixture==0.6.3 - python-dateutil==2.9.0.post0 - pytz==2024.2 - pyyaml==6.0.2 diff --git a/pyproject.toml b/pyproject.toml index f6a4eaa..5d4ee0e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -78,7 +78,6 @@ test = [ "mock", "pandas-stubs", "pytest", - "pytest-lazy-fixture", ] [project.urls] diff --git a/src/radfact/llm_utils/nli/processor.py b/src/radfact/llm_utils/nli/processor.py index add77f0..6228f7e 100644 --- a/src/radfact/llm_utils/nli/processor.py +++ b/src/radfact/llm_utils/nli/processor.py @@ -10,6 +10,7 @@ from typing import Any, Callable import pandas as pd +from radfact.llm_utils.prompt_tasks import NLITaskOptions, ReportType from langchain_core.language_models import BaseLanguageModel from langchain_core.messages import BaseMessage from omegaconf import DictConfig @@ -35,11 +36,9 @@ StructuredProcessor, simple_formatter, ) -from radfact.paths import OUTPUT_DIR, get_prompts_dir +from radfact.paths import OUTPUT_DIR logger = 
logging.getLogger(__name__) -PARSING_TASK = "nli" -PROMPTS_DIR = get_prompts_dir(task=PARSING_TASK) RADFACT_SUBFOLDER = "radfact" @@ -49,22 +48,22 @@ class MetricDataframeKeys(str, Enum): STUDY_ID = "study_id" -def get_ev_processor_singlephrase(log_dir: Path) -> StructuredProcessor[ComparisonQuerySinglePhrase, EvidencedPhrase]: +def get_ev_processor_singlephrase( + report_type: ReportType, log_dir: Path +) -> StructuredProcessor[ComparisonQuerySinglePhrase, EvidencedPhrase]: """ Helper function to load the NLI processor with the correct system prompt and few-shot examples. The setting here is to classify a SINGLE PHRASE at a time given the reference report. Further, we do entailment verification, aka the binary version of NLI. - :param api_arguments: API arguments for the LLM. + :param report_type: The type of report, e.g., "ReportType.CXR" or "ReportType.CT". :param log_dir: Directory to save logs. :return: Processor for entailment verification. """ - - system_prompt_path = PROMPTS_DIR / "system_message_ev_singlephrase.txt" - few_shot_examples_path = PROMPTS_DIR / "few_shot_examples.json" - system_prompt = system_prompt_path.read_text() - few_shot_examples = load_examples_from_json(json_path=few_shot_examples_path, binary=True) + task = NLITaskOptions[report_type.name].value + system_prompt = task.system_message_path.read_text() + few_shot_examples = load_examples_from_json(json_path=task.few_shot_examples_path, binary=True) # The few-shots are in the bidirectional format, we need to convert them to single-phrase. 
few_shot_examples_single_phrase: list[NLISampleSinglePhrase] = [] for few_shot_example in few_shot_examples: @@ -94,10 +93,12 @@ class ReportGroundingNLIProcessor(BaseProcessor[NLIQuerySample, NLISample]): NUM_LLM_SUCCESS = "num_llm_success" NUM_LLM_PHRASE_REWRITES = "num_llm_phrase_rewrites" - def __init__(self, format_query_fn: Callable[..., Any] | None = None) -> None: + def __init__(self, report_type: ReportType, format_query_fn: Callable[..., Any] | None = None) -> None: super().__init__() self.format_query_fn = format_query_fn - self.phrase_processor = get_ev_processor_singlephrase(log_dir=OUTPUT_DIR / "ev_processor_logs") + self.phrase_processor = get_ev_processor_singlephrase( + report_type=report_type, log_dir=OUTPUT_DIR / "ev_processor_logs" + ) # Logging errors self.num_llm_failures = 0 self.num_llm_success = 0 @@ -190,7 +191,16 @@ def get_report_nli_engine( cfg: DictConfig, candidates: dict[str, GroundedPhraseList], references: dict[str, GroundedPhraseList] ) -> LLMEngine: output_folder = get_subfolder(root=OUTPUT_DIR, subfolder=RADFACT_SUBFOLDER) - nli_report_processor = ReportGroundingNLIProcessor(format_query_fn=format_row_to_nli_query_sample) + report_type_value = cfg.get("report_type") + try: + report_type = ReportType(report_type_value) + except ValueError as e: + raise ValueError( + f"Invalid report_type '{report_type_value}'. 
Valid options are: {[rt.value for rt in ReportType]}" + ) from e + nli_report_processor = ReportGroundingNLIProcessor( + report_type=report_type, format_query_fn=format_row_to_nli_query_sample + ) dataset_df = pd.DataFrame( { MetricDataframeKeys.STUDY_ID: study_id, diff --git a/src/radfact/llm_utils/nli/prompts/ct/few_shot_examples.json b/src/radfact/llm_utils/nli/prompts/ct/few_shot_examples.json new file mode 100644 index 0000000..bcd0b3c --- /dev/null +++ b/src/radfact/llm_utils/nli/prompts/ct/few_shot_examples.json @@ -0,0 +1,627 @@ +[ + { + "example_id": "ct_example1_study1_study2", + "study_id": "nan", + "input": { + "phrases_A": [ + "There is a 2 cm renal cyst in the left kidney.", + "The liver shows signs of mild hepatic steatosis.", + "No evidence of free air in the abdominal cavity." + ], + "phrases_B": [ + "A 2 cm hypodense lesion is present in the left kidney, consistent with a cyst.", + "Diffuse hepatic steatosis is noted.", + "Free air is visible under the diaphragm." + ] + }, + "rationale": "rationale", + "output": { + "phrases_A_evidenced": [ + { + "evidence": [ + "A 2 cm hypodense lesion is present in the left kidney, consistent with a cyst." + ], + "phrase": "There is a 2 cm renal cyst in the left kidney.", + "status": "entailment" + }, + { + "evidence": [ + "Diffuse hepatic steatosis is noted." + ], + "phrase": "The liver shows signs of mild hepatic steatosis.", + "status": "entailment" + }, + { + "evidence": [ + "Free air is visible under the diaphragm." + ], + "phrase": "No evidence of free air in the abdominal cavity.", + "status": "contradiction" + } + ], + "phrases_B_evidenced": [ + { + "evidence": [ + "There is a 2 cm renal cyst in the left kidney." + ], + "phrase": "A 2 cm hypodense lesion is present in the left kidney, consistent with a cyst.", + "status": "entailment" + }, + { + "evidence": [ + "The liver shows signs of mild hepatic steatosis." 
+ ], + "phrase": "Diffuse hepatic steatosis is noted.", + "status": "entailment" + }, + { + "evidence": [ + "No evidence of free air in the abdominal cavity." + ], + "phrase": "Free air is visible under the diaphragm.", + "status": "contradiction" + } + ] + } +}, +{ + "example_id": "ct_example2_study3_study4", + "study_id": "nan", + "input": { + "phrases_A": [ + "There is a small renal cyst in the left kidney.", + "The liver appears normal in size and shape.", + "The lungs are clear with no signs of infiltrates." + ], + "phrases_B": [ + "A small cyst is noted in the left kidney.", + "The liver is unremarkable.", + "Heart size is within normal limits." + ] + }, + "rationale": "rationale", + "output": { + "phrases_A_evidenced": [ + { + "evidence": [ + "A small cyst is noted in the left kidney." + ], + "phrase": "There is a small renal cyst in the left kidney.", + "status": "entailment" + }, + { + "evidence": [ + "The liver is unremarkable." + ], + "phrase": "The liver appears normal in size and shape.", + "status": "entailment" + }, + { + "evidence": [], + "phrase": "The lungs are clear with no signs of infiltrates.", + "status": "neutral" + } + ], + "phrases_B_evidenced": [ + { + "evidence": [ + "There is a small renal cyst in the left kidney." + ], + "phrase": "A small cyst is noted in the left kidney.", + "status": "entailment" + }, + { + "evidence": [ + "The liver appears normal in size and shape." + ], + "phrase": "The liver is unremarkable.", + "status": "entailment" + }, + { + "evidence": [], + "phrase": "Heart size is within normal limits.", + "status": "neutral" + } + ] + } +}, +{ + "example_id": "ct_example3_study5_study6", + "study_id": "nan", + "input": { + "phrases_A": [ + "The pancreas shows evidence of atrophy.", + "Hydronephrosis noted in the right kidney.", + "No gallstones are present in the gallbladder." 
+ ], + "phrases_B": [ + "Atrophy of the pancreatic tissue is visible.", + "Moderate right hydronephrosis is identified.", + "Multiple gallstones seen in the gallbladder." + ] + }, + "rationale": "rationale", + "output": { + "phrases_A_evidenced": [ + { + "evidence": [ + "Atrophy of the pancreatic tissue is visible." + ], + "phrase": "The pancreas shows evidence of atrophy.", + "status": "entailment" + }, + { + "evidence": [ + "Moderate right hydronephrosis is identified." + ], + "phrase": "Hydronephrosis noted in the right kidney.", + "status": "entailment" + }, + { + "evidence": [ + "Multiple gallstones seen in the gallbladder." + ], + "phrase": "No gallstones are present in the gallbladder.", + "status": "contradiction" + } + ], + "phrases_B_evidenced": [ + { + "evidence": [ + "The pancreas shows evidence of atrophy." + ], + "phrase": "Atrophy of the pancreatic tissue is visible.", + "status": "entailment" + }, + { + "evidence": [ + "Hydronephrosis noted in the right kidney." + ], + "phrase": "Moderate right hydronephrosis is identified.", + "status": "entailment" + }, + { + "evidence": [ + "No gallstones are present in the gallbladder." + ], + "phrase": "Multiple gallstones seen in the gallbladder.", + "status": "contradiction" + } + ] + } +}, +{ + "example_id": "ct_example4_study7_study8", + "study_id": "nan", + "input": { + "phrases_A": [ + "No signs of pulmonary embolism.", + "The aorta is of normal caliber.", + "There is a small hiatal hernia." + ], + "phrases_B": [ + "The pulmonary arteries are clear with no embolism detected.", + "The aorta is unremarkable.", + "There is no evidence of renal calculi." + ] + }, + "rationale": "rationale", + "output": { + "phrases_A_evidenced": [ + { + "evidence": [ + "The pulmonary arteries are clear with no embolism detected." + ], + "phrase": "No signs of pulmonary embolism.", + "status": "entailment" + }, + { + "evidence": [ + "The aorta is unremarkable." 
+ ], + "phrase": "The aorta is of normal caliber.", + "status": "entailment" + }, + { + "evidence": [], + "phrase": "There is a small hiatal hernia.", + "status": "neutral" + } + ], + "phrases_B_evidenced": [ + { + "evidence": [ + "No signs of pulmonary embolism." + ], + "phrase": "The pulmonary arteries are clear with no embolism detected.", + "status": "entailment" + }, + { + "evidence": [ + "The aorta is of normal caliber." + ], + "phrase": "The aorta is unremarkable.", + "status": "entailment" + }, + { + "evidence": [], + "phrase": "There is no evidence of renal calculi.", + "status": "neutral" + } + ] + } +}, +{ + "example_id": "ct_example5_study9_study10", + "study_id": "nan", + "input": { + "phrases_A": [ + "Mild cardiomegaly is observed.", + "No evidence of bowel obstruction.", + "The spleen is of normal size." + ], + "phrases_B": [ + "Heart size is slightly enlarged.", + "The bowel loops are normal with no signs of obstruction.", + "The pancreas shows no abnormalities." + ] + }, + "rationale": "rationale", + "output": { + "phrases_A_evidenced": [ + { + "evidence": [ + "Heart size is slightly enlarged." + ], + "phrase": "Mild cardiomegaly is observed.", + "status": "entailment" + }, + { + "evidence": [ + "The bowel loops are normal with no signs of obstruction." + ], + "phrase": "No evidence of bowel obstruction.", + "status": "entailment" + }, + { + "evidence": [], + "phrase": "The spleen is of normal size.", + "status": "neutral" + } + ], + "phrases_B_evidenced": [ + { + "evidence": [ + "Mild cardiomegaly is observed." + ], + "phrase": "Heart size is slightly enlarged.", + "status": "entailment" + }, + { + "evidence": [ + "No evidence of bowel obstruction." 
+ ], + "phrase": "The bowel loops are normal with no signs of obstruction.", + "status": "entailment" + }, + { + "evidence": [], + "phrase": "The pancreas shows no abnormalities.", + "status": "neutral" + } + ] + } +}, +{ + "example_id": "ct_example6_study11_study12", + "study_id": "nan", + "input": { + "phrases_A": [ + "The heart size is normal on CT.", + "No infiltrates or masses identified in the lungs.", + "No pleural effusion seen on CT scan." + ], + "phrases_B": [ + "Multiple nodular densities in both lungs.", + "Heart size is normal on CT.", + "No pneumothorax detected." + ] + }, + "output": { + "phrases_A_evidenced": [ + { + "evidence": [ + "Heart size is normal on CT." + ], + "phrase": "The heart size is normal on CT.", + "status": "entailment" + }, + { + "evidence": [ + "Multiple nodular densities in both lungs." + ], + "phrase": "No infiltrates or masses identified in the lungs.", + "status": "contradiction" + }, + { + "evidence": [], + "phrase": "No pleural effusion seen on CT scan.", + "status": "neutral" + } + ], + "phrases_B_evidenced": [ + { + "evidence": [ + "No infiltrates or masses identified in the lungs." + ], + "phrase": "Multiple nodular densities in both lungs.", + "status": "contradiction" + }, + { + "evidence": [ + "The heart size is normal on CT." + ], + "phrase": "Heart size is normal on CT.", + "status": "entailment" + }, + { + "evidence": [], + "phrase": "No pneumothorax detected.", + "status": "neutral" + } + ] + } +}, +{ + "example_id": "1.3.6.1.4.1.55648.016633401174206362235121936818612914820_1.3.6.1.4.1.55648.241546114664305764079189793827197454684", + "study_id": "nan", + "input": { + "phrases_A": [ + "Scarring or atelectasis in the left lung base is seen.", + "Cardiomediastinum is unremarkable.", + "Degenerative changes are seen throughout the spine." + ], + "phrases_B": [ + "The lungs are clear.", + "The cardiomediastinal structures are unremarkable.", + "There are no pleural effusions." 
+ ] + }, + "rationale": "rationale", + "output": { + "phrases_A_evidenced": [ + { + "evidence": [ + "The lungs are clear." + ], + "phrase": "Scarring or atelectasis in the left lung base is seen.", + "status": "contradiction" + }, + { + "evidence": [ + "The cardiomediastinal structures are unremarkable." + ], + "phrase": "Cardiomediastinum is unremarkable.", + "status": "entailment" + }, + { + "evidence": [], + "phrase": "Degenerative changes are seen throughout the spine.", + "status": "neutral" + } + ], + "phrases_B_evidenced": [ + { + "evidence": [ + "Scarring or atelectasis in the left lung base is seen." + ], + "phrase": "The lungs are clear.", + "status": "contradiction" + }, + { + "evidence": [ + "Cardiomediastinum is unremarkable." + ], + "phrase": "The cardiomediastinal structures are unremarkable.", + "status": "entailment" + }, + { + "evidence": [], + "phrase": "There are no pleural effusions.", + "status": "neutral" + } + ] + } +}, +{ + "example_id": "1.3.6.1.4.1.55648.192816863503104216516892632261380714559_1.3.6.1.4.1.55648.287593655440699590688075363115862054018", + "study_id": "nan", + "input": { + "phrases_A": [ + "A moderate size left pleural effusion slightly larger in size.", + "Pacemaker is unchanged.", + "There is persistent consolidation in the left lung base." + ], + "phrases_B": [ + "There are small bilateral pleural effusions that have developed since prior study.", + "Left basilar consolidation is present.", + "The heart size is within normal limits.", + "No acute chest wall abnormality is radiographically evident." + ] + }, + "rationale": "rationale", + "output": { + "phrases_A_evidenced": [ + { + "evidence": [ + "There are small bilateral pleural effusions that have developed since prior study." 
+ ], + "phrase": "A moderate size left pleural effusion slightly larger in size.", + "status": "contradiction" + }, + { + "evidence": [], + "phrase": "Pacemaker is unchanged.", + "status": "neutral" + }, + { + "evidence": [], + "phrase": "There is persistent consolidation in the left lung base.", + "status": "neutral" + } + ], + "phrases_B_evidenced": [ + { + "evidence": [], + "phrase": "There are small bilateral pleural effusions that have developed since prior study.", + "status": "neutral" + }, + { + "evidence": [ + "There is persistent consolidation in the left lung base." + ], + "phrase": "Left basilar consolidation is present.", + "status": "entailment" + }, + { + "evidence": [], + "phrase": "The heart size is within normal limits.", + "status": "neutral" + }, + { + "evidence": [], + "phrase": "No acute chest wall abnormality is radiographically evident.", + "status": "neutral" + } + ] + } +}, +{ + "example_id": "1.3.6.1.4.1.55648.144771367909794201071026393675954665550_1.3.6.1.4.1.55648.84420269742583802251982398782271710226", + "study_id": "nan", + "input": { + "phrases_A": [ + "The heart is within normal limits.", + "Pulmonary vascularity is unremarkable.", + "There are patchy bibasilar infiltrates." + ], + "phrases_B": [ + "The left-sided pneumothorax has enlarged.", + "The left-sided pigtail chest tube remains in place.", + "Heart size is normal.", + "There is pulmonary consolidation." + ] + }, + "rationale": "rationale", + "output": { + "phrases_A_evidenced": [ + { + "evidence": [ + "Heart size is normal." 
+ ], + "phrase": "The heart is within normal limits.", + "status": "entailment" + }, + { + "evidence": [], + "phrase": "Pulmonary vascularity is unremarkable.", + "status": "neutral" + }, + { + "evidence": [], + "phrase": "There are patchy bibasilar infiltrates.", + "status": "neutral" + } + ], + "phrases_B_evidenced": [ + { + "evidence": [], + "phrase": "The left-sided pneumothorax has enlarged.", + "status": "neutral" + }, + { + "evidence": [], + "phrase": "The left-sided pigtail chest tube remains in place.", + "status": "neutral" + }, + { + "evidence": [ + "The heart is within normal limits." + ], + "phrase": "Heart size is normal.", + "status": "entailment" + }, + { + "evidence": [ + "There are patchy bibasilar infiltrates." + ], + "phrase": "There is pulmonary consolidation.", + "status": "entailment" + } + ] + } +}, +{ + "example_id": "ct_example7_study13_study14", + "study_id": "nan", + "input": { + "phrases_A": [ + "CT confirms bilateral ground-glass opacities.", + "No evidence of aortic aneurysm.", + "Liver is enlarged with fatty infiltration." + ], + "phrases_B": [ + "Ground-glass opacities are absent in the left lung.", + "Ground-glass opacities not found in the right lung.", + "Normal liver size and texture." + ] + }, + "output": { + "phrases_A_evidenced": [ + { + "evidence": [ + "Ground-glass opacities are absent in the left lung.", + "Ground-glass opacities not found in the right lung." + ], + "phrase": "CT confirms bilateral ground-glass opacities.", + "status": "contradiction" + }, + { + "evidence": [], + "phrase": "No evidence of aortic aneurysm.", + "status": "neutral" + }, + { + "evidence": [ + "Normal liver size and texture." + ], + "phrase": "Liver is enlarged with fatty infiltration.", + "status": "contradiction" + } + ], + "phrases_B_evidenced": [ + { + "evidence": [ + "CT confirms bilateral ground-glass opacities." 
+ ], + "phrase": "Ground-glass opacities are absent in the left lung.", + "status": "contradiction" + }, + { + "evidence": [ + "CT confirms bilateral ground-glass opacities." + ], + "phrase": "Ground-glass opacities not found in the right lung.", + "status": "contradiction" + }, + { + "evidence": [ + "Liver is enlarged with fatty infiltration." + ], + "phrase": "Normal liver size and texture.", + "status": "contradiction" + } + ] + } +} +] \ No newline at end of file diff --git a/src/radfact/llm_utils/nli/prompts/ct/system_message_ev_singlephrase.txt b/src/radfact/llm_utils/nli/prompts/ct/system_message_ev_singlephrase.txt new file mode 100644 index 0000000..164f2c9 --- /dev/null +++ b/src/radfact/llm_utils/nli/prompts/ct/system_message_ev_singlephrase.txt @@ -0,0 +1 @@ +You are an AI radiology assistant. Your task is to assess whether a statement about a CT scan (the "hypothesis") is true or not, given a reference report about the CT scan. This task is known as entailment verification. If the statement is true ("entailed") according to the reference, provide the evidence to support it. 
\ No newline at end of file diff --git a/src/radfact/llm_utils/nli/prompts/few_shot_examples.json b/src/radfact/llm_utils/nli/prompts/cxr/few_shot_examples.json similarity index 100% rename from src/radfact/llm_utils/nli/prompts/few_shot_examples.json rename to src/radfact/llm_utils/nli/prompts/cxr/few_shot_examples.json diff --git a/src/radfact/llm_utils/nli/prompts/system_message_ev_singlephrase.txt b/src/radfact/llm_utils/nli/prompts/cxr/system_message_ev_singlephrase.txt similarity index 100% rename from src/radfact/llm_utils/nli/prompts/system_message_ev_singlephrase.txt rename to src/radfact/llm_utils/nli/prompts/cxr/system_message_ev_singlephrase.txt diff --git a/src/radfact/llm_utils/prompt_tasks.py b/src/radfact/llm_utils/prompt_tasks.py new file mode 100644 index 0000000..6067b67 --- /dev/null +++ b/src/radfact/llm_utils/prompt_tasks.py @@ -0,0 +1,47 @@ +from dataclasses import dataclass +from enum import Enum +from pathlib import Path +from radfact.paths import get_prompts_dir + +REPORT_TO_PHRASES_PARSING_TASK = "report_to_phrases" +REPORT_TO_PHRASES_PROMPTS_DIR = get_prompts_dir(task=REPORT_TO_PHRASES_PARSING_TASK) +NLI_PARSING_TASK = "nli" +NLI_PROMPTS_DIR = get_prompts_dir(task=NLI_PARSING_TASK) + + +class ReportType(str, Enum): + CXR = "cxr" + CT = "ct" + + +@dataclass(frozen=True) +class PromptTask: + name: str + system_message_path: Path + few_shot_examples_path: Path + + +class ReportToPhrasesTaskOptions(Enum): + CXR = PromptTask( + name=f"{ReportType.CXR.value}_report_to_phrases", + system_message_path=REPORT_TO_PHRASES_PROMPTS_DIR / ReportType.CXR.value / "system_message.txt", + few_shot_examples_path=REPORT_TO_PHRASES_PROMPTS_DIR / ReportType.CXR.value / "few_shot_examples.json", + ) + CT = PromptTask( + name=f"{ReportType.CT.value}_report_to_phrases", + system_message_path=REPORT_TO_PHRASES_PROMPTS_DIR / ReportType.CT.value / "system_message.txt", + few_shot_examples_path=REPORT_TO_PHRASES_PROMPTS_DIR / ReportType.CT.value / 
"few_shot_examples.json", + ) + + +class NLITaskOptions(Enum): + CXR = PromptTask( + name=f"{ReportType.CXR.value}_nli", + system_message_path=NLI_PROMPTS_DIR / ReportType.CXR.value / "system_message_ev_singlephrase.txt", + few_shot_examples_path=NLI_PROMPTS_DIR / ReportType.CXR.value / "few_shot_examples.json", + ) + CT = PromptTask( + name=f"{ReportType.CT.value}_nli", + system_message_path=NLI_PROMPTS_DIR / ReportType.CT.value / "system_message_ev_singlephrase.txt", + few_shot_examples_path=NLI_PROMPTS_DIR / ReportType.CT.value / "few_shot_examples.json", + ) diff --git a/src/radfact/llm_utils/report_to_phrases/processor.py b/src/radfact/llm_utils/report_to_phrases/processor.py index 86f4b38..b88c5ab 100644 --- a/src/radfact/llm_utils/report_to_phrases/processor.py +++ b/src/radfact/llm_utils/report_to_phrases/processor.py @@ -7,29 +7,30 @@ from typing import Any import pandas as pd +from radfact.llm_utils.prompt_tasks import REPORT_TO_PHRASES_PARSING_TASK, ReportToPhrasesTaskOptions, ReportType from omegaconf import DictConfig from radfact.llm_utils.engine.engine import LLMEngine, get_subfolder from radfact.llm_utils.processor.structured_processor import StructuredProcessor from radfact.llm_utils.report_to_phrases.schema import ParsedReport, load_examples_from_json -from radfact.paths import OUTPUT_DIR, get_prompts_dir +from radfact.paths import OUTPUT_DIR FINDINGS_SECTION = "FINDINGS" -PARSING_TASK = "report_to_phrases" -PROMPTS_DIR = get_prompts_dir(task=PARSING_TASK) StudyIdType = str | int -def get_report_to_phrases_processor(log_dir: Path | None = None) -> StructuredProcessor[str, ParsedReport]: +def get_report_to_phrases_processor( + report_type: ReportType, log_dir: Path | None = None +) -> StructuredProcessor[str, ParsedReport]: """Return a processor for converting reports to phrases. + :param report_type: The type of report, e.g., "ReportType.CXR" or "ReportType.CT". :param log_dir: The directory to save logs. 
:return: The processor for report to phrase conversion. """ - system_message_path = PROMPTS_DIR / "system_message.txt" - few_shot_examples_path = PROMPTS_DIR / "few_shot_examples.json" - system_prompt = system_message_path.read_text() - few_shot_examples = load_examples_from_json(few_shot_examples_path) + task = ReportToPhrasesTaskOptions[report_type.name].value + system_prompt = task.system_message_path.read_text() + few_shot_examples = load_examples_from_json(task.few_shot_examples_path) processor = StructuredProcessor( query_type=str, result_type=ParsedReport, @@ -58,12 +59,20 @@ def get_report_to_phrases_engine(cfg: DictConfig, dataset_df: pd.DataFrame) -> L :return: The processing engine. """ subfolder = cfg.dataset.name - root = OUTPUT_DIR / PARSING_TASK + root = OUTPUT_DIR / REPORT_TO_PHRASES_PARSING_TASK output_folder = get_subfolder(root, subfolder) final_output_folder = get_subfolder(root, subfolder) log_dir = get_subfolder(root, "logs") - report_to_phrases_processor = get_report_to_phrases_processor(log_dir=log_dir) + report_type_value = cfg.get("report_type") + try: + report_type = ReportType(report_type_value) + except ValueError as e: + raise ValueError( + f"Invalid report_type '{report_type_value}'. Valid options are: {[rt.value for rt in ReportType]}" + ) from e + + report_to_phrases_processor = get_report_to_phrases_processor(report_type=report_type, log_dir=log_dir) id_col = cfg.processing.index_col dataset_df = dataset_df[[id_col, FINDINGS_SECTION]] engine = LLMEngine( diff --git a/src/radfact/llm_utils/report_to_phrases/prompts/ct/few_shot_examples.json b/src/radfact/llm_utils/report_to_phrases/prompts/ct/few_shot_examples.json new file mode 100644 index 0000000..c46b7e6 --- /dev/null +++ b/src/radfact/llm_utils/report_to_phrases/prompts/ct/few_shot_examples.json @@ -0,0 +1,471 @@ +[ + { + "example_id": "few_shot1", + "findings_text": "The endotracheal tube is appropriately positioned 4.8 cm from the carina. 
An enteric tube is present and its tip is in the stomach. A right internal jugular central venous catheter is present and its tip is in the mid SVC. Bilateral parenchymal opacities are present, mainly at the bases, consistent with an infectious process. Interstitial opacities have worsened, indicating increased edema. A small left pleural effusion is stable. The right pleural effusion has slightly increased in size. No pneumothorax is present. The cardiomediastinal structures are normal.", + "parsed_report": { + "id": "few_shot1", + "sentence_list": [ + { + "orig": "The endotracheal tube is appropriately positioned 4.8 cm from the carina.", + "new": [ + "The endotracheal tube is appropriately positioned 4.8 cm from the carina." + ] + }, + { + "orig": "An enteric tube is present and its tip is in the stomach.", + "new": [ + "An enteric tube is present and its tip is in the stomach." + ] + }, + { + "orig": "A right internal jugular central venous catheter is present and its tip is in the mid SVC.", + "new": [ + "A right internal jugular central venous catheter is present and its tip is in the mid SVC." + ] + }, + { + "orig": "Bilateral parenchymal opacities are present, mainly at the bases, consistent with an infectious process.", + "new": [ + "Bilateral parenchymal opacities are present, mainly at the bases." + ] + }, + { + "orig": "Interstitial opacities have worsened, indicating increased edema.", + "new": [ + "Interstitial opacities have worsened, indicating increased edema." + ] + }, + { + "orig": "A small left pleural effusion is stable.", + "new": [ + "A small left pleural effusion is stable." + ] + }, + { + "orig": "The right pleural effusion has slightly increased in size.", + "new": [ + "The right pleural effusion has slightly increased in size." + ] + }, + { + "orig": "No pneumothorax is present.", + "new": [ + "No pneumothorax is present." 
+ ] + }, + { + "orig": "The cardiomediastinal structures are normal.", + "new": [ + "The cardiomediastinal structures are normal." + ] + } + ] + }, + "study_id": "study_id_1", + "example_rationale": "Positive finding." + }, + { + "example_id": "few_shot2", + "findings_text": "The perihilar opacities have increased since the last examination, along with pulmonary vascular congestion and widespread opacity. There is moderate cardiomegaly and a new small left-sided pleural effusion. No pneumothorax is observed.", + "parsed_report": { + "id": "few_shot2", + "sentence_list": [ + { + "orig": "The perihilar opacities have increased since the last examination, along with pulmonary vascular congestion and widespread opacity.", + "new": [ + "The perihilar opacities have increased since the last examination.", + "Pulmonary vascular congestion has increased since the last examination.", + "Widespread opacity has increased since the last examination." + ] + }, + { + "orig": "There is moderate cardiomegaly and a new small left-sided pleural effusion.", + "new": [ + "There is moderate cardiomegaly.", + "New small left-sided pleural effusion." + ] + }, + { + "orig": "No pneumothorax is observed.", + "new": [ + "No pneumothorax is observed." + ] + } + ] + }, + "study_id": "study_id_2", + "example_rationale": "Multiple positive findings" + }, + { + "example_id": "few_shot3", + "findings_text": "Chest CT scan was performed with axial slices, and images were obtained in multiple planes. Diffuse peribronchovascular opacities are observed throughout the lungs, which may indicate diffuse bronchopneumonia. No lobar consolidation, effusion, or pneumothorax is detected. The cardiomediastinal structures appear normal, and the bony structures are intact. 
No free air is observed below the right hemidiaphragm.", + "parsed_report": { + "id": "few_shot3", + "sentence_list": [ + { + "orig": "Chest CT scan was performed with axial slices, and images were obtained in multiple planes.", + "new": [ + "" + ] + }, + { + "orig": "Diffuse peribronchovascular opacities are observed throughout the lungs, which may indicate diffuse bronchopneumonia.", + "new": [ + "Diffuse peribronchovascular opacities are observed throughout the lungs." + ] + }, + { + "orig": "No lobar consolidation, effusion, or pneumothorax is detected.", + "new": [ + "No lobar consolidation is detected.", + "No effusion is detected.", + "No pneumothorax is detected." + ] + }, + { + "orig": "The cardiomediastinal structures appear normal, and the bony structures are intact.", + "new": [ + "The cardiomediastinal structures appear normal.", + "The bony structures are intact." + ] + }, + { + "orig": "No free air is observed below the right hemidiaphragm.", + "new": [ + "No free air is observed below the right hemidiaphragm." + ] + } + ] + }, + "study_id": "study_id_3", + "example_rationale": "Negative finding without location." + }, + { + "example_id": "few_shot4", + "findings_text": "The left PICC line terminates in the upper SVC and the tracheostomy remains in its original position. The chronic left atelectasis persists without any change. Top normal heart size is stable. No evidence of pneumothorax or right pleural effusion is observed.", + "parsed_report": { + "id": "few_shot4", + "sentence_list": [ + { + "orig": "The left PICC line terminates in the upper SVC and the tracheostomy remains in its original position.", + "new": [ + "The left PICC line terminates in the upper SVC.", + "Tracheostomy remains in its original position." + ] + }, + { + "orig": "The chronic left atelectasis persists without any change.", + "new": [ + "The chronic left atelectasis persists without any change."
+ ] + }, + { + "orig": "Top normal heart size is stable.", + "new": [ + "Top normal heart size is stable." + ] + }, + { + "orig": "No evidence of pneumothorax or right pleural effusion is observed.", + "new": [ + "No evidence of pneumothorax is observed.", + "No right pleural effusion is observed." + ] + } + ] + }, + "study_id": "study_id_4", + "example_rationale": "Negative finding with location." + }, + { + "example_id": "few_shot5", + "findings_text": "The lungs are still hyperinflated, consistent with a history of asthma. Compared to the previous study, there are now hazy areas in the lower parts of both lungs. These may be caused by thickening of the bronchial walls or small airway disease, but there is no evidence of a specific lobe being affected. No pleural effusion or pneumothorax is seen. The cardiac and mediastinal structures are unremarkable.", + "parsed_report": { + "id": "few_shot5", + "sentence_list": [ + { + "orig": "The lungs are still hyperinflated, consistent with a history of asthma.", + "new": [ + "The lungs are still hyperinflated." + ] + }, + { + "orig": "Compared to the previous study, there are now hazy areas in the lower parts of both lungs. These may be caused by thickening of the bronchial walls or small airway disease, but there is no evidence of a specific lobe being affected.", + "new": [ + "Compared to the previous study, there are now hazy areas in the lower parts of both lungs." + ] + }, + { + "orig": "No pleural effusion or pneumothorax is seen.", + "new": [ + "No pleural effusion is seen.", + "No pneumothorax is seen." + ] + }, + { + "orig": "The cardiac and mediastinal structures are unremarkable.", + "new": [ + "The cardiac structures are unremarkable.", + "The mediastinal structures are unremarkable." + ] + } + ] + }, + "study_id": "study_id_5", + "example_rationale": "Clinical interpretation." 
+ }, + { + "example_id": "few_shot6", + "findings_text": "The CT scan reveals the presence of a PAC (pulmonary artery catheter) overlying the upper left lateral hemithorax. There is a relative opacity observed in the left mid-to-lower lung, possibly located in the lingula. This could be attributed to the patient's positioning, an underlying infection causing consolidation, or a contusion. No pleural effusion is seen. There is no evidence of pneumothorax. The cardiac and mediastinal structures are unremarkable. Thoracic scoliosis is present.", + "parsed_report": { + "id": "few_shot6", + "sentence_list": [ + { + "orig": "The CT scan reveals the presence of a PAC (pulmonary artery catheter) overlying the upper left lateral hemithorax.", + "new": [ + "The CT scan reveals the presence of a PAC (pulmonary artery catheter) overlying the upper left lateral hemithorax." + ] + }, + { + "orig": "There is a relative opacity observed in the left mid-to-lower lung, possibly located in the lingula. This could be attributed to the patient's positioning, an underlying infection causing consolidation, or a contusion.", + "new": [ + "There is a relative opacity observed in the left mid-to-lower lung, possibly located in the lingula." + ] + }, + { + "orig": "No pleural effusion is seen.", + "new": [ + "No pleural effusion is seen." + ] + }, + { + "orig": "There is no evidence of pneumothorax.", + "new": [ + "There is no evidence of pneumothorax." + ] + }, + { + "orig": "The cardiac and mediastinal structures are unremarkable.", + "new": [ + "The cardiac structures are unremarkable.", + "The mediastinal structures are unremarkable." + ] + }, + { + "orig": "Thoracic scoliosis is present.", + "new": [ + "Thoracic scoliosis is present." + ] + } + ] + }, + "study_id": "study_id_6", + "example_rationale": "Comment on technical quality." 
+ }, + { + "example_id": "few_shot7", + "findings_text": "The CT scan shows linear opacities in the right middle lobe and left lower lobe, indicating atelectasis. Mediastinum is unchanged. The patient has undergone a right mastectomy. No pneumothorax is observed. However, there is a new blunting of the left costophrenic angle, which could be due to effusion or pleural thickening. These findings warrant correlation with the patient's clinical history of malignancy.", + "parsed_report": { + "id": "few_shot7", + "sentence_list": [ + { + "orig": "The CT scan shows linear opacities in the right middle lobe and left lower lobe, indicating atelectasis.", + "new": [ + "The CT scan shows linear opacities in the right middle lobe and left lower lobe, indicating atelectasis." + ] + }, + { + "orig": "Mediastinum is unchanged.", + "new": [ + "Mediastinum is unchanged." + ] + }, + { + "orig": "The patient has undergone a right mastectomy.", + "new": [ + "The patient has undergone a right mastectomy." + ] + }, + { + "orig": "No pneumothorax is observed.", + "new": [ + "No pneumothorax is observed." + ] + }, + { + "orig": "However, there is a new blunting of the left costophrenic angle, which could be due to effusion or pleural thickening. These findings warrant correlation with the patient's clinical history of malignancy.", + "new": [ + "There is new blunting of the left costophrenic angle." + ] + } + ] + }, + "study_id": "study_id_7", + "example_rationale": "Recommendation." + }, + { + "example_id": "few_shot8", + "findings_text": "Cardiomediastinal structures appear unchanged. The lungs are clear without any signs of pneumonia. However, there is some overlap of structures in the left juxtamediastinal area.
Findings should be correlated with clinical suspicion of infection or further imaging as necessary.", + "parsed_report": { + "id": "few_shot8", + "sentence_list": [ + { + "orig": "Cardiomediastinal structures appear unchanged.", + "new": [ + "Cardiomediastinal structures appear unchanged." + ] + }, + { + "orig": "The lungs are clear without any signs of pneumonia. However, there is some overlap of structures in the left juxtamediastinal area.", + "new": [ + "The lungs are clear.", + "Overlap of structures in the left juxtamediastinal area." + ] + }, + { + "orig": "Findings should be correlated with clinical suspicion of infection or further imaging as necessary.", + "new": [ + "" + ] + } + ] + }, + "study_id": "study_id_8", + "example_rationale": "Other irrelevant detail." + }, + { + "example_id": "few_shot9", + "findings_text": "The current scan was compared to the previous one from _. The left side still shows mediastinal shifting and volume loss. This remains unchanged. The increased density at the left base indicates the collapse of the postoperative left lung, as previously suggested. The right lung is well aerated, with no signs of pulmonary edema, focal consolidation, or pleural effusions. No pneumothoraces are observed. A central venous line is present on the right side, with the distal lead tip located in the proximal SVC.", + "parsed_report": { + "id": "few_shot9", + "sentence_list": [ + { + "orig": "The current scan was compared to the previous one from _.", + "new": [ + "" + ] + }, + { + "orig": "The left side still shows mediastinal shifting and volume loss. This remains unchanged.", + "new": [ + "The left side still shows mediastinal shifting and volume loss." + ] + }, + { + "orig": "The increased density at the left base indicates the collapse of the postoperative left lung, as previously suggested.", + "new": [ + "The increased density at the left base indicates the collapse of the postoperative left lung, as previously suggested." 
+ ] + }, + { + "orig": "The right lung is well aerated, with no signs of pulmonary edema, focal consolidation, or pleural effusions.", + "new": [ + "The right lung is well aerated.", + "No signs of pulmonary edema.", + "No signs of focal consolidation.", + "No signs of pleural effusions." + ] + }, + { + "orig": "No pneumothoraces are observed.", + "new": [ + "No pneumothoraces are observed." + ] + }, + { + "orig": "A central venous line is present on the right side, with the distal lead tip located in the proximal SVC.", + "new": [ + "A central venous line is present on the right side, with the distal lead tip located in the proximal SVC." + ] + } + ] + }, + "study_id": "study_id_9", + "example_rationale": "Combine sentences." + }, + { + "example_id": "few_shot10", + "findings_text": "The liver, spleen, pancreas, and kidneys appear normal in size and morphology. No focal hepatic or splenic lesions are identified. There is mild wall thickening of the sigmoid colon, which may represent colitis or nonspecific changes. There is no free fluid or free air in the abdomen. Findings should be correlated with clinical symptoms, and follow-up imaging may be considered if clinically indicated.", + "parsed_report": { + "id": "few_shot10", + "sentence_list": [ + { + "orig": "The liver, spleen, pancreas, and kidneys appear normal in size and morphology.", + "new": [ + "The liver appears normal in size and morphology.", + "The spleen appears normal in size and morphology.", + "The pancreas appears normal in size and morphology.", + "The kidneys appear normal in size and morphology." + ] + }, + { + "orig": "No focal hepatic or splenic lesions are identified.", + "new": [ + "No focal hepatic lesions are identified.", + "No focal splenic lesions are identified." + ] + }, + { + "orig": "There is mild wall thickening of the sigmoid colon, which may represent colitis or nonspecific changes.", + "new": [ + "Mild wall thickening of the sigmoid colon is present." 
+ ] + }, + { + "orig": "There is no free fluid or free air in the abdomen.", + "new": [ + "No free fluid is present in the abdomen.", + "No free air is present in the abdomen." + ] + }, + { + "orig": "Findings should be correlated with clinical symptoms, and follow-up imaging may be considered if clinically indicated.", + "new": [ + "" + ] + } + ] + }, + "study_id": "study_id_10", + "example_rationale": "Other irrelevant detail." + }, + { + "example_id": "few_shot11", + "findings_text": "The CT scan shows the liver to be normal in size with no focal lesions. The spleen, pancreas, and kidneys appear unremarkable. There is evidence of prior cholecystectomy. The abdominal aorta is normal in caliber, and no lymphadenopathy is seen.", + "parsed_report": { + "id": "few_shot11", + "sentence_list": [ + { + "orig": "The CT scan shows the liver to be normal in size with no focal lesions.", + "new": [ + "The liver is normal in size.", + "No focal lesions are identified." + ] + }, + { + "orig": "The spleen, pancreas, and kidneys appear unremarkable.", + "new": [ + "The spleen appears unremarkable.", + "The pancreas appears unremarkable.", + "The kidneys appear unremarkable." + ] + }, + { + "orig": "There is evidence of prior cholecystectomy.", + "new": [ + "There is evidence of prior cholecystectomy." + ] + }, + { + "orig": "The abdominal aorta is normal in caliber, and no lymphadenopathy is seen.", + "new": [ + "The abdominal aorta is normal in caliber.", + "No lymphadenopathy is seen." + ] + } + ] + }, + "study_id": "study_id_11", + "example_rationale": "Negative finding without location." + } +] \ No newline at end of file diff --git a/src/radfact/llm_utils/report_to_phrases/prompts/ct/system_message.txt b/src/radfact/llm_utils/report_to_phrases/prompts/ct/system_message.txt new file mode 100644 index 0000000..533e9f9 --- /dev/null +++ b/src/radfact/llm_utils/report_to_phrases/prompts/ct/system_message.txt @@ -0,0 +1,13 @@ +You are an AI radiology assistant. 
You are helping process reports from CT (computed tomography) scans. + +Please extract phrases from the radiology report which refer to objects, findings, or anatomies visible in a CT scan, or the absence of such. + +Rules: +- If a sentence describes multiple findings, split them up into separate sentences. +- Exclude clinical speculation or interpretation (e.g. "... highly suggestive of pneumonia"). +- Exclude recommendations (e.g. "Recommend further imaging or follow-up"). +- Exclude comments on the technical quality of the CT scan (e.g. "motion artifacts noted in the scan"). +- Include mentions of change (e.g. "Pleural effusion has increased") because change is visible when we compare two scans. +- If consecutive sentences are closely linked such that one sentence can't be understood without the other one, process them together. + +The objective is to extract phrases which refer to things which can be located on a CT scan, or confirmed not to be present. \ No newline at end of file diff --git a/src/radfact/llm_utils/report_to_phrases/prompts/few_shot_examples.json b/src/radfact/llm_utils/report_to_phrases/prompts/cxr/few_shot_examples.json similarity index 100% rename from src/radfact/llm_utils/report_to_phrases/prompts/few_shot_examples.json rename to src/radfact/llm_utils/report_to_phrases/prompts/cxr/few_shot_examples.json diff --git a/src/radfact/llm_utils/report_to_phrases/prompts/system_message.txt b/src/radfact/llm_utils/report_to_phrases/prompts/cxr/system_message.txt similarity index 100% rename from src/radfact/llm_utils/report_to_phrases/prompts/system_message.txt rename to src/radfact/llm_utils/report_to_phrases/prompts/cxr/system_message.txt diff --git a/tests/metric/test_radfact.py b/tests/metric/test_radfact.py index 2d08a1f..b631d72 100644 --- a/tests/metric/test_radfact.py +++ b/tests/metric/test_radfact.py @@ -8,6 +8,16 @@ import mock import pandas as pd +from radfact.llm_utils.report_to_phrases.processor import get_report_to_phrases_engine 
+from radfact.metric.radfact import REPORT_TO_PHRASES_CONFIG, init_hydra_config +from radfact.llm_utils.nli.processor import get_ev_processor_singlephrase +from radfact.paths import OUTPUT_DIR +from radfact.llm_utils.prompt_tasks import ReportType +from radfact.llm_utils.report_to_phrases.processor import get_report_to_phrases_processor +from radfact.llm_utils.prompt_tasks import NLITaskOptions, ReportToPhrasesTaskOptions +from radfact.llm_utils.nli.schema import NLISampleSinglePhrase +from radfact.llm_utils.report_to_phrases.schema import load_examples_from_json as load_examples_from_json_phrasification + import pytest from numpy.testing import assert_equal from omegaconf import DictConfig @@ -23,6 +33,7 @@ NLIQuerySample, NLISample, NLIState, + load_examples_from_json as load_examples_from_json_nli, ) from radfact.llm_utils.report_to_phrases.processor import StudyIdType from radfact.llm_utils.report_to_phrases.schema import ParsedReport, SentenceWithRephrases @@ -433,3 +444,41 @@ def test_convert_narrative_text_to_phrases() -> None: with mock.patch('radfact.metric.radfact.get_report_to_phrases_engine', return_value=mock_phrase_engine): processed_texts = metric.convert_narrative_text_to_phrases(input_texts, metric_prefix) assert processed_texts == expected_texts + + +@pytest.mark.parametrize("report_type_value", ["cxr", "ct"]) +def test_report_type_phrasification(report_type_value: str) -> None: + report_type = ReportType(report_type_value) + processor = get_report_to_phrases_processor(report_type=report_type) + + task = ReportToPhrasesTaskOptions[report_type.name].value + + system_message = task.system_message_path.read_text() + assert processor.query_template.system_prompt.startswith(system_message) + few_shot_examples = load_examples_from_json_phrasification(task.few_shot_examples_path) + assert few_shot_examples == processor.query_template.examples # type: ignore[comparison-overlap] + + +@pytest.mark.parametrize("report_type_value", ["cxr", "ct"]) +def 
test_report_type_nli(report_type_value: str) -> None: + report_type = ReportType(report_type_value) + processor = get_ev_processor_singlephrase(report_type=report_type, log_dir=OUTPUT_DIR / "ev_processor_logs_test") + + task = NLITaskOptions[report_type.name].value + + system_message = task.system_message_path.read_text() + assert processor.query_template.system_prompt == system_message + few_shot_examples = load_examples_from_json_nli(task.few_shot_examples_path, binary=True) + few_shot_examples_single_phrase: list[NLISampleSinglePhrase] = [] + for few_shot_example in few_shot_examples: + one_way_dict = NLISampleSinglePhrase.from_nli_sample(few_shot_example) + for single_phrase_sample in one_way_dict.values(): + few_shot_examples_single_phrase.extend(single_phrase_sample) + assert few_shot_examples_single_phrase == processor.query_template.examples + + +def test_invalid_report_type() -> None: + config = init_hydra_config(REPORT_TO_PHRASES_CONFIG) + config.report_type = "invalid_type" + with pytest.raises(ValueError): + get_report_to_phrases_engine(cfg=config, dataset_df=pd.DataFrame({}, columns=["study_id", "FINDINGS"])) From 2abbf32ebc9cf0e33f45555aa5c06b55e8159cb3 Mon Sep 17 00:00:00 2001 From: Cynthia Lo Date: Mon, 12 Jan 2026 04:01:26 -0800 Subject: [PATCH 2/6] Update README --- README.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 05c248c..79da37b 100644 --- a/README.md +++ b/README.md @@ -173,15 +173,16 @@ options: Refer to the example input files in the [`examples`](examples) directory for the expected format of the input files. The input files should be in the format of a CSV file for non-grounded reports [findings_generation_examples.csv](examples/findings_generation_examples.csv) and a JSON file for grounded reports [grounded_reporting_examples.json](examples/grounded_reporting_examples.json). +The script computes confidence intervals for the metrics using bootstrapping. 
The number of bootstrap samples can be controlled using the `--bootstrap_samples` argument. The default value is 500. To disable bootstrapping, set `--bootstrap_samples 0`. + +⚠️**WARNING**: Some queries may fail due to the endpoint limitations (timeouts, rate limits, etc.). When the LLM performing entailment verification fails, we **set these examples as not-entailed by default**. If this occurs in a significant number of cases, the results will not be reliable. The final metrics dict contains the number of such skipped queries under the key `num_llm_failures`. The script will print the number of skipped queries at the end of the run, and store these in the `skipped` directroy under the run id folder. You will also see a warning message in the logs for each failed query. `WARNING: No response for example {query_id}. Setting as NOT ENTAILED`. + +### Supporting Multiple Report Types RadFact supports different report types through the `report_type` field in [`configs/default.yaml`](configs/default.yaml). Currently supported options are: - `cxr` - Chest X-ray reports (default) - `ct` - CT scan reports -The script computes confidence intervals for the metrics using bootstrapping. The number of bootstrap samples can be controlled using the `--bootstrap_samples` argument. The default value is 500. To disable bootstrapping, set `--bootstrap_samples 0`. - -⚠️**WARNING**: Some queries may fail due to the endpoint limitations (timeouts, rate limits, etc.). When the LLM performing entailment verification fails, we **set these examples as not-entailed by default**. If this occurs in a significant number of cases, the results will not be reliable. The final metrics dict contains the number of such skipped queries under the key `num_llm_failures`. The script will print the number of skipped queries at the end of the run, and store these in the `skipped` directroy under the run id folder. You will also see a warning message in the logs for each failed query.
`WARNING: No response for example {query_id}. Setting as NOT ENTAILED`. - ### Split reports into phrases We also provide a script to convert reports to phrases. This is useful when you have a narrative report and want to convert it to a list of phrases for RadFact evaluation. You can run this step offline and then use the output file as input to RadFact. Make sure you've set up the endpoints as described above before running the script. The `run_report_to_phrases` command runs `python src/radfact/cli/run_report_to_phrases.py` script under the hood. From 068187fd3b788260fc13eb1f7e310647a3b989f7 Mon Sep 17 00:00:00 2001 From: Cynthia Lo Date: Mon, 12 Jan 2026 04:15:39 -0800 Subject: [PATCH 3/6] Add comment to config --- configs/default.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/default.yaml b/configs/default.yaml index 9878825..abffd5e 100644 --- a/configs/default.yaml +++ b/configs/default.yaml @@ -21,7 +21,7 @@ processing: end_index: null output_filename: "outputs.json" -report_type: "cxr" +report_type: "cxr" # cxr or ct # The type of cache that should be set for langchain. This can be either "redis" or "sqlite". # Sqlite cache is useful for local development, it will be written to ~/.langchain.db From 308e18317a09e6952f82495c851bbca4afc47cc6 Mon Sep 17 00:00:00 2001 From: Cynthia Lo Date: Tue, 13 Jan 2026 07:49:35 -0800 Subject: [PATCH 4/6] Add report type cli support --- README.md | 4 +++- configs/default.yaml | 2 -- src/radfact/cli/run_radfact.py | 12 ++++++++++++ src/radfact/llm_utils/nli/processor.py | 13 +++++-------- .../llm_utils/report_to_phrases/processor.py | 15 +++++---------- src/radfact/metric/radfact.py | 10 ++++++++-- tests/metric/test_radfact.py | 11 +---------- 7 files changed, 34 insertions(+), 33 deletions(-) diff --git a/README.md b/README.md index 79da37b..63c76b8 100644 --- a/README.md +++ b/README.md @@ -157,6 +157,8 @@ options: Path to the directory where the results will be saved as a json file.
--bootstrap_samples BOOTSTRAP_SAMPLES Number of bootstrap samples to use for computing the confidence intervals. Set to 0 to disable bootstrapping. + --report_type {cxr,ct} + Type of report: 'cxr' for chest x-ray reports or 'ct' for CT reports. ``` - for non-grounded reports (findings generation narrative text): @@ -178,7 +180,7 @@ The script computes confidence intervals for the metrics using bootstrapping. Th ⚠️**WARNING**: Some queries may fail due to the endpoint limitations (timeouts, rate limits, etc.). When the LLM performing entailment verification fails, we **set these examples as not-entailed by default**. If this occurs in a significant number of cases, the results will not be reliable. The final metrics dict contains the number of such skipped queries under the key `num_llm_failures`. The script will print the number of skipped queries at the end of the run, and store these in the `skipped` directroy under the run id folder. You will also see a warning message in the logs for each failed query. `WARNING: No response for example {query_id}. Setting as NOT ENTAILED`. ### Supporting Multiple Report Types -RadFact supports different report types through the `report_type` field in [`configs/default.yaml`](configs/default.yaml). Currently supported options are: +RadFact supports different report types through the `report_type` field in the `RadFactMetric` class. Currently supported options are: - `cxr` - Chest X-ray reports (default) - `ct` - CT scan reports diff --git a/configs/default.yaml b/configs/default.yaml index abffd5e..28c4869 100644 --- a/configs/default.yaml +++ b/configs/default.yaml @@ -21,8 +21,6 @@ processing: end_index: null output_filename: "outputs.json" -report_type: "cxr" # cxr or ct - # The type of cache that should be set for langchain. This can be either "redis" or "sqlite".
# Sqlite cache is useful for local development, it will be written to ~/.langchain.db # Redis cache is useful to share state across many evaluation runs in AzureML diff --git a/src/radfact/cli/run_radfact.py b/src/radfact/cli/run_radfact.py index a08a23e..f2140e9 100644 --- a/src/radfact/cli/run_radfact.py +++ b/src/radfact/cli/run_radfact.py @@ -10,6 +10,7 @@ import pandas as pd +from radfact.llm_utils.prompt_tasks import ReportType from radfact.data_utils.grounded_phrase_list import GroundedPhraseList from radfact.llm_utils.report_to_phrases.processor import StudyIdType from radfact.metric.bootstrapping import MetricBootstrapper @@ -66,12 +67,14 @@ def compute_radfact_scores( candidates: InputDict, references: InputDict, is_narrative_text: bool, + report_type: ReportType, bootstrap_samples: int, ) -> dict[str, float]: radfact_metric = RadFactMetric( nli_config_name=radfact_config_name, phrase_config_name=phrases_config_name, is_narrative_text=is_narrative_text, + report_type=report_type, ) if bootstrap_samples == 0: _, results = radfact_metric.compute_metric_score(candidates, references) @@ -131,6 +134,13 @@ def main() -> None: "bootstrapping.", default=500, ) + parser.add_argument( + "--report_type", + type=str, + choices=["cxr", "ct"], + help="Type of report: 'cxr' for chest x-ray reports or 'ct' for CT reports.", + default="cxr", + ) args = parser.parse_args() input_path = Path(args.input_path) @@ -139,6 +149,7 @@ def main() -> None: radfact_config_name = args.radfact_config_name phrases_config_name = args.phrases_config_name bootstrap_samples = args.bootstrap_samples + report_type = ReportType(args.report_type) assert input_path.suffix in [".csv", ".json"], "Input file must be a csv or json file." 
assert input_path.suffix == ".csv" or not is_narrative_text, ( @@ -163,6 +174,7 @@ def main() -> None: references=references, is_narrative_text=is_narrative_text, bootstrap_samples=bootstrap_samples, + report_type=report_type, ) print_fn = print_results if bootstrap_samples == 0 else print_bootstrap_results diff --git a/src/radfact/llm_utils/nli/processor.py b/src/radfact/llm_utils/nli/processor.py index 6228f7e..59c5a0f 100644 --- a/src/radfact/llm_utils/nli/processor.py +++ b/src/radfact/llm_utils/nli/processor.py @@ -188,16 +188,13 @@ def format_row_to_nli_query_sample(row: "pd.Series[Any]") -> NLIQuerySample: def get_report_nli_engine( - cfg: DictConfig, candidates: dict[str, GroundedPhraseList], references: dict[str, GroundedPhraseList] + cfg: DictConfig, + candidates: dict[str, GroundedPhraseList], + references: dict[str, GroundedPhraseList], + report_type: ReportType = ReportType.CXR, ) -> LLMEngine: output_folder = get_subfolder(root=OUTPUT_DIR, subfolder=RADFACT_SUBFOLDER) - report_type_value = cfg.get("report_type") - try: - report_type = ReportType(report_type_value) - except ValueError as e: - raise ValueError( - f"Invalid report_type '{report_type_value}'. 
Valid options are: {[rt.value for rt in ReportType]}" - ) from e + nli_report_processor = ReportGroundingNLIProcessor( report_type=report_type, format_query_fn=format_row_to_nli_query_sample ) diff --git a/src/radfact/llm_utils/report_to_phrases/processor.py b/src/radfact/llm_utils/report_to_phrases/processor.py index b88c5ab..97c0acb 100644 --- a/src/radfact/llm_utils/report_to_phrases/processor.py +++ b/src/radfact/llm_utils/report_to_phrases/processor.py @@ -49,13 +49,15 @@ def get_findings_from_row(row: "pd.Series[Any]") -> str: return findings -def get_report_to_phrases_engine(cfg: DictConfig, dataset_df: pd.DataFrame) -> LLMEngine: +def get_report_to_phrases_engine( + cfg: DictConfig, dataset_df: pd.DataFrame, report_type: ReportType = ReportType.CXR +) -> LLMEngine: """ Create the processing engine for converting reports to phrases. :param cfg: The configuration for the processing engine. :param dataset_df: The dataset DataFrame. - :param subfolder: The subfolder to save the processing output. + :param report_type: The type of report, e.g., CXR or CT. :return: The processing engine. """ subfolder = cfg.dataset.name @@ -63,14 +65,7 @@ def get_report_to_phrases_engine(cfg: DictConfig, dataset_df: pd.DataFrame) -> L output_folder = get_subfolder(root, subfolder) final_output_folder = get_subfolder(root, subfolder) log_dir = get_subfolder(root, "logs") - - report_type_value = cfg.get("report_type") - try: - report_type = ReportType(report_type_value) - except ValueError as e: - raise ValueError( - f"Invalid report_type '{report_type_value}'. 
Valid options are: {[rt.value for rt in ReportType]}" - ) from e + breakpoint() report_to_phrases_processor = get_report_to_phrases_processor(report_type=report_type, log_dir=log_dir) id_col = cfg.processing.index_col diff --git a/src/radfact/metric/radfact.py b/src/radfact/metric/radfact.py index 1da0b18..0dd93e5 100644 --- a/src/radfact/metric/radfact.py +++ b/src/radfact/metric/radfact.py @@ -7,6 +7,7 @@ from dataclasses import asdict, replace from typing import Any, Iterable, Mapping +from radfact.llm_utils.prompt_tasks import ReportType import hydra import numpy as np import pandas as pd @@ -71,6 +72,7 @@ def __init__( image_size: int = 224, box_precision_threshold: float = 0.5, is_narrative_text: bool = False, + report_type: ReportType = ReportType.CXR, ) -> None: """ Initializes the RadFactMetric with the necessary configurations. We need to know the image size so we can @@ -86,9 +88,11 @@ def __init__( findings section. We need to convert this to lists GroundedPhrase before conducting entailment verification. If False, we are running the metric on grounded reports, where the phrases are already in the correct format for entailment verification. + :param report_type: The type of report, e.g. 
CXR or CT """ self.llm_nli_cfg = init_hydra_config(nli_config_name or RADFACT_CONFIG) self.llm_phrase_cfg = init_hydra_config(phrase_config_name or REPORT_TO_PHRASES_CONFIG) + self.report_type = report_type self.image_size = image_size self.box_precision_threshold = box_precision_threshold self.is_narrative_text = is_narrative_text @@ -206,7 +210,7 @@ def convert_narrative_text_to_phrases( texts_as_str_df = pd.DataFrame( {id_col: study_id, FINDINGS_SECTION: texts_as_str[study_id]} for study_id in texts_as_str.keys() ) - engine = get_report_to_phrases_engine(self.llm_phrase_cfg, texts_as_str_df) + engine = get_report_to_phrases_engine(self.llm_phrase_cfg, texts_as_str_df, self.report_type) parsed_reports: list[ParsedReport] = engine.run() processed_texts = { parsed.id: parsed.to_grounded_phrases_list() for parsed in parsed_reports if parsed.id is not None @@ -304,7 +308,9 @@ def compute_results_per_sample(self, candidates: InputDict, references: InputDic candidates_str_ids = {str(study_id): sequence for study_id, sequence in candidates_mm.items()} references_str_ids = {str(study_id): sequence for study_id, sequence in references_mm.items()} - llm_ev_engine = get_report_nli_engine(self.llm_nli_cfg, candidates_str_ids, references_str_ids) + llm_ev_engine = get_report_nli_engine( + self.llm_nli_cfg, candidates_str_ids, references_str_ids, self.report_type + ) processed_samples: list[NLISample] = llm_ev_engine.run() if llm_ev_engine.aggregated_processor_stats: self.meta_metrics.update(llm_ev_engine.aggregated_processor_stats) diff --git a/tests/metric/test_radfact.py b/tests/metric/test_radfact.py index b631d72..776614f 100644 --- a/tests/metric/test_radfact.py +++ b/tests/metric/test_radfact.py @@ -8,8 +8,6 @@ import mock import pandas as pd -from radfact.llm_utils.report_to_phrases.processor import get_report_to_phrases_engine -from radfact.metric.radfact import REPORT_TO_PHRASES_CONFIG, init_hydra_config from radfact.llm_utils.nli.processor import 
get_ev_processor_singlephrase from radfact.paths import OUTPUT_DIR from radfact.llm_utils.prompt_tasks import ReportType @@ -255,7 +253,7 @@ def test_nli_processing_with_endpoint(mock_nli_engine: mock.Mock) -> None: } -def get_mock_phrase_engine(llm_phrase_cfg: DictConfig, df: pd.DataFrame) -> mock.Mock: +def get_mock_phrase_engine(llm_phrase_cfg: DictConfig, df: pd.DataFrame, report_type: ReportType) -> mock.Mock: mock_phrase_engine = mock.Mock() if df["FINDINGS"].values[0] == "The cat The dog The bird The rabbit": mock_phrase_engine.run.return_value = [ @@ -475,10 +473,3 @@ def test_report_type_nli(report_type_value: str) -> None: for single_phrase_sample in one_way_dict.values(): few_shot_examples_single_phrase.extend(single_phrase_sample) assert few_shot_examples_single_phrase == processor.query_template.examples - - -def test_invalid_report_type() -> None: - config = init_hydra_config(REPORT_TO_PHRASES_CONFIG) - config.report_type = "invalid_type" - with pytest.raises(ValueError): - get_report_to_phrases_engine(cfg=config, dataset_df=pd.DataFrame({}, columns=["study_id", "FINDINGS"])) From 9a43812d4ebe43c999301a5ebab2840542e4931a Mon Sep 17 00:00:00 2001 From: Cynthia Lo Date: Tue, 13 Jan 2026 07:57:50 -0800 Subject: [PATCH 5/6] Remove breakpoint --- src/radfact/llm_utils/report_to_phrases/processor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/radfact/llm_utils/report_to_phrases/processor.py b/src/radfact/llm_utils/report_to_phrases/processor.py index 97c0acb..39d623f 100644 --- a/src/radfact/llm_utils/report_to_phrases/processor.py +++ b/src/radfact/llm_utils/report_to_phrases/processor.py @@ -65,7 +65,6 @@ def get_report_to_phrases_engine( output_folder = get_subfolder(root, subfolder) final_output_folder = get_subfolder(root, subfolder) log_dir = get_subfolder(root, "logs") - breakpoint() report_to_phrases_processor = get_report_to_phrases_processor(report_type=report_type, log_dir=log_dir) id_col = cfg.processing.index_col From 
4c811a967026f6fa4e6577e72e16f1136d983af7 Mon Sep 17 00:00:00 2001 From: Cynthia Lo Date: Thu, 15 Jan 2026 06:35:58 -0800 Subject: [PATCH 6/6] Upgrade ubuntu in ci checks --- .github/workflows/pr-checks.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pr-checks.yaml b/.github/workflows/pr-checks.yaml index ba7a8dc..a784c56 100644 --- a/.github/workflows/pr-checks.yaml +++ b/.github/workflows/pr-checks.yaml @@ -21,7 +21,7 @@ permissions: jobs: run_code_quality: - runs-on: ubuntu-20.04 + runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 @@ -50,7 +50,7 @@ jobs: shell: bash -el {0} run_pytest: - runs-on: ubuntu-20.04 + runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 with: