From ffa55609ded450b288d7929158b6e059ad36c947 Mon Sep 17 00:00:00 2001 From: Cynthia Lo Date: Mon, 12 Jan 2026 03:53:19 -0800 Subject: [PATCH 01/14] Add support for the ct report type --- README.md | 7 +- configs/default.yaml | 2 + dev_environment.yaml | 1 - pyproject.toml | 1 - src/radfact/llm_utils/nli/processor.py | 36 +- .../nli/prompts/ct/few_shot_examples.json | 627 ++++++++++++++++++ .../ct/system_message_ev_singlephrase.txt | 1 + .../prompts/{ => cxr}/few_shot_examples.json | 0 .../system_message_ev_singlephrase.txt | 0 src/radfact/llm_utils/prompt_tasks.py | 47 ++ .../llm_utils/report_to_phrases/processor.py | 29 +- .../prompts/ct/few_shot_examples.json | 471 +++++++++++++ .../prompts/ct/system_message.txt | 13 + .../prompts/{ => cxr}/few_shot_examples.json | 0 .../prompts/{ => cxr}/system_message.txt | 0 tests/metric/test_radfact.py | 49 ++ 16 files changed, 1258 insertions(+), 26 deletions(-) create mode 100644 src/radfact/llm_utils/nli/prompts/ct/few_shot_examples.json create mode 100644 src/radfact/llm_utils/nli/prompts/ct/system_message_ev_singlephrase.txt rename src/radfact/llm_utils/nli/prompts/{ => cxr}/few_shot_examples.json (100%) rename src/radfact/llm_utils/nli/prompts/{ => cxr}/system_message_ev_singlephrase.txt (100%) create mode 100644 src/radfact/llm_utils/prompt_tasks.py create mode 100644 src/radfact/llm_utils/report_to_phrases/prompts/ct/few_shot_examples.json create mode 100644 src/radfact/llm_utils/report_to_phrases/prompts/ct/system_message.txt rename src/radfact/llm_utils/report_to_phrases/prompts/{ => cxr}/few_shot_examples.json (100%) rename src/radfact/llm_utils/report_to_phrases/prompts/{ => cxr}/system_message.txt (100%) diff --git a/README.md b/README.md index 3bb06da..05c248c 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ RadFact is a framework for the evaluation of model-generated radiology reports given a ground-truth report, **with or without grounding**. 
Leveraging the logical inference capabilities of large language models, RadFact is not a single number but a _suite_ of metrics, capturing aspects of precision and recall at text-only and text-and-grounding levels. -RadFact was introduced in [MAIRA-2: Grounded Radiology Report Generation](https://aka.ms/maira-2). Here we provide an open-source implementation of the metric to facilitate its use and development. +RadFact was introduced in [MAIRA-2: Grounded Radiology Report Generation](https://aka.ms/maira-2). Here we provide an open-source implementation of the metric to facilitate its use and development. The RadFact metric currently supports both `cxr` and `ct` report types. ## Table of Contents @@ -173,6 +173,11 @@ options: Refer to the example input files in the [`examples`](examples) directory for the expected format of the input files. The input files should be in the format of a CSV file for non-grounded reports [findings_generation_examples.csv](examples/findings_generation_examples.csv) and a JSON file for grounded reports [grounded_reporting_examples.json](examples/grounded_reporting_examples.json). +RadFact supports different report types through the `report_type` field in [`configs/default.yaml`](configs/default.yaml). Currently supported options are: + +- `cxr` - Chest X-ray reports (default) +- `ct` - CT scan reports + The script computes confidence intervals for the metrics using bootstrapping. The number of bootstrap samples can be controlled using the `--bootstrap_samples` argument. The default value is 500. To disable bootstrapping, set `--bootstrap_samples 0`. ⚠️**WARNING**: Some queries may fail due to the endpoint limitations (timeouts, rate limits, etc.). When the LLM performing entailment verification fails, we **set these examples as not-entailed by default**. If this occurs in a significant number of cases, the results will not be reliable. The final metrics dict contains the number of such skipped queries under the key `num_llm_failures`. 
The script will print the number of skipped queries at the end of the run, and store these in the `skipped` directroy under the run id folder. You will also see a warning message in the logs for each failed query. `WARNING: No response for example {query_id}. Setting as NOT ENTAILED`. diff --git a/configs/default.yaml b/configs/default.yaml index 28c4869..9878825 100644 --- a/configs/default.yaml +++ b/configs/default.yaml @@ -21,6 +21,8 @@ processing: end_index: null output_filename: "outputs.json" +report_type: "cxr" + # The type of cache that should be set for langchain. This can be either "redis" or "sqlite". # Sqlite cache is useful for local development, it will be written to ~/.langchain.db # Redis cache is useful to share state across many evaluation runs in AzureML diff --git a/dev_environment.yaml b/dev_environment.yaml index 7b39764..94ec7d8 100644 --- a/dev_environment.yaml +++ b/dev_environment.yaml @@ -311,7 +311,6 @@ dependencies: - pyparsing==3.2.0 - pysocks==1.7.1 - pytest==8.3.3 - - pytest-lazy-fixture==0.6.3 - python-dateutil==2.9.0.post0 - pytz==2024.2 - pyyaml==6.0.2 diff --git a/pyproject.toml b/pyproject.toml index f6a4eaa..5d4ee0e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -78,7 +78,6 @@ test = [ "mock", "pandas-stubs", "pytest", - "pytest-lazy-fixture", ] [project.urls] diff --git a/src/radfact/llm_utils/nli/processor.py b/src/radfact/llm_utils/nli/processor.py index add77f0..6228f7e 100644 --- a/src/radfact/llm_utils/nli/processor.py +++ b/src/radfact/llm_utils/nli/processor.py @@ -10,6 +10,7 @@ from typing import Any, Callable import pandas as pd +from radfact.llm_utils.prompt_tasks import NLITaskOptions, ReportType from langchain_core.language_models import BaseLanguageModel from langchain_core.messages import BaseMessage from omegaconf import DictConfig @@ -35,11 +36,9 @@ StructuredProcessor, simple_formatter, ) -from radfact.paths import OUTPUT_DIR, get_prompts_dir +from radfact.paths import OUTPUT_DIR logger = 
logging.getLogger(__name__) -PARSING_TASK = "nli" -PROMPTS_DIR = get_prompts_dir(task=PARSING_TASK) RADFACT_SUBFOLDER = "radfact" @@ -49,22 +48,22 @@ class MetricDataframeKeys(str, Enum): STUDY_ID = "study_id" -def get_ev_processor_singlephrase(log_dir: Path) -> StructuredProcessor[ComparisonQuerySinglePhrase, EvidencedPhrase]: +def get_ev_processor_singlephrase( + report_type: ReportType, log_dir: Path +) -> StructuredProcessor[ComparisonQuerySinglePhrase, EvidencedPhrase]: """ Helper function to load the NLI processor with the correct system prompt and few-shot examples. The setting here is to classify a SINGLE PHRASE at a time given the reference report. Further, we do entailment verification, aka the binary version of NLI. - :param api_arguments: API arguments for the LLM. + :param report_type: The type of report, e.g., ReportType.CXR or ReportType.CT. :param log_dir: Directory to save logs. :return: Processor for entailment verification. """ - - system_prompt_path = PROMPTS_DIR / "system_message_ev_singlephrase.txt" - few_shot_examples_path = PROMPTS_DIR / "few_shot_examples.json" - system_prompt = system_prompt_path.read_text() - few_shot_examples = load_examples_from_json(json_path=few_shot_examples_path, binary=True) + task = NLITaskOptions[report_type.name].value + system_prompt = task.system_message_path.read_text() + few_shot_examples = load_examples_from_json(json_path=task.few_shot_examples_path, binary=True) # The few-shots are in the bidirectional format, we need to convert them to single-phrase. 
few_shot_examples_single_phrase: list[NLISampleSinglePhrase] = [] for few_shot_example in few_shot_examples: @@ -94,10 +93,12 @@ class ReportGroundingNLIProcessor(BaseProcessor[NLIQuerySample, NLISample]): NUM_LLM_SUCCESS = "num_llm_success" NUM_LLM_PHRASE_REWRITES = "num_llm_phrase_rewrites" - def __init__(self, format_query_fn: Callable[..., Any] | None = None) -> None: + def __init__(self, report_type: ReportType, format_query_fn: Callable[..., Any] | None = None) -> None: super().__init__() self.format_query_fn = format_query_fn - self.phrase_processor = get_ev_processor_singlephrase(log_dir=OUTPUT_DIR / "ev_processor_logs") + self.phrase_processor = get_ev_processor_singlephrase( + report_type=report_type, log_dir=OUTPUT_DIR / "ev_processor_logs" + ) # Logging errors self.num_llm_failures = 0 self.num_llm_success = 0 @@ -190,7 +191,16 @@ def get_report_nli_engine( cfg: DictConfig, candidates: dict[str, GroundedPhraseList], references: dict[str, GroundedPhraseList] ) -> LLMEngine: output_folder = get_subfolder(root=OUTPUT_DIR, subfolder=RADFACT_SUBFOLDER) - nli_report_processor = ReportGroundingNLIProcessor(format_query_fn=format_row_to_nli_query_sample) + report_type_value = cfg.get("report_type") + try: + report_type = ReportType(report_type_value) + except ValueError as e: + raise ValueError( + f"Invalid report_type '{report_type_value}'. 
Valid options are: {[rt.value for rt in ReportType]}" + ) from e + nli_report_processor = ReportGroundingNLIProcessor( + report_type=report_type, format_query_fn=format_row_to_nli_query_sample + ) dataset_df = pd.DataFrame( { MetricDataframeKeys.STUDY_ID: study_id, diff --git a/src/radfact/llm_utils/nli/prompts/ct/few_shot_examples.json b/src/radfact/llm_utils/nli/prompts/ct/few_shot_examples.json new file mode 100644 index 0000000..bcd0b3c --- /dev/null +++ b/src/radfact/llm_utils/nli/prompts/ct/few_shot_examples.json @@ -0,0 +1,627 @@ +[ + { + "example_id": "ct_example1_study1_study2", + "study_id": "nan", + "input": { + "phrases_A": [ + "There is a 2 cm renal cyst in the left kidney.", + "The liver shows signs of mild hepatic steatosis.", + "No evidence of free air in the abdominal cavity." + ], + "phrases_B": [ + "A 2 cm hypodense lesion is present in the left kidney, consistent with a cyst.", + "Diffuse hepatic steatosis is noted.", + "Free air is visible under the diaphragm." + ] + }, + "rationale": "rationale", + "output": { + "phrases_A_evidenced": [ + { + "evidence": [ + "A 2 cm hypodense lesion is present in the left kidney, consistent with a cyst." + ], + "phrase": "There is a 2 cm renal cyst in the left kidney.", + "status": "entailment" + }, + { + "evidence": [ + "Diffuse hepatic steatosis is noted." + ], + "phrase": "The liver shows signs of mild hepatic steatosis.", + "status": "entailment" + }, + { + "evidence": [ + "Free air is visible under the diaphragm." + ], + "phrase": "No evidence of free air in the abdominal cavity.", + "status": "contradiction" + } + ], + "phrases_B_evidenced": [ + { + "evidence": [ + "There is a 2 cm renal cyst in the left kidney." + ], + "phrase": "A 2 cm hypodense lesion is present in the left kidney, consistent with a cyst.", + "status": "entailment" + }, + { + "evidence": [ + "The liver shows signs of mild hepatic steatosis." 
+ ], + "phrase": "Diffuse hepatic steatosis is noted.", + "status": "entailment" + }, + { + "evidence": [ + "No evidence of free air in the abdominal cavity." + ], + "phrase": "Free air is visible under the diaphragm.", + "status": "contradiction" + } + ] + } +}, +{ + "example_id": "ct_example2_study3_study4", + "study_id": "nan", + "input": { + "phrases_A": [ + "There is a small renal cyst in the left kidney.", + "The liver appears normal in size and shape.", + "The lungs are clear with no signs of infiltrates." + ], + "phrases_B": [ + "A small cyst is noted in the left kidney.", + "The liver is unremarkable.", + "Heart size is within normal limits." + ] + }, + "rationale": "rationale", + "output": { + "phrases_A_evidenced": [ + { + "evidence": [ + "A small cyst is noted in the left kidney." + ], + "phrase": "There is a small renal cyst in the left kidney.", + "status": "entailment" + }, + { + "evidence": [ + "The liver is unremarkable." + ], + "phrase": "The liver appears normal in size and shape.", + "status": "entailment" + }, + { + "evidence": [], + "phrase": "The lungs are clear with no signs of infiltrates.", + "status": "neutral" + } + ], + "phrases_B_evidenced": [ + { + "evidence": [ + "There is a small renal cyst in the left kidney." + ], + "phrase": "A small cyst is noted in the left kidney.", + "status": "entailment" + }, + { + "evidence": [ + "The liver appears normal in size and shape." + ], + "phrase": "The liver is unremarkable.", + "status": "entailment" + }, + { + "evidence": [], + "phrase": "Heart size is within normal limits.", + "status": "neutral" + } + ] + } +}, +{ + "example_id": "ct_example3_study5_study6", + "study_id": "nan", + "input": { + "phrases_A": [ + "The pancreas shows evidence of atrophy.", + "Hydronephrosis noted in the right kidney.", + "No gallstones are present in the gallbladder." 
+ ], + "phrases_B": [ + "Atrophy of the pancreatic tissue is visible.", + "Moderate right hydronephrosis is identified.", + "Multiple gallstones seen in the gallbladder." + ] + }, + "rationale": "rationale", + "output": { + "phrases_A_evidenced": [ + { + "evidence": [ + "Atrophy of the pancreatic tissue is visible." + ], + "phrase": "The pancreas shows evidence of atrophy.", + "status": "entailment" + }, + { + "evidence": [ + "Moderate right hydronephrosis is identified." + ], + "phrase": "Hydronephrosis noted in the right kidney.", + "status": "entailment" + }, + { + "evidence": [ + "Multiple gallstones seen in the gallbladder." + ], + "phrase": "No gallstones are present in the gallbladder.", + "status": "contradiction" + } + ], + "phrases_B_evidenced": [ + { + "evidence": [ + "The pancreas shows evidence of atrophy." + ], + "phrase": "Atrophy of the pancreatic tissue is visible.", + "status": "entailment" + }, + { + "evidence": [ + "Hydronephrosis noted in the right kidney." + ], + "phrase": "Moderate right hydronephrosis is identified.", + "status": "entailment" + }, + { + "evidence": [ + "No gallstones are present in the gallbladder." + ], + "phrase": "Multiple gallstones seen in the gallbladder.", + "status": "contradiction" + } + ] + } +}, +{ + "example_id": "ct_example4_study7_study8", + "study_id": "nan", + "input": { + "phrases_A": [ + "No signs of pulmonary embolism.", + "The aorta is of normal caliber.", + "There is a small hiatal hernia." + ], + "phrases_B": [ + "The pulmonary arteries are clear with no embolism detected.", + "The aorta is unremarkable.", + "There is no evidence of renal calculi." + ] + }, + "rationale": "rationale", + "output": { + "phrases_A_evidenced": [ + { + "evidence": [ + "The pulmonary arteries are clear with no embolism detected." + ], + "phrase": "No signs of pulmonary embolism.", + "status": "entailment" + }, + { + "evidence": [ + "The aorta is unremarkable." 
+ ], + "phrase": "The aorta is of normal caliber.", + "status": "entailment" + }, + { + "evidence": [], + "phrase": "There is a small hiatal hernia.", + "status": "neutral" + } + ], + "phrases_B_evidenced": [ + { + "evidence": [ + "No signs of pulmonary embolism." + ], + "phrase": "The pulmonary arteries are clear with no embolism detected.", + "status": "entailment" + }, + { + "evidence": [ + "The aorta is of normal caliber." + ], + "phrase": "The aorta is unremarkable.", + "status": "entailment" + }, + { + "evidence": [], + "phrase": "There is no evidence of renal calculi.", + "status": "neutral" + } + ] + } +}, +{ + "example_id": "ct_example5_study9_study10", + "study_id": "nan", + "input": { + "phrases_A": [ + "Mild cardiomegaly is observed.", + "No evidence of bowel obstruction.", + "The spleen is of normal size." + ], + "phrases_B": [ + "Heart size is slightly enlarged.", + "The bowel loops are normal with no signs of obstruction.", + "The pancreas shows no abnormalities." + ] + }, + "rationale": "rationale", + "output": { + "phrases_A_evidenced": [ + { + "evidence": [ + "Heart size is slightly enlarged." + ], + "phrase": "Mild cardiomegaly is observed.", + "status": "entailment" + }, + { + "evidence": [ + "The bowel loops are normal with no signs of obstruction." + ], + "phrase": "No evidence of bowel obstruction.", + "status": "entailment" + }, + { + "evidence": [], + "phrase": "The spleen is of normal size.", + "status": "neutral" + } + ], + "phrases_B_evidenced": [ + { + "evidence": [ + "Mild cardiomegaly is observed." + ], + "phrase": "Heart size is slightly enlarged.", + "status": "entailment" + }, + { + "evidence": [ + "No evidence of bowel obstruction." 
+ ], + "phrase": "The bowel loops are normal with no signs of obstruction.", + "status": "entailment" + }, + { + "evidence": [], + "phrase": "The pancreas shows no abnormalities.", + "status": "neutral" + } + ] + } +}, +{ + "example_id": "ct_example6_study11_study12", + "study_id": "nan", + "input": { + "phrases_A": [ + "The heart size is normal on CT.", + "No infiltrates or masses identified in the lungs.", + "No pleural effusion seen on CT scan." + ], + "phrases_B": [ + "Multiple nodular densities in both lungs.", + "Heart size is normal on CT.", + "No pneumothorax detected." + ] + }, + "output": { + "phrases_A_evidenced": [ + { + "evidence": [ + "Heart size is normal on CT." + ], + "phrase": "The heart size is normal on CT.", + "status": "entailment" + }, + { + "evidence": [ + "Multiple nodular densities in both lungs." + ], + "phrase": "No infiltrates or masses identified in the lungs.", + "status": "contradiction" + }, + { + "evidence": [], + "phrase": "No pleural effusion seen on CT scan.", + "status": "neutral" + } + ], + "phrases_B_evidenced": [ + { + "evidence": [ + "No infiltrates or masses identified in the lungs." + ], + "phrase": "Multiple nodular densities in both lungs.", + "status": "contradiction" + }, + { + "evidence": [ + "The heart size is normal on CT." + ], + "phrase": "Heart size is normal on CT.", + "status": "entailment" + }, + { + "evidence": [], + "phrase": "No pneumothorax detected.", + "status": "neutral" + } + ] + } +}, +{ + "example_id": "1.3.6.1.4.1.55648.016633401174206362235121936818612914820_1.3.6.1.4.1.55648.241546114664305764079189793827197454684", + "study_id": "nan", + "input": { + "phrases_A": [ + "Scarring or atelectasis in the left lung base is seen.", + "Cardiomediastinum is unremarkable.", + "Degenerative changes are seen throughout the spine." + ], + "phrases_B": [ + "The lungs are clear.", + "The cardiomediastinal structures are unremarkable.", + "There are no pleural effusions." 
+ ] + }, + "rationale": "rationale", + "output": { + "phrases_A_evidenced": [ + { + "evidence": [ + "The lungs are clear." + ], + "phrase": "Scarring or atelectasis in the left lung base is seen.", + "status": "contradiction" + }, + { + "evidence": [ + "The cardiomediastinal structures are unremarkable." + ], + "phrase": "Cardiomediastinum is unremarkable.", + "status": "entailment" + }, + { + "evidence": [], + "phrase": "Degenerative changes are seen throughout the spine.", + "status": "neutral" + } + ], + "phrases_B_evidenced": [ + { + "evidence": [ + "Scarring or atelectasis in the left lung base is seen." + ], + "phrase": "The lungs are clear.", + "status": "contradiction" + }, + { + "evidence": [ + "Cardiomediastinum is unremarkable." + ], + "phrase": "The cardiomediastinal structures are unremarkable.", + "status": "entailment" + }, + { + "evidence": [], + "phrase": "There are no pleural effusions.", + "status": "neutral" + } + ] + } +}, +{ + "example_id": "1.3.6.1.4.1.55648.192816863503104216516892632261380714559_1.3.6.1.4.1.55648.287593655440699590688075363115862054018", + "study_id": "nan", + "input": { + "phrases_A": [ + "A moderate size left pleural effusion slightly larger in size.", + "Pacemaker is unchanged.", + "There is persistent consolidation in the left lung base." + ], + "phrases_B": [ + "There are small bilateral pleural effusions that have developed since prior study.", + "Left basilar consolidation is present.", + "The heart size is within normal limits.", + "No acute chest wall abnormality is radiographically evident." + ] + }, + "rationale": "rationale", + "output": { + "phrases_A_evidenced": [ + { + "evidence": [ + "There are small bilateral pleural effusions that have developed since prior study." 
+ ], + "phrase": "A moderate size left pleural effusion slightly larger in size.", + "status": "contradiction" + }, + { + "evidence": [], + "phrase": "Pacemaker is unchanged.", + "status": "neutral" + }, + { + "evidence": [], + "phrase": "There is persistent consolidation in the left lung base.", + "status": "neutral" + } + ], + "phrases_B_evidenced": [ + { + "evidence": [], + "phrase": "There are small bilateral pleural effusions that have developed since prior study.", + "status": "neutral" + }, + { + "evidence": [ + "There is persistent consolidation in the left lung base." + ], + "phrase": "Left basilar consolidation is present.", + "status": "entailment" + }, + { + "evidence": [], + "phrase": "The heart size is within normal limits.", + "status": "neutral" + }, + { + "evidence": [], + "phrase": "No acute chest wall abnormality is radiographically evident.", + "status": "neutral" + } + ] + } +}, +{ + "example_id": "1.3.6.1.4.1.55648.144771367909794201071026393675954665550_1.3.6.1.4.1.55648.84420269742583802251982398782271710226", + "study_id": "nan", + "input": { + "phrases_A": [ + "The heart is within normal limits.", + "Pulmonary vascularity is unremarkable.", + "There are patchy bibasilar infiltrates." + ], + "phrases_B": [ + "The left-sided pneumothorax has enlarged.", + "The left-sided pigtail chest tube remains in place.", + "Heart size is normal.", + "There is pulmonary consolidation." + ] + }, + "rationale": "rationale", + "output": { + "phrases_A_evidenced": [ + { + "evidence": [ + "Heart size is normal." 
+ ], + "phrase": "The heart is within normal limits.", + "status": "entailment" + }, + { + "evidence": [], + "phrase": "Pulmonary vascularity is unremarkable.", + "status": "neutral" + }, + { + "evidence": [], + "phrase": "There are patchy bibasilar infiltrates.", + "status": "neutral" + } + ], + "phrases_B_evidenced": [ + { + "evidence": [], + "phrase": "The left-sided pneumothorax has enlarged.", + "status": "neutral" + }, + { + "evidence": [], + "phrase": "The left-sided pigtail chest tube remains in place.", + "status": "neutral" + }, + { + "evidence": [ + "The heart is within normal limits." + ], + "phrase": "Heart size is normal.", + "status": "entailment" + }, + { + "evidence": [ + "There are patchy bibasilar infiltrates." + ], + "phrase": "There is pulmonary consolidation.", + "status": "entailment" + } + ] + } +}, +{ + "example_id": "ct_example7_study13_study14", + "study_id": "nan", + "input": { + "phrases_A": [ + "CT confirms bilateral ground-glass opacities.", + "No evidence of aortic aneurysm.", + "Liver is enlarged with fatty infiltration." + ], + "phrases_B": [ + "Ground-glass opacities are absent in the left lung.", + "Ground-glass opacities not found in the right lung.", + "Normal liver size and texture." + ] + }, + "output": { + "phrases_A_evidenced": [ + { + "evidence": [ + "Ground-glass opacities are absent in the left lung.", + "Ground-glass opacities not found in the right lung." + ], + "phrase": "CT confirms bilateral ground-glass opacities.", + "status": "contradiction" + }, + { + "evidence": [], + "phrase": "No evidence of aortic aneurysm.", + "status": "neutral" + }, + { + "evidence": [ + "Normal liver size and texture." + ], + "phrase": "Liver is enlarged with fatty infiltration.", + "status": "contradiction" + } + ], + "phrases_B_evidenced": [ + { + "evidence": [ + "CT confirms bilateral ground-glass opacities." 
+ ], + "phrase": "Ground-glass opacities are absent in the left lung.", + "status": "contradiction" + }, + { + "evidence": [ + "CT confirms bilateral ground-glass opacities." + ], + "phrase": "Ground-glass opacities not found in the right lung.", + "status": "contradiction" + }, + { + "evidence": [ + "Liver is enlarged with fatty infiltration." + ], + "phrase": "Normal liver size and texture.", + "status": "contradiction" + } + ] + } +} +] \ No newline at end of file diff --git a/src/radfact/llm_utils/nli/prompts/ct/system_message_ev_singlephrase.txt b/src/radfact/llm_utils/nli/prompts/ct/system_message_ev_singlephrase.txt new file mode 100644 index 0000000..164f2c9 --- /dev/null +++ b/src/radfact/llm_utils/nli/prompts/ct/system_message_ev_singlephrase.txt @@ -0,0 +1 @@ +You are an AI radiology assistant. Your task is to assess whether a statement about a CT scan (the "hypothesis") is true or not, given a reference report about the CT scan. This task is known as entailment verification. If the statement is true ("entailed") according to the reference, provide the evidence to support it. 
\ No newline at end of file diff --git a/src/radfact/llm_utils/nli/prompts/few_shot_examples.json b/src/radfact/llm_utils/nli/prompts/cxr/few_shot_examples.json similarity index 100% rename from src/radfact/llm_utils/nli/prompts/few_shot_examples.json rename to src/radfact/llm_utils/nli/prompts/cxr/few_shot_examples.json diff --git a/src/radfact/llm_utils/nli/prompts/system_message_ev_singlephrase.txt b/src/radfact/llm_utils/nli/prompts/cxr/system_message_ev_singlephrase.txt similarity index 100% rename from src/radfact/llm_utils/nli/prompts/system_message_ev_singlephrase.txt rename to src/radfact/llm_utils/nli/prompts/cxr/system_message_ev_singlephrase.txt diff --git a/src/radfact/llm_utils/prompt_tasks.py b/src/radfact/llm_utils/prompt_tasks.py new file mode 100644 index 0000000..6067b67 --- /dev/null +++ b/src/radfact/llm_utils/prompt_tasks.py @@ -0,0 +1,47 @@ +from dataclasses import dataclass +from enum import Enum +from pathlib import Path +from radfact.paths import get_prompts_dir + +REPORT_TO_PHRASES_PARSING_TASK = "report_to_phrases" +REPORT_TO_PHRASES_PROMPTS_DIR = get_prompts_dir(task=REPORT_TO_PHRASES_PARSING_TASK) +NLI_PARSING_TASK = "nli" +NLI_PROMPTS_DIR = get_prompts_dir(task=NLI_PARSING_TASK) + + +class ReportType(str, Enum): + CXR = "cxr" + CT = "ct" + + +@dataclass(frozen=True) +class PromptTask: + name: str + system_message_path: Path + few_shot_examples_path: Path + + +class ReportToPhrasesTaskOptions(Enum): + CXR = PromptTask( + name=f"{ReportType.CXR.value}_report_to_phrases", + system_message_path=REPORT_TO_PHRASES_PROMPTS_DIR / ReportType.CXR.value / "system_message.txt", + few_shot_examples_path=REPORT_TO_PHRASES_PROMPTS_DIR / ReportType.CXR.value / "few_shot_examples.json", + ) + CT = PromptTask( + name=f"{ReportType.CT.value}_report_to_phrases", + system_message_path=REPORT_TO_PHRASES_PROMPTS_DIR / ReportType.CT.value / "system_message.txt", + few_shot_examples_path=REPORT_TO_PHRASES_PROMPTS_DIR / ReportType.CT.value / 
"few_shot_examples.json", + ) + + +class NLITaskOptions(Enum): + CXR = PromptTask( + name=f"{ReportType.CXR.value}_nli", + system_message_path=NLI_PROMPTS_DIR / ReportType.CXR.value / "system_message_ev_singlephrase.txt", + few_shot_examples_path=NLI_PROMPTS_DIR / ReportType.CXR.value / "few_shot_examples.json", + ) + CT = PromptTask( + name=f"{ReportType.CT.value}_nli", + system_message_path=NLI_PROMPTS_DIR / ReportType.CT.value / "system_message_ev_singlephrase.txt", + few_shot_examples_path=NLI_PROMPTS_DIR / ReportType.CT.value / "few_shot_examples.json", + ) diff --git a/src/radfact/llm_utils/report_to_phrases/processor.py b/src/radfact/llm_utils/report_to_phrases/processor.py index 86f4b38..b88c5ab 100644 --- a/src/radfact/llm_utils/report_to_phrases/processor.py +++ b/src/radfact/llm_utils/report_to_phrases/processor.py @@ -7,29 +7,30 @@ from typing import Any import pandas as pd +from radfact.llm_utils.prompt_tasks import REPORT_TO_PHRASES_PARSING_TASK, ReportToPhrasesTaskOptions, ReportType from omegaconf import DictConfig from radfact.llm_utils.engine.engine import LLMEngine, get_subfolder from radfact.llm_utils.processor.structured_processor import StructuredProcessor from radfact.llm_utils.report_to_phrases.schema import ParsedReport, load_examples_from_json -from radfact.paths import OUTPUT_DIR, get_prompts_dir +from radfact.paths import OUTPUT_DIR FINDINGS_SECTION = "FINDINGS" -PARSING_TASK = "report_to_phrases" -PROMPTS_DIR = get_prompts_dir(task=PARSING_TASK) StudyIdType = str | int -def get_report_to_phrases_processor(log_dir: Path | None = None) -> StructuredProcessor[str, ParsedReport]: +def get_report_to_phrases_processor( + report_type: ReportType, log_dir: Path | None = None +) -> StructuredProcessor[str, ParsedReport]: """Return a processor for converting reports to phrases. + :param report_type: The type of report, e.g., ReportType.CXR or ReportType.CT. :param log_dir: The directory to save logs. 
:return: The processor for report to phrase conversion. """ - system_message_path = PROMPTS_DIR / "system_message.txt" - few_shot_examples_path = PROMPTS_DIR / "few_shot_examples.json" - system_prompt = system_message_path.read_text() - few_shot_examples = load_examples_from_json(few_shot_examples_path) + task = ReportToPhrasesTaskOptions[report_type.name].value + system_prompt = task.system_message_path.read_text() + few_shot_examples = load_examples_from_json(task.few_shot_examples_path) processor = StructuredProcessor( query_type=str, result_type=ParsedReport, @@ -58,12 +59,20 @@ def get_report_to_phrases_engine(cfg: DictConfig, dataset_df: pd.DataFrame) -> L :return: The processing engine. """ subfolder = cfg.dataset.name - root = OUTPUT_DIR / PARSING_TASK + root = OUTPUT_DIR / REPORT_TO_PHRASES_PARSING_TASK output_folder = get_subfolder(root, subfolder) final_output_folder = get_subfolder(root, subfolder) log_dir = get_subfolder(root, "logs") - report_to_phrases_processor = get_report_to_phrases_processor(log_dir=log_dir) + report_type_value = cfg.get("report_type") + try: + report_type = ReportType(report_type_value) + except ValueError as e: + raise ValueError( + f"Invalid report_type '{report_type_value}'. Valid options are: {[rt.value for rt in ReportType]}" + ) from e + + report_to_phrases_processor = get_report_to_phrases_processor(report_type=report_type, log_dir=log_dir) id_col = cfg.processing.index_col dataset_df = dataset_df[[id_col, FINDINGS_SECTION]] engine = LLMEngine( diff --git a/src/radfact/llm_utils/report_to_phrases/prompts/ct/few_shot_examples.json b/src/radfact/llm_utils/report_to_phrases/prompts/ct/few_shot_examples.json new file mode 100644 index 0000000..c46b7e6 --- /dev/null +++ b/src/radfact/llm_utils/report_to_phrases/prompts/ct/few_shot_examples.json @@ -0,0 +1,471 @@ +[ + { + "example_id": "few_shot1", + "findings_text": "The endotracheal tube is appropriately positioned 4.8 cm from the carina. 
An enteric tube is present and its tip is in the stomach. A right internal jugular central venous catheter is present and its tip is in the mid SVC. Bilateral parenchymal opacities are present, mainly at the bases, consistent with an infectious process. Interstitial opacities have worsened, indicating increased edema. A small left pleural effusion is stable. The right pleural effusion has slightly increased in size. No pneumothorax is present. The cardiomediastinal structures are normal.", + "parsed_report": { + "id": "few_shot1", + "sentence_list": [ + { + "orig": "The endotracheal tube is appropriately positioned 4.8 cm from the carina.", + "new": [ + "The endotracheal tube is appropriately positioned 4.8 cm from the carina." + ] + }, + { + "orig": "An enteric tube is present and its tip is in the stomach.", + "new": [ + "An enteric tube is present and its tip is in the stomach." + ] + }, + { + "orig": "A right internal jugular central venous catheter is present and its tip is in the mid SVC.", + "new": [ + "A right internal jugular central venous catheter is present and its tip is in the mid SVC." + ] + }, + { + "orig": "Bilateral parenchymal opacities are present, mainly at the bases, consistent with an infectious process.", + "new": [ + "Bilateral parenchymal opacities are present, mainly at the bases." + ] + }, + { + "orig": "Interstitial opacities have worsened, indicating increased edema.", + "new": [ + "Interstitial opacities have worsened, indicating increased edema." + ] + }, + { + "orig": "A small left pleural effusion is stable.", + "new": [ + "A small left pleural effusion is stable." + ] + }, + { + "orig": "The right pleural effusion has slightly increased in size.", + "new": [ + "The right pleural effusion has slightly increased in size." + ] + }, + { + "orig": "No pneumothorax is present.", + "new": [ + "No pneumothorax is present." 
+ ] + }, + { + "orig": "The cardiomediastinal structures are normal.", + "new": [ + "The cardiomediastinal structures are normal." + ] + } + ] + }, + "study_id": "study_id_1", + "example_rationale": "Positive finding." + }, + { + "example_id": "few_shot2", + "findings_text": "The perihilar opacities have increased since the last examination, along with pulmonary vascular congestion and widespread opacity. There is moderate cardiomegaly and a new small left-sided pleural effusion. No pneumothorax is observed.", + "parsed_report": { + "id": "few_shot2", + "sentence_list": [ + { + "orig": "The perihilar opacities have increased since the last examination, along with pulmonary vascular congestion and widespread opacity.", + "new": [ + "The perihilar opacities have increased since the last examination.", + "Pulmonary vascular congestion has increased since the last examination.", + "Widespread opacity has increased since the last examination." + ] + }, + { + "orig": "There is moderate cardiomegaly and a new small left-sided pleural effusion.", + "new": [ + "There is moderate cardiomegaly.", + "New small left-sided pleural effusion." + ] + }, + { + "orig": "No pneumothorax is observed.", + "new": [ + "No pneumothorax is observed." + ] + } + ] + }, + "study_id": "study_id_2", + "example_rationale": "Multiple positive findings" + }, + { + "example_id": "few_shot3", + "findings_text": "Chest CT scan was performed with axial slices, and images were obtained in multiple planes. Diffuse peribronchovascular opacities are observed throughout the lungs, which may indicate diffuse bronchopneumonia. No lobar consolidation, effusion, or pneumothorax is detected. The cardiomediastinal structures appear normal, and the bony structures are intact. 
No free air is observed below the right hemidiaphragm.", + "parsed_report": { + "id": "few_shot3", + "sentence_list": [ + { + "orig": "Chest CT scan was performed with axial slices, and images were obtained in multiple planes.", + "new": [ + "" + ] + }, + { + "orig": "Diffuse peribronchovascular opacities are observed throughout the lungs, which may indicate diffuse bronchopneumonia.", + "new": [ + "Diffuse peribronchovascular opacities are observed throughout the lungs." + ] + }, + { + "orig": "No lobar consolidation, effusion, or pneumothorax is detected.", + "new": [ + "No lobar consolidation is detected.", + "No effusion is detected", + "No pneumothorax is detected." + ] + }, + { + "orig": "The cardiomediastinal structures appear normal, and the bony structures are intact.", + "new": [ + "The cardiomediastinal structures appear normal.", + "The bony structures are intact." + ] + }, + { + "orig": "No free air is observed below the right hemidiaphragm.", + "new": [ + "No free air is observed below the right hemidiaphragm." + ] + } + ] + }, + "study_id": "study_id_3", + "example_rationale": "Negative finding without location." + }, + { + "example_id": "few_shot4", + "findings_text": "The left PICC line terminates in the upper SVC and the tracheostomy remains in its original position. The chronic left atelectasis persists without any change. Top normal heart size is stable. No evidence of pneumothorax or right pleural effusion is observed.", + "parsed_report": { + "id": "few_shot4", + "sentence_list": [ + { + "orig": "The left PICC line terminates in the upper SVC and the tracheostomy remains in its original position.", + "new": [ + "The left PICC line terminates in the upper SVC.", + "Tracheostomy remains in its original position" + ] + }, + { + "orig": "The chronic left atelectasis persists without any change.", + "new": [ + "The chronic left atelectasis persists without any change." 
+ ] + }, + { + "orig": "Top normal heart size is stable.", + "new": [ + "Top normal heart size is stable." + ] + }, + { + "orig": "No evidence of pneumothorax or right pleural effusion is observed.", + "new": [ + "No evidence of pneumothorax is observed.", + "No right pleural effusion is observed." + ] + } + ] + }, + "study_id": "study_id_4", + "example_rationale": "Negative finding with location." + }, + { + "example_id": "few_shot5", + "findings_text": "The lungs are still hyperinflated, consistent with a history of asthma. Compared to the previous study, there are now hazy areas in the lower parts of both lungs. These may be caused by thickening of the bronchial walls or small airway disease, but there is no evidence of a specific lobe being affected. No pleural effusion or pneumothorax is seen. The cardiac and mediastinal structures are unremarkable.", + "parsed_report": { + "id": "few_shot5", + "sentence_list": [ + { + "orig": "The lungs are still hyperinflated, consistent with a history of asthma.", + "new": [ + "The lungs are still hyperinflated." + ] + }, + { + "orig": "Compared to the previous study, there are now hazy areas in the lower parts of both lungs. These may be caused by thickening of the bronchial walls or small airway disease, but there is no evidence of a specific lobe being affected.", + "new": [ + "Compared to the previous study, there are now hazy areas in the lower parts of both lungs." + ] + }, + { + "orig": "No pleural effusion or pneumothorax is seen.", + "new": [ + "No pleural effusion is seen.", + "No pneumothorax is seen." + ] + }, + { + "orig": "The cardiac and mediastinal structures are unremarkable.", + "new": [ + "The cardiac structures are unremarkable.", + "The mediastinal structures are unremarkable." + ] + } + ] + }, + "study_id": "study_id_5", + "example_rationale": "Clinical interpretation." 
+ }, + { + "example_id": "few_shot6", + "findings_text": "The CT scan reveals the presence of a PAC (pulmonary artery catheter) overlying the upper left lateral hemithorax. There is a relative opacity observed in the left mid-to-lower lung, possibly located in the lingula. This could be attributed to the patient's positioning, an underlying infection causing consolidation, or a contusion. No pleural effusion is seen. There is no evidence of pneumothorax. The cardiac and mediastinal structures are unremarkable. Thoracic scoliosis is present.", + "parsed_report": { + "id": "few_shot6", + "sentence_list": [ + { + "orig": "The CT scan reveals the presence of a PAC (pulmonary artery catheter) overlying the upper left lateral hemithorax.", + "new": [ + "The CT scan reveals the presence of a PAC (pulmonary artery catheter) overlying the upper left lateral hemithorax." + ] + }, + { + "orig": "There is a relative opacity observed in the left mid-to-lower lung, possibly located in the lingula. This could be attributed to the patient's positioning, an underlying infection causing consolidation, or a contusion.", + "new": [ + "There is a relative opacity observed in the left mid-to-lower lung, possibly located in the lingula." + ] + }, + { + "orig": "No pleural effusion is seen.", + "new": [ + "No pleural effusion is seen." + ] + }, + { + "orig": "There is no evidence of pneumothorax.", + "new": [ + "There is no evidence of pneumothorax." + ] + }, + { + "orig": "The cardiac and mediastinal structures are unremarkable.", + "new": [ + "The cardiac structures are unremarkable.", + "The mediastinal structures are unremarkable." + ] + }, + { + "orig": "Thoracic scoliosis is present.", + "new": [ + "Thoracic scoliosis is present." + ] + } + ] + }, + "study_id": "study_id_6", + "example_rationale": "Comment on technical quality." 
+ }, + { + "example_id": "few_shot7", + "findings_text": "The CT scan shows linear opacities in the right middle lobe and left lower lobe, indicating atelectasis. Mediastinum is unchanged. The patient has undergone a right mastectomy. No pneumothorax is observed. However, there is a new blunting of the left costophrenic angle, which could be due to effusion or pleural thickening. These findings warrant correlation with the patient's clinical history of malignancy.", + "parsed_report": { + "id": "few_shot7", + "sentence_list": [ + { + "orig": "The CT scan shows linear opacities in the right middle lobe and left lower lobe, indicating atelectasis.", + "new": [ + "The CT scan shows linear opacities in the right middle lobe and left lower lobe, indicating atelectasis." + ] + }, + { + "orig": "Mediastinum is unchanged.", + "new": [ + "Mediastinum is unchanged." + ] + }, + { + "orig": "The patient has undergone a right mastectomy.", + "new": [ + "The patient has undergone a right mastectomy." + ] + }, + { + "orig": "No pneumothorax is observed.", + "new": [ + "No pneumothorax is observed." + ] + }, + { + "orig": "However, there is a new blunting of the left costophrenic angle, which could be due to effusion or pleural thickening. These findings warrant correlation with the patient's clinical history of malignancy.", + "new": [ + "There is new blunting of the left costophrenic angle." + ] + } + ] + }, + "study_id": "study_id_7", + "example_rationale": "Recommendation." + }, + { + "example_id": "few_shot8", + "findings_text": "Cardiomediastinal structures appear unchanged. The lungs are clear without any signs of pneumonia. However, there is some overlap of structures in the left juxtamediastinal area. 
Findings should be correlated with clinical suspicion of infection or further imaging as necessary.", + "parsed_report": { + "id": "few_shot8", + "sentence_list": [ + { + "orig": "Cardiomediastinal structures appear unchanged.", + "new": [ + "Cardiomediastinal structures appear unchanged." + ] + }, + { + "orig": "The lungs are clear without any signs of pneumonia. However, there is some overlap of structures in the left juxtamediastinal area.", + "new": [ + "The lungs are clear.", + "Overlap of structures in the left juxtamediastinal area." + ] + }, + { + "orig": "Findings should be correlated with clinical suspicion of infection or further imaging as necessary.", + "new": [ + "" + ] + } + ] + }, + "study_id": "study_id_8", + "example_rationale": "Other irrelevant detail." + }, + { + "example_id": "few_shot9", + "findings_text": "The current scan was compared to the previous one from _. The left side still shows mediastinal shifting and volume loss. This remains unchanged. The increased density at the left base indicates the collapse of the postoperative left lung, as previously suggested. The right lung is well aerated, with no signs of pulmonary edema, focal consolidation, or pleural effusions. No pneumothoraces are observed. A central venous line is present on the right side, with the distal lead tip located in the proximal SVC.", + "parsed_report": { + "id": "few_shot9", + "sentence_list": [ + { + "orig": "The current scan was compared to the previous one from _.", + "new": [ + "" + ] + }, + { + "orig": "The left side still shows mediastinal shifting and volume loss. This remains unchanged.", + "new": [ + "The left side still shows mediastinal shifting and volume loss." + ] + }, + { + "orig": "The increased density at the left base indicates the collapse of the postoperative left lung, as previously suggested.", + "new": [ + "The increased density at the left base indicates the collapse of the postoperative left lung, as previously suggested." 
+ ] + }, + { + "orig": "The right lung is well aerated, with no signs of pulmonary edema, focal consolidation, or pleural effusions.", + "new": [ + "The right lung is well aerated.", + "No signs of pulmonary edema.", + "No signs of focal consolidation.", + "No signs of pleural effusions." + ] + }, + { + "orig": "No pneumothoraces are observed.", + "new": [ + "No pneumothoraces are observed." + ] + }, + { + "orig": "A central venous line is present on the right side, with the distal lead tip located in the proximal SVC.", + "new": [ + "A central venous line is present on the right side, with the distal lead tip located in the proximal SVC." + ] + } + ] + }, + "study_id": "study_id_9", + "example_rationale": "Combine sentences." + }, + { + "example_id": "few_shot10", + "findings_text": "The liver, spleen, pancreas, and kidneys appear normal in size and morphology. No focal hepatic or splenic lesions are identified. There is mild wall thickening of the sigmoid colon, which may represent colitis or nonspecific changes. There is no free fluid or free air in the abdomen. Findings should be correlated with clinical symptoms, and follow-up imaging may be considered if clinically indicated.", + "parsed_report": { + "id": "few_shot10", + "sentence_list": [ + { + "orig": "The liver, spleen, pancreas, and kidneys appear normal in size and morphology.", + "new": [ + "The liver appears normal in size and morphology.", + "The spleen appears normal in size and morphology.", + "The pancreas appears normal in size and morphology.", + "The kidneys appear normal in size and morphology." + ] + }, + { + "orig": "No focal hepatic or splenic lesions are identified.", + "new": [ + "No focal hepatic lesions are identified.", + "No focal splenic lesions are identified." + ] + }, + { + "orig": "There is mild wall thickening of the sigmoid colon, which may represent colitis or nonspecific changes.", + "new": [ + "Mild wall thickening of the sigmoid colon is present." 
+ ] + }, + { + "orig": "There is no free fluid or free air in the abdomen.", + "new": [ + "No free fluid is present in the abdomen.", + "No free air is present in the abdomen." + ] + }, + { + "orig": "Findings should be correlated with clinical symptoms, and follow-up imaging may be considered if clinically indicated.", + "new": [ + "" + ] + } + ] + }, + "study_id": "study_id_10", + "example_rationale": "Other irrelevant detail." + }, + { + "example_id": "few_shot11", + "findings_text": "The CT scan shows the liver to be normal in size with no focal lesions. The spleen, pancreas, and kidneys appear unremarkable. There is evidence of prior cholecystectomy. The abdominal aorta is normal in caliber, and no lymphadenopathy is seen.", + "parsed_report": { + "id": "few_shot11", + "sentence_list": [ + { + "orig": "The CT scan shows the liver to be normal in size with no focal lesions.", + "new": [ + "The liver is normal in size.", + "No focal lesions are identified." + ] + }, + { + "orig": "The spleen, pancreas, and kidneys appear unremarkable.", + "new": [ + "The spleen appears unremarkable.", + "The pancreas appears unremarkable.", + "The kidneys appear unremarkable." + ] + }, + { + "orig": "There is evidence of prior cholecystectomy.", + "new": [ + "There is evidence of prior cholecystectomy." + ] + }, + { + "orig": "The abdominal aorta is normal in caliber, and no lymphadenopathy is seen.", + "new": [ + "The abdominal aorta is normal in caliber.", + "No lymphadenopathy is seen." + ] + } + ] + }, + "study_id": "study_id_11", + "example_rationale": "Negative finding without location." + } +] \ No newline at end of file diff --git a/src/radfact/llm_utils/report_to_phrases/prompts/ct/system_message.txt b/src/radfact/llm_utils/report_to_phrases/prompts/ct/system_message.txt new file mode 100644 index 0000000..533e9f9 --- /dev/null +++ b/src/radfact/llm_utils/report_to_phrases/prompts/ct/system_message.txt @@ -0,0 +1,13 @@ +You are an AI radiology assistant. 
You are helping process reports from CT (computed tomography) scans. + +Please extract phrases from the radiology report which refer to objects, findings, or anatomies visible in a CT scan, or the absence of such. + +Rules: +- If a sentence describes multiple findings, split them up into separate sentences. +- Exclude clinical speculation or interpretation (e.g. "... highly suggestive of pneumonia"). +- Exclude recommendations (e.g. "Recommend further imaging or follow-up"). +- Exclude comments on the technical quality of the CT scan (e.g. "motion artifacts noted in the scan"). +- Include mentions of change (e.g. "Pleural effusion has increased") because change is visible when we compare two scans. +- If consecutive sentences are closely linked such that one sentence can't be understood without the other one, process them together. + +The objective is to extract phrases which refer to things which can be located on a CT scan, or confirmed not to be present. \ No newline at end of file diff --git a/src/radfact/llm_utils/report_to_phrases/prompts/few_shot_examples.json b/src/radfact/llm_utils/report_to_phrases/prompts/cxr/few_shot_examples.json similarity index 100% rename from src/radfact/llm_utils/report_to_phrases/prompts/few_shot_examples.json rename to src/radfact/llm_utils/report_to_phrases/prompts/cxr/few_shot_examples.json diff --git a/src/radfact/llm_utils/report_to_phrases/prompts/system_message.txt b/src/radfact/llm_utils/report_to_phrases/prompts/cxr/system_message.txt similarity index 100% rename from src/radfact/llm_utils/report_to_phrases/prompts/system_message.txt rename to src/radfact/llm_utils/report_to_phrases/prompts/cxr/system_message.txt diff --git a/tests/metric/test_radfact.py b/tests/metric/test_radfact.py index 2d08a1f..b631d72 100644 --- a/tests/metric/test_radfact.py +++ b/tests/metric/test_radfact.py @@ -8,6 +8,16 @@ import mock import pandas as pd +from radfact.llm_utils.report_to_phrases.processor import get_report_to_phrases_engine 
+from radfact.metric.radfact import REPORT_TO_PHRASES_CONFIG, init_hydra_config +from radfact.llm_utils.nli.processor import get_ev_processor_singlephrase +from radfact.paths import OUTPUT_DIR +from radfact.llm_utils.prompt_tasks import ReportType +from radfact.llm_utils.report_to_phrases.processor import get_report_to_phrases_processor +from radfact.llm_utils.prompt_tasks import NLITaskOptions, ReportToPhrasesTaskOptions +from radfact.llm_utils.nli.schema import NLISampleSinglePhrase +from radfact.llm_utils.report_to_phrases.schema import load_examples_from_json as load_examples_from_json_phrasification + import pytest from numpy.testing import assert_equal from omegaconf import DictConfig @@ -23,6 +33,7 @@ NLIQuerySample, NLISample, NLIState, + load_examples_from_json as load_examples_from_json_nli, ) from radfact.llm_utils.report_to_phrases.processor import StudyIdType from radfact.llm_utils.report_to_phrases.schema import ParsedReport, SentenceWithRephrases @@ -433,3 +444,41 @@ def test_convert_narrative_text_to_phrases() -> None: with mock.patch('radfact.metric.radfact.get_report_to_phrases_engine', return_value=mock_phrase_engine): processed_texts = metric.convert_narrative_text_to_phrases(input_texts, metric_prefix) assert processed_texts == expected_texts + + +@pytest.mark.parametrize("report_type_value", ["cxr", "ct"]) +def test_report_type_phrasification(report_type_value: str) -> None: + report_type = ReportType(report_type_value) + processor = get_report_to_phrases_processor(report_type=report_type) + + task = ReportToPhrasesTaskOptions[report_type.name].value + + system_message = task.system_message_path.read_text() + assert processor.query_template.system_prompt.startswith(system_message) + few_shot_examples = load_examples_from_json_phrasification(task.few_shot_examples_path) + assert few_shot_examples == processor.query_template.examples # type: ignore[comparison-overlap] + + +@pytest.mark.parametrize("report_type_value", ["cxr", "ct"]) +def 
test_report_type_nli(report_type_value: str) -> None: + report_type = ReportType(report_type_value) + processor = get_ev_processor_singlephrase(report_type=report_type, log_dir=OUTPUT_DIR / "ev_processor_logs_test") + + task = NLITaskOptions[report_type.name].value + + system_message = task.system_message_path.read_text() + assert processor.query_template.system_prompt == system_message + few_shot_examples = load_examples_from_json_nli(task.few_shot_examples_path, binary=True) + few_shot_examples_single_phrase: list[NLISampleSinglePhrase] = [] + for few_shot_example in few_shot_examples: + one_way_dict = NLISampleSinglePhrase.from_nli_sample(few_shot_example) + for single_phrase_sample in one_way_dict.values(): + few_shot_examples_single_phrase.extend(single_phrase_sample) + assert few_shot_examples_single_phrase == processor.query_template.examples + + +def test_invalid_report_type() -> None: + config = init_hydra_config(REPORT_TO_PHRASES_CONFIG) + config.report_type = "invalid_type" + with pytest.raises(ValueError): + get_report_to_phrases_engine(cfg=config, dataset_df=pd.DataFrame({}, columns=["study_id", "FINDINGS"])) From 2abbf32ebc9cf0e33f45555aa5c06b55e8159cb3 Mon Sep 17 00:00:00 2001 From: Cynthia Lo Date: Mon, 12 Jan 2026 04:01:26 -0800 Subject: [PATCH 02/14] Update README --- README.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 05c248c..79da37b 100644 --- a/README.md +++ b/README.md @@ -173,15 +173,16 @@ options: Refer to the example input files in the [`examples`](examples) directory for the expected format of the input files. The input files should be in the format of a CSV file for non-grounded reports [findings_generation_examples.csv](examples/findings_generation_examples.csv) and a JSON file for grounded reports [grounded_reporting_examples.json](examples/grounded_reporting_examples.json). +The script computes confidence intervals for the metrics using bootstrapping. 
The number of bootstrap samples can be controlled using the `--bootstrap_samples` argument. The default value is 500. To disable bootstrapping, set `--bootstrap_samples 0`. + +⚠️**WARNING**: Some queries may fail due to the endpoint limitations (timeouts, rate limits, etc.). When the LLM performing entailment verification fails, we **set these examples as not-entailed by default**. If this occurs in a significant number of cases, the results will not be reliable. The final metrics dict contains the number of such skipped queries under the key `num_llm_failures`. The script will print the number of skipped queries at the end of the run, and store these in the `skipped` directory under the run id folder. You will also see a warning message in the logs for each failed query. `WARNING: No response for example {query_id}. Setting as NOT ENTAILED`. + +### Supporting Multiple Report Types RadFact supports different report types through the `report_type` field in [`configs/default.yaml`](configs/default.yaml). Currently supported options are: + +- `cxr` - Chest X-ray reports (default) +- `ct` - CT scan reports + -The script computes confidence intervals for the metrics using bootstrapping. The number of bootstrap samples can be controlled using the `--bootstrap_samples` argument. The default value is 500. To disable bootstrapping, set `--bootstrap_samples 0`. + -⚠️**WARNING**: Some queries may fail due to the endpoint limitations (timeouts, rate limits, etc.). When the LLM performing entailment verification fails, we **set these examples as not-entailed by default**. If this occurs in a significant number of cases, the results will not be reliable. The final metrics dict contains the number of such skipped queries under the key `num_llm_failures`. The script will print the number of skipped queries at the end of the run, and store these in the `skipped` directroy under the run id folder. You will also see a warning message in the logs for each failed query. 
`WARNING: No response for example {query_id}. Setting as NOT ENTAILED`. - ### Split reports into phrases We also provide a script to convert reports to phrases. This is useful when you have a narrative report and want to convert it to a list of phrases for RadFact evaluation. You can run this step offline and then use the output file as input to RadFact. Make sure you've set up the endpoints as described above before running the script. The `run_report_to_phrases` command runs `python src/radfact/cli/run_report_to_phrases.py` script under the hood. From 068187fd3b788260fc13eb1f7e310647a3b989f7 Mon Sep 17 00:00:00 2001 From: Cynthia Lo Date: Mon, 12 Jan 2026 04:15:39 -0800 Subject: [PATCH 03/14] Add comment to config --- configs/default.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/default.yaml b/configs/default.yaml index 9878825..abffd5e 100644 --- a/configs/default.yaml +++ b/configs/default.yaml @@ -21,7 +21,7 @@ processing: end_index: null output_filename: "outputs.json" -report_type: "cxr" +report_type: "cxr" # cxr or ct # The type of cache that should be set for langchain. This can be either "redis" or "sqlite". 
# Sqlite cache is useful for local development, it will be written to ~/.langchain.db From 66373d7b7d529d56ce4397b09934a143ca391756 Mon Sep 17 00:00:00 2001 From: Cynthia Lo Date: Mon, 12 Jan 2026 09:54:32 -0800 Subject: [PATCH 04/14] Add support for ct negative filtering --- configs/report_to_phrases.yaml | 1 + .../llm_utils/negative_filtering/__init__.py | 0 .../llm_utils/negative_filtering/processor.py | 135 ++++++++++++++++++ .../prompts/ct/few_shot_examples.json | 46 ++++++ .../prompts/ct/system_message.txt | 13 ++ .../processor/structured_processor.py | 30 ++++ src/radfact/llm_utils/prompt_tasks.py | 10 ++ .../llm_utils/report_to_phrases/processor.py | 6 +- .../llm_utils/report_to_phrases/schema.py | 14 +- src/radfact/metric/radfact.py | 29 +++- 10 files changed, 277 insertions(+), 7 deletions(-) create mode 100644 src/radfact/llm_utils/negative_filtering/__init__.py create mode 100644 src/radfact/llm_utils/negative_filtering/processor.py create mode 100644 src/radfact/llm_utils/negative_filtering/prompts/ct/few_shot_examples.json create mode 100644 src/radfact/llm_utils/negative_filtering/prompts/ct/system_message.txt diff --git a/configs/report_to_phrases.yaml b/configs/report_to_phrases.yaml index d61efab..9ad0eaa 100644 --- a/configs/report_to_phrases.yaml +++ b/configs/report_to_phrases.yaml @@ -8,3 +8,4 @@ defaults: dataset: name: reports_to_phrases csv_path: "" + filter_negatives: false diff --git a/src/radfact/llm_utils/negative_filtering/__init__.py b/src/radfact/llm_utils/negative_filtering/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/radfact/llm_utils/negative_filtering/processor.py b/src/radfact/llm_utils/negative_filtering/processor.py new file mode 100644 index 0000000..e54d707 --- /dev/null +++ b/src/radfact/llm_utils/negative_filtering/processor.py @@ -0,0 +1,135 @@ +# ------------------------------------------------------------------------------------------ +# Copyright (c) Microsoft Corporation. 
All rights reserved. +# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information. +# ------------------------------------------------------------------------------------------ + +from collections import defaultdict +import json +from pathlib import Path + +import pandas as pd +from radfact.llm_utils.report_to_phrases.processor import StudyIdType +from radfact.llm_utils.prompt_tasks import NegativeFilteringTaskOptions, ReportType +from omegaconf import DictConfig + +from radfact.llm_utils.engine.engine import LLMEngine, get_subfolder +from radfact.llm_utils.processor.structured_processor import StructuredProcessor, parse_examples_from_json +from radfact.llm_utils.report_to_phrases.schema import ParsedReport, Rephrases, RephrasesExample, SentenceWithRephrases +from radfact.paths import OUTPUT_DIR + + +def get_negative_filtering_phrase_processor( + report_type: ReportType, log_dir: Path | None = None +) -> StructuredProcessor[list[str], Rephrases]: + """Return a processor for filtering negative findings from a list of phrases. + + :param report_type: The type of report, e.g., "ReportType.CXR" or "ReportType.CT". + :param log_dir: The directory to save logs. + :return: The processor for negative finding filtering. + """ + task = NegativeFilteringTaskOptions[report_type.name].value + system_prompt = task.system_message_path.read_text() + few_shot_examples = parse_examples_from_json(task.few_shot_examples_path, RephrasesExample) + processor = StructuredProcessor( + query_type=list[str], + result_type=Rephrases, + system_prompt=system_prompt, + format_query_fn=lambda x: json.dumps(x), + few_shot_examples=few_shot_examples, + log_dir=log_dir, + ) + return processor + + +def load_filtering_queries_from_parsed_reports( + reports: list[ParsedReport], +) -> pd.DataFrame: + """ + Load queries for filtering from a list of parsed reports. 
Queries consist of all the + newly parsed phrases from phrasification, along with metadata including the study ID + and original phrase. + :param reports: A list of ParsedReport objects. + :return: A list of queries. + """ + queries = [] + report_ids: dict[StudyIdType, int] = defaultdict(int) + for report in reports: + for sentence in report.sentence_list: + assert report.id is not None + queries.append([f"{str(report.id)}_{report_ids[report.id]}", sentence.orig, sentence.new]) + report_ids[report.id] += 1 + query_df = pd.DataFrame(queries, columns=["study_id", "orig", "new_phrases"]) + return query_df + + +def get_negative_filtering_engine(cfg: DictConfig, parsed_reports: list[ParsedReport]) -> LLMEngine: + """ + Create the processing engine for filtering negative findings from parsed reports. + + :param cfg: The configuration for the processing engine. + :return: The processing engine. + """ + subfolder = cfg.dataset.name + root = OUTPUT_DIR / "negative_report_filtering" + output_folder = get_subfolder(root, subfolder) + final_output_folder = get_subfolder(root, subfolder) + log_dir = get_subfolder(root, "logs") + + report_type_value = cfg.get("report_type") + try: + report_type = ReportType(report_type_value) + except ValueError as e: + raise ValueError( + f"Invalid report_type '{report_type_value}'. Valid options are: {[rt.value for rt in ReportType]}" + ) from e + + query_df = load_filtering_queries_from_parsed_reports(parsed_reports) + negative_filtering_processor = get_negative_filtering_phrase_processor(report_type=report_type, log_dir=log_dir) + + engine = LLMEngine( + cfg=cfg, + processor=negative_filtering_processor, + dataset_df=query_df, + row_to_query_fn=lambda row: row["new_phrases"], + progress_output_folder=output_folder, + final_output_folder=final_output_folder, + ) + return engine + + +def process_filtered_reports(engine: LLMEngine) -> tuple[list[ParsedReport], int]: + """ + Process the filtered reports using the provided engine. 
+ + :param engine: The LLMEngine used for processing. + :return: A tuple containing a list of ParsedReport objects and the number of rewritten sentences. + """ + outputs = engine.return_raw_outputs + metadata = engine.return_dataset_subsets + + parsed_report_dict = defaultdict(list) + num_rewritten_sentences = 0 + for k in outputs.keys(): + rephrases = outputs[k] + metadata_df = metadata[k].df + + for idx, row in metadata_df.iterrows(): + study_id = row["study_id"].split("_")[0] + orig = row["orig"] + unfiltered_phrases = set(row["new_phrases"]) + filtered_phrases = set(rephrases[idx].new) + + if not filtered_phrases.issubset(unfiltered_phrases): + rewritten_phrases = filtered_phrases - unfiltered_phrases + print( + f"New phrases {rewritten_phrases} not in original phrases {unfiltered_phrases}. Reverting back to original phrases." + ) + filtered_phrases = unfiltered_phrases + num_rewritten_sentences += 1 + + parsed_report_dict[study_id].append(SentenceWithRephrases(orig=orig, new=list(filtered_phrases))) + + parsed_reports = [ + ParsedReport(id=study_id, sentence_list=sentences) for study_id, sentences in parsed_report_dict.items() + ] + return parsed_reports, num_rewritten_sentences diff --git a/src/radfact/llm_utils/negative_filtering/prompts/ct/few_shot_examples.json b/src/radfact/llm_utils/negative_filtering/prompts/ct/few_shot_examples.json new file mode 100644 index 0000000..ac55c6a --- /dev/null +++ b/src/radfact/llm_utils/negative_filtering/prompts/ct/few_shot_examples.json @@ -0,0 +1,46 @@ +[ + { + "input": [ + "There is a relative opacity observed in the left mid-to-lower lung, possibly located in the lingula.", + "There is no evidence of pneumothorax.", + "The cardiac silhouette is unremarkable.", + "The mediastinal silhouette is unremarkable.", + "Mild recessions are observed in the upper lobe of the left lung." 
+ ], + "output": { + "new": [ + "There is a relative opacity observed in the left mid-to-lower lung, possibly located in the lingula.", + "Mild recessions are observed in the upper lobe of the left lung." + ] + } + }, { + "input": [ + "The right lung is well aerated.", + "No signs of pulmonary edema.", + "No signs of focal consolidation.", + "The left side still shows mediastinal shifting and volume loss.", + "No signs of pleural effusions." + ], + "output": { + + "new": [ + "The left side still shows mediastinal shifting and volume loss." + ] + } + }, { + "input": [ + "There is a moderate right pleural effusion.", + "There is no pneumothorax.", + "The heart size is within normal limits.", + "The radiograph shows linear opacities in the right middle lobe and left lower lobe, indicating atelectasis.", + "The mediastinal contours are unremarkable." + ], + "output": { + + "new": [ + "There is a moderate right pleural effusion.", + "The radiograph shows linear opacities in the right middle lobe and left lower lobe, indicating atelectasis." + ] + } + } +] \ No newline at end of file diff --git a/src/radfact/llm_utils/negative_filtering/prompts/ct/system_message.txt b/src/radfact/llm_utils/negative_filtering/prompts/ct/system_message.txt new file mode 100644 index 0000000..9b01b2a --- /dev/null +++ b/src/radfact/llm_utils/negative_filtering/prompts/ct/system_message.txt @@ -0,0 +1,13 @@ +You are an AI radiology assistant. You are helping process reports from CT (computed tomography) scans. + +You are given a list of phrases from a radiology report which refer to objects, findings, or anatomies visible in a CT scan, or the absence of such. + +Your goal is to filter phrases that do not refer to positive radiology findings. + +Rules: +- Remove statements describing the absence of pathology (e.g. "No pneumothorax", "No pleural effusion detected") +- Remove statements describing normal anatomical appearance, calibration, or function (e.g. 
"The liver is normal in size", "Upper abdominal organs are normal", "Thoracic esophageal calibration was normal", "The lungs are well aerated", "Lungs are clear") +- Remove statements describing unremarkable appearances (e.g. "Kidneys appear unremarkable", "The mediastinum is unremarkable") +- Keep statements referring to "mild" observations or conditions, as those are still considered positive radiology findings + +The objective is to remove phrases which do not refer to positive radiology findings. \ No newline at end of file diff --git a/src/radfact/llm_utils/processor/structured_processor.py b/src/radfact/llm_utils/processor/structured_processor.py index dc3256d..0b8667f 100644 --- a/src/radfact/llm_utils/processor/structured_processor.py +++ b/src/radfact/llm_utils/processor/structured_processor.py @@ -3,6 +3,7 @@ # Licensed under the MIT License (MIT). See LICENSE in the repo root for license information. # ------------------------------------------------------------------------------------------ +import json import logging from enum import Enum from functools import partial @@ -22,6 +23,7 @@ _QUERY_KEY = "query" ResultT = TypeVar("ResultT", bound=BaseModel) +ExampleClassT = TypeVar("ExampleClassT", bound=BaseModel) ProcessorStats = dict[str, int] @@ -55,6 +57,34 @@ class Example(Protocol, Generic[QueryT, ResultT]): output: ResultT +def parse_examples_from_json(examples_path: Path | None, example_class: type[ExampleClassT]) -> list[ExampleClassT]: + """ + This function returns a list of "parsed" examples from a JSON file. + + This JSON file is expected to contain a list of JSON-formatted objects, which should + be parseable by the "example class" (expected to be some Pydantic model). + + If no path is provided, an empty list is returned. + + This function is especially useful for loading few-shot examples for a structured processor. + + :param examples_path: Path to the JSON file containing the examples. + If None, an empty list is returned. 
+ :param example_class: The class of the examples to load. A Pydantic model. + We will attempt to parse each object in the JSON file as an instance of this class. + :return: List of examples, as instances of the provided class. + """ + parsed_examples: list[ExampleClassT] = [] + if examples_path is None: + return parsed_examples + + with open(examples_path) as f: + unparsed_examples = json.load(f) + for example in unparsed_examples: + parsed_examples.append(example_class.parse_obj(example)) + return parsed_examples + + class QueryTemplate(BaseChatPromptTemplate, Generic[QueryT, ResultT]): """Query template for a structured processor.""" diff --git a/src/radfact/llm_utils/prompt_tasks.py b/src/radfact/llm_utils/prompt_tasks.py index 6067b67..5ca5d04 100644 --- a/src/radfact/llm_utils/prompt_tasks.py +++ b/src/radfact/llm_utils/prompt_tasks.py @@ -5,6 +5,8 @@ REPORT_TO_PHRASES_PARSING_TASK = "report_to_phrases" REPORT_TO_PHRASES_PROMPTS_DIR = get_prompts_dir(task=REPORT_TO_PHRASES_PARSING_TASK) +NEGATIVE_FILTERING_PARSING_TASK = "negative_filtering" +NEGATIVE_FILTERING_PROMPTS_DIR = get_prompts_dir(task=NEGATIVE_FILTERING_PARSING_TASK) NLI_PARSING_TASK = "nli" NLI_PROMPTS_DIR = get_prompts_dir(task=NLI_PARSING_TASK) @@ -34,6 +36,14 @@ class ReportToPhrasesTaskOptions(Enum): ) +class NegativeFilteringTaskOptions(Enum): + CT = PromptTask( + name=f"{ReportType.CT.value}_negative_filtering", + system_message_path=NEGATIVE_FILTERING_PROMPTS_DIR / ReportType.CT.value / "system_message.txt", + few_shot_examples_path=NEGATIVE_FILTERING_PROMPTS_DIR / ReportType.CT.value / "few_shot_examples.json", + ) + + class NLITaskOptions(Enum): CXR = PromptTask( name=f"{ReportType.CXR.value}_nli", diff --git a/src/radfact/llm_utils/report_to_phrases/processor.py b/src/radfact/llm_utils/report_to_phrases/processor.py index b88c5ab..228aced 100644 --- a/src/radfact/llm_utils/report_to_phrases/processor.py +++ b/src/radfact/llm_utils/report_to_phrases/processor.py @@ -7,7 +7,11 @@ from 
typing import Any import pandas as pd -from radfact.llm_utils.prompt_tasks import REPORT_TO_PHRASES_PARSING_TASK, ReportToPhrasesTaskOptions, ReportType +from radfact.llm_utils.prompt_tasks import ( + REPORT_TO_PHRASES_PARSING_TASK, + ReportToPhrasesTaskOptions, + ReportType, +) from omegaconf import DictConfig from radfact.llm_utils.engine.engine import LLMEngine, get_subfolder diff --git a/src/radfact/llm_utils/report_to_phrases/schema.py b/src/radfact/llm_utils/report_to_phrases/schema.py index 06579ff..b185efd 100644 --- a/src/radfact/llm_utils/report_to_phrases/schema.py +++ b/src/radfact/llm_utils/report_to_phrases/schema.py @@ -13,11 +13,12 @@ from radfact.llm_utils.processor.base_processor import BaseModelWithId -class SentenceWithRephrases(BaseModel): - """Dataclass for a sentence with rephrases. The source sentence is 'orig' and the rephrased sentences are 'new'.""" +class Rephrases(BaseModel): + new: list[str] + +class SentenceWithRephrases(Rephrases): orig: str - new: list[str] class ParsedReport(BaseModelWithId): @@ -71,6 +72,13 @@ def to_grounded_phrases_list(self, rephrased: bool = True) -> GroundedPhraseList return sequence +class RephrasesExample(BaseModel): + """A single example of a list of phrases before and after processing""" + + input: list[str] + output: Rephrases + + class PhraseParsingExample(BaseModel): """Dataclass for a single example.""" diff --git a/src/radfact/metric/radfact.py b/src/radfact/metric/radfact.py index 1da0b18..6828ed3 100644 --- a/src/radfact/metric/radfact.py +++ b/src/radfact/metric/radfact.py @@ -7,6 +7,7 @@ from dataclasses import asdict, replace from typing import Any, Iterable, Mapping +from radfact.llm_utils.prompt_tasks import ReportType import hydra import numpy as np import pandas as pd @@ -17,6 +18,7 @@ from radfact.llm_utils.nli.processor import get_report_nli_engine from radfact.llm_utils.nli.schema import EVState, NLISample from radfact.llm_utils.report_to_phrases.processor import FINDINGS_SECTION, 
StudyIdType, get_report_to_phrases_engine +from radfact.llm_utils.negative_filtering.processor import get_negative_filtering_engine, process_filtered_reports from radfact.llm_utils.report_to_phrases.schema import ParsedReport from radfact.metric.box_metrics import PRECISION, compute_box_metrics from radfact.metric.schema import ( @@ -208,13 +210,34 @@ def convert_narrative_text_to_phrases( ) engine = get_report_to_phrases_engine(self.llm_phrase_cfg, texts_as_str_df) parsed_reports: list[ParsedReport] = engine.run() - processed_texts = { - parsed.id: parsed.to_grounded_phrases_list() for parsed in parsed_reports if parsed.id is not None - } + if engine.aggregated_processor_stats is not None: self.meta_metrics.update( {f"{metric_prefix}/{k}": float(v) for k, v in engine.aggregated_processor_stats.items()} ) + + if self.llm_phrase_cfg.dataset.filter_negatives: + assert ( + self.llm_phrase_cfg.report_type == ReportType.CT.value + ), "Negative filtering is only supported for CT reports." + logger.info("Filtering negatives from previous run.") + engine = get_negative_filtering_engine(self.llm_phrase_cfg, parsed_reports) + engine.run() + parsed_reports, num_rewritten_sentences = process_filtered_reports(engine) + if engine.aggregated_processor_stats is not None: + self.meta_metrics.update( + { + f"negative_filtering_{metric_prefix}/{k}": float(v) + for k, v in engine.aggregated_processor_stats.items() + } + ) + self.meta_metrics[f"negative_filtering_{metric_prefix}/num_rewritten_sentences"] = ( + num_rewritten_sentences + ) + + processed_texts = { + parsed.id: parsed.to_grounded_phrases_list() for parsed in parsed_reports if parsed.id is not None + } if set(processed_texts.keys()) != set(texts.keys()): logger.warning( f"Key mismatch between processed and input texts. #input keys: {len(set(texts.keys()))}. 
#processed "
From 89739a82f9994af4a0b6c073d5a8bbec6dbc7d2d Mon Sep 17 00:00:00 2001
From: Cynthia Lo
Date: Tue, 13 Jan 2026 07:28:54 -0800
Subject: [PATCH 05/14] Add pytest and cli support

---
 configs/negative_filtering.yaml | 9 ++
 configs/report_to_phrases.yaml | 1 -
 src/radfact/cli/run_radfact.py | 22 ++++
 .../llm_utils/negative_filtering/processor.py | 39 +++----
 src/radfact/metric/radfact.py | 14 ++-
 tests/metric/test_radfact.py | 102 ++++++++++++++++++
 6 files changed, 165 insertions(+), 22 deletions(-)
 create mode 100644 configs/negative_filtering.yaml

diff --git a/configs/negative_filtering.yaml b/configs/negative_filtering.yaml
new file mode 100644
index 0000000..0d70080
--- /dev/null
+++ b/configs/negative_filtering.yaml
@@ -0,0 +1,9 @@
+# @package _global_
+
+defaults:
+  - default
+  - override endpoints: azure_chat_openai
+  - _self_
+
+processing:
+  index_col: sentence_id
\ No newline at end of file
diff --git a/configs/report_to_phrases.yaml b/configs/report_to_phrases.yaml
index 9ad0eaa..d61efab 100644
--- a/configs/report_to_phrases.yaml
+++ b/configs/report_to_phrases.yaml
@@ -8,4 +8,3 @@ defaults:
 dataset:
   name: reports_to_phrases
   csv_path: ""
-  filter_negatives: false
diff --git a/src/radfact/cli/run_radfact.py b/src/radfact/cli/run_radfact.py
index a08a23e..2355c4a 100644
--- a/src/radfact/cli/run_radfact.py
+++ b/src/radfact/cli/run_radfact.py
@@ -63,15 +63,19 @@ def get_candidates_and_references_from_json(
 def compute_radfact_scores(
     radfact_config_name: str | None,
     phrases_config_name: str | None,
+    filtering_config_name: str | None,
     candidates: InputDict,
     references: InputDict,
     is_narrative_text: bool,
     bootstrap_samples: int,
+    filter_negatives: bool,
 ) -> dict[str, float]:
     radfact_metric = RadFactMetric(
         nli_config_name=radfact_config_name,
         phrase_config_name=phrases_config_name,
+        filtering_config_name=filtering_config_name,
         is_narrative_text=is_narrative_text,
+        filter_negatives=filter_negatives,
     )
     if bootstrap_samples == 0:
         _, results = 
radfact_metric.compute_metric_score(candidates, references) @@ -118,6 +122,15 @@ def main() -> None: "initialization from the `configs` directory.", default=None, ) + parser.add_argument( + "--filtering_config_name", + type=str, + help="The name of the config file for negative finding filtering. We use the default config file but you can " + "provide a custom config. Make sure the config follows the same structure as `configs/negative_filtering.yaml` " + "and is saved in the `configs` directory. This is necessary for hydra initialization from the `configs` " + "directory.", + default=None, + ) parser.add_argument( "--output_dir", type=str, @@ -131,6 +144,11 @@ def main() -> None: "bootstrapping.", default=500, ) + parser.add_argument( + "--filter_negatives", + action="store_true", + help="Whether to filter negative findings from the parsed reports before computing the RadFact score.", + ) args = parser.parse_args() input_path = Path(args.input_path) @@ -138,7 +156,9 @@ def main() -> None: is_narrative_text = args.is_narrative_text radfact_config_name = args.radfact_config_name phrases_config_name = args.phrases_config_name + filtering_config_name = args.filtering_config_name bootstrap_samples = args.bootstrap_samples + filter_negatives = args.filter_negatives assert input_path.suffix in [".csv", ".json"], "Input file must be a csv or json file." 
assert input_path.suffix == ".csv" or not is_narrative_text, ( @@ -159,10 +179,12 @@ def main() -> None: results = compute_radfact_scores( radfact_config_name=radfact_config_name, phrases_config_name=phrases_config_name, + filtering_config_name=filtering_config_name, candidates=candidates, references=references, is_narrative_text=is_narrative_text, bootstrap_samples=bootstrap_samples, + filter_negatives=filter_negatives, ) print_fn = print_results if bootstrap_samples == 0 else print_bootstrap_results diff --git a/src/radfact/llm_utils/negative_filtering/processor.py b/src/radfact/llm_utils/negative_filtering/processor.py index e54d707..6b903b3 100644 --- a/src/radfact/llm_utils/negative_filtering/processor.py +++ b/src/radfact/llm_utils/negative_filtering/processor.py @@ -8,7 +8,6 @@ from pathlib import Path import pandas as pd -from radfact.llm_utils.report_to_phrases.processor import StudyIdType from radfact.llm_utils.prompt_tasks import NegativeFilteringTaskOptions, ReportType from omegaconf import DictConfig @@ -17,6 +16,10 @@ from radfact.llm_utils.report_to_phrases.schema import ParsedReport, Rephrases, RephrasesExample, SentenceWithRephrases from radfact.paths import OUTPUT_DIR +NEGATIVE_FILTERING_SUBFOLDER = "negative_report_filtering" +ORIG = "orig" +NEW = "new" + def get_negative_filtering_phrase_processor( report_type: ReportType, log_dir: Path | None = None @@ -43,22 +46,21 @@ def get_negative_filtering_phrase_processor( def load_filtering_queries_from_parsed_reports( reports: list[ParsedReport], + index_col: str, ) -> pd.DataFrame: """ Load queries for filtering from a list of parsed reports. Queries consist of all the newly parsed phrases from phrasification, along with metadata including the study ID and original phrase. :param reports: A list of ParsedReport objects. + :param index_col: The column containing the index :return: A list of queries. 
""" queries = [] - report_ids: dict[StudyIdType, int] = defaultdict(int) for report in reports: - for sentence in report.sentence_list: - assert report.id is not None - queries.append([f"{str(report.id)}_{report_ids[report.id]}", sentence.orig, sentence.new]) - report_ids[str(report.id)] += 1 - query_df = pd.DataFrame(queries, columns=["study_id", "orig", "new_phrases"]) + for i, sentence in enumerate(report.sentence_list): + queries.append([f"{report.id}_{i}", sentence.orig, sentence.new]) + query_df = pd.DataFrame(queries, columns=[index_col, ORIG, NEW]) return query_df @@ -69,11 +71,10 @@ def get_negative_filtering_engine(cfg: DictConfig, parsed_reports: list[ParsedRe :param cfg: The configuration for the processing engine. :return: The processing engine. """ - subfolder = cfg.dataset.name - root = OUTPUT_DIR / "negative_report_filtering" - output_folder = get_subfolder(root, subfolder) - final_output_folder = get_subfolder(root, subfolder) - log_dir = get_subfolder(root, "logs") + subfolder = NEGATIVE_FILTERING_SUBFOLDER + output_folder = get_subfolder(OUTPUT_DIR, subfolder) + final_output_folder = get_subfolder(OUTPUT_DIR, subfolder) + log_dir = get_subfolder(OUTPUT_DIR, "logs") report_type_value = cfg.get("report_type") try: @@ -83,25 +84,26 @@ def get_negative_filtering_engine(cfg: DictConfig, parsed_reports: list[ParsedRe f"Invalid report_type '{report_type_value}'. 
Valid options are: {[rt.value for rt in ReportType]}" ) from e - query_df = load_filtering_queries_from_parsed_reports(parsed_reports) + query_df = load_filtering_queries_from_parsed_reports(parsed_reports, cfg.processing.index_col) negative_filtering_processor = get_negative_filtering_phrase_processor(report_type=report_type, log_dir=log_dir) engine = LLMEngine( cfg=cfg, processor=negative_filtering_processor, dataset_df=query_df, - row_to_query_fn=lambda row: row["new_phrases"], + row_to_query_fn=lambda row: row[NEW], progress_output_folder=output_folder, final_output_folder=final_output_folder, ) return engine -def process_filtered_reports(engine: LLMEngine) -> tuple[list[ParsedReport], int]: +def process_filtered_reports(engine: LLMEngine, cfg: DictConfig) -> tuple[list[ParsedReport], int]: """ Process the filtered reports using the provided engine. :param engine: The LLMEngine used for processing. + :param cfg: The configuration for negative filtering processing. :return: A tuple containing a list of ParsedReport objects and the number of rewritten sentences. 
""" outputs = engine.return_raw_outputs @@ -109,14 +111,15 @@ def process_filtered_reports(engine: LLMEngine) -> tuple[list[ParsedReport], int parsed_report_dict = defaultdict(list) num_rewritten_sentences = 0 + for k in outputs.keys(): rephrases = outputs[k] metadata_df = metadata[k].df for idx, row in metadata_df.iterrows(): - study_id = row["study_id"].split("_")[0] - orig = row["orig"] - unfiltered_phrases = set(row["new_phrases"]) + study_id = row[cfg.processing.index_col].rsplit("_", 1)[0] + orig = row[ORIG] + unfiltered_phrases = set(row[NEW]) filtered_phrases = set(rephrases[idx].new) if not filtered_phrases.issubset(unfiltered_phrases): diff --git a/src/radfact/metric/radfact.py b/src/radfact/metric/radfact.py index 6828ed3..1e408ad 100644 --- a/src/radfact/metric/radfact.py +++ b/src/radfact/metric/radfact.py @@ -46,6 +46,8 @@ RADFACT_CONFIG = "radfact.yaml" # The YAML config file for the phrase processor in this setting. REPORT_TO_PHRASES_CONFIG = "report_to_phrases.yaml" +# The YAML config file for the negative filtering processor in this setting. +NEGATIVE_FILTERING_CONFIG = "negative_filtering.yaml" def init_hydra_config(config_name: str) -> DictConfig: @@ -70,9 +72,11 @@ def __init__( self, nli_config_name: str | None = None, phrase_config_name: str | None = None, + filtering_config_name: str | None = None, image_size: int = 224, box_precision_threshold: float = 0.5, is_narrative_text: bool = False, + filter_negatives: bool = False, ) -> None: """ Initializes the RadFactMetric with the necessary configurations. We need to know the image size so we can @@ -88,13 +92,17 @@ def __init__( findings section. We need to convert this to lists GroundedPhrase before conducting entailment verification. If False, we are running the metric on grounded reports, where the phrases are already in the correct format for entailment verification. 
+ :param filter_negatives: If True, we will filter negative findings from the parsed reports before computing + the RadFact score. """ self.llm_nli_cfg = init_hydra_config(nli_config_name or RADFACT_CONFIG) self.llm_phrase_cfg = init_hydra_config(phrase_config_name or REPORT_TO_PHRASES_CONFIG) + self.llm_negative_filtering_cfg = init_hydra_config(filtering_config_name or NEGATIVE_FILTERING_CONFIG) self.image_size = image_size self.box_precision_threshold = box_precision_threshold self.is_narrative_text = is_narrative_text self.meta_metrics: dict[str, float] = {} # Metrics about the metric, derived from processors. Not per-sample. + self.filter_negatives = filter_negatives def _are_boxes_entailed(self, boxes: list[NormalizedBox] | None, evidence_boxes: list[NormalizedBox]) -> bool: """ @@ -216,14 +224,14 @@ def convert_narrative_text_to_phrases( {f"{metric_prefix}/{k}": float(v) for k, v in engine.aggregated_processor_stats.items()} ) - if self.llm_phrase_cfg.dataset.filter_negatives: + if self.filter_negatives: assert ( self.llm_phrase_cfg.report_type == ReportType.CT.value ), "Negative filtering is only supported for CT reports." logger.info("Filtering negatives from previous run.") - engine = get_negative_filtering_engine(self.llm_phrase_cfg, parsed_reports) + engine = get_negative_filtering_engine(self.llm_negative_filtering_cfg, parsed_reports) engine.run() - parsed_reports, num_rewritten_sentences = process_filtered_reports(engine) + parsed_reports, num_rewritten_sentences = process_filtered_reports(engine, self.llm_negative_filtering_cfg) if engine.aggregated_processor_stats is not None: self.meta_metrics.update( { diff --git a/tests/metric/test_radfact.py b/tests/metric/test_radfact.py index b631d72..f54435e 100644 --- a/tests/metric/test_radfact.py +++ b/tests/metric/test_radfact.py @@ -3,11 +3,14 @@ # Licensed under the MIT License (MIT). See LICENSE in the repo root for license information. 
# ------------------------------------------------------------------------------------------ +import copy import shutil from pathlib import Path import mock import pandas as pd +from radfact.llm_utils.report_to_phrases.schema import Rephrases +from radfact.llm_utils.engine.data_subset import DataSubset from radfact.llm_utils.report_to_phrases.processor import get_report_to_phrases_engine from radfact.metric.radfact import REPORT_TO_PHRASES_CONFIG, init_hydra_config from radfact.llm_utils.nli.processor import get_ev_processor_singlephrase @@ -285,6 +288,51 @@ def get_mock_phrase_engine(llm_phrase_cfg: DictConfig, df: pd.DataFrame) -> mock return mock_phrase_engine +def get_mock_filtering_engine(llm_negative_filtering_cfg: DictConfig, parsed_reports: list[ParsedReport]) -> mock.Mock: + mock_filtering_engine = mock.Mock() + new_parsed_reports = copy.deepcopy(parsed_reports) + + # Inject a rewritten phrase to simulate filtering mistake + new_parsed_reports[0].sentence_list[0].new.append("Dummy filtered phrase") + mock_filtering_engine.return_raw_outputs = { + "endpoint_1": [Rephrases(new=report.sentence_list[0].new) for report in new_parsed_reports] + } + if parsed_reports[0].sentence_list[0].orig == "The cat The dog The bird The rabbit": + mock_filtering_engine.return_dataset_subsets = { + "endpoint_1": DataSubset( + start_index=0, + end_index=1, + index_col="sentence_id", + output_folder=Path("output"), + df=pd.DataFrame( + { + "sentence_id": ["study1_0"], + "orig": ["The cat The dog The bird The rabbit"], + "new": [["The cat", "The dog", "The bird", "The rabbit"]], + } + ), + ) + } + else: + mock_filtering_engine.return_dataset_subsets = { + "endpoint_1": DataSubset( + start_index=0, + end_index=1, + index_col="sentence_id", + output_folder=Path("output"), + df=pd.DataFrame( + { + "sentence_id": ["study1_0"], + "orig": ["The cat The dog The bird The shark"], + "new": [["The cat", "The dog", "The bird", "The shark"]], + } + ), + ) + } + 
mock_filtering_engine.aggregated_processor_stats = {'num_failures': 0, 'num_success': 1}
+    return mock_filtering_engine
+
+
 def test_nli_processing_with_endpoint_and_report_to_phrases(mock_nli_engine: mock.Mock) -> None:
     """Test that the RadFact metric works end-to-end, with a mocked engine including report-to-phrases processing."""
     progress_subfolder = Path(LLMEngine.OUTPUT_FILES_PREFIX) / RADFACT_SUBFOLDER
@@ -325,6 +373,60 @@ def test_nli_processing_with_endpoint_and_report_to_phrases(mock_nli_engine: moc
     assert_equal(actual=details, desired=expected_details, verbose=True)
 
 
+def test_nli_processing_with_negative_filtering(mock_nli_engine: mock.Mock) -> None:
+    """Test that the RadFact metric works end-to-end, with mocked engines, including report-to-phrases
+    processing and negative filtering.
+    """
+    progress_subfolder = Path(LLMEngine.OUTPUT_FILES_PREFIX) / RADFACT_SUBFOLDER
+    shutil.rmtree(progress_subfolder, ignore_errors=True)
+    metric = RadFactMetric(is_narrative_text=True, filter_negatives=True)
+    with mock.patch('radfact.metric.radfact.get_report_nli_engine', return_value=mock_nli_engine):
+        with mock.patch('radfact.metric.radfact.get_report_to_phrases_engine', side_effect=get_mock_phrase_engine):
+            with mock.patch(
+                'radfact.metric.radfact.get_negative_filtering_engine',
+                side_effect=get_mock_filtering_engine,
+            ):
+                result, details = metric.compute_metric_score(candidates_narrative, references_narrative)
+
+    assert isinstance(result, float)
+    assert result == 0.75
+    assert isinstance(details, dict)
+
+    expected_details = {
+        "logical_precision": 0.75,
+        "logical_recall": 0.75,
+        "spatial_precision": 0.0,
+        "spatial_recall": 0.0,
+        "grounding_precision": 0.0,
+        "grounding_recall": 0.0,
+        "num_candidate_phrases": 4,
+        "num_reference_phrases": 4,
+        "num_candidate_phrases_with_boxes": 0,
+        "num_reference_phrases_with_boxes": 0,
+        "logical_f1": 0.75,
+        "spatial_f1": 0.0,
+        "grounding_f1": 0.0,
+        "num_samples": 1,
+        "num_llm_failures": 0,
+        
"num_llm_success": 8, + "num_llm_phrase_rewrites": 0, + "num_invalid_processed_samples": 0, + "report_to_phrases/generations/num_failures": 0, + "report_to_phrases/generations/num_success": 1, + 'negative_filtering_report_to_phrases/generations/num_failures': 0.0, + 'negative_filtering_report_to_phrases/generations/num_success': 1.0, + 'negative_filtering_report_to_phrases/generations/num_rewritten_sentences': 1, + "report_to_phrases/ground_truth/num_failures": 0, + "report_to_phrases/ground_truth/num_success": 1, + 'negative_filtering_report_to_phrases/ground_truth/num_failures': 0.0, + 'negative_filtering_report_to_phrases/ground_truth/num_success': 1.0, + 'negative_filtering_report_to_phrases/ground_truth/num_rewritten_sentences': 1, + "report_to_phrases/num_dropped_candidates": 0, + "report_to_phrases/num_dropped_references": 0, + } + assert_equal(actual=details, desired=expected_details, verbose=True) + + def test_convert_input_to_multimodal() -> None: """ Test that we can convert the input to a multimodal grounded sequence correctly. From 308e18317a09e6952f82495c851bbca4afc47cc6 Mon Sep 17 00:00:00 2001 From: Cynthia Lo Date: Tue, 13 Jan 2026 07:49:35 -0800 Subject: [PATCH 06/14] Add report type cli support --- README.md | 4 +++- configs/default.yaml | 2 -- src/radfact/cli/run_radfact.py | 12 ++++++++++++ src/radfact/llm_utils/nli/processor.py | 13 +++++-------- .../llm_utils/report_to_phrases/processor.py | 15 +++++---------- src/radfact/metric/radfact.py | 10 ++++++++-- tests/metric/test_radfact.py | 11 +---------- 7 files changed, 34 insertions(+), 33 deletions(-) diff --git a/README.md b/README.md index 79da37b..63c76b8 100644 --- a/README.md +++ b/README.md @@ -157,6 +157,8 @@ options: Path to the directory where the results will be saved as a json file. --bootstrap_samples BOOTSTRAP_SAMPLES Number of bootstrap samples to use for computing the confidence intervals. Set to 0 to disable bootstrapping. 
+ --report_type {cxr,ct} + Type of report: 'cxr' for chest x-ray reports or 'ct' for CT reports. ``` - for non-grounded reports (findings generation narrative text): @@ -178,7 +180,7 @@ The script computes confidence intervals for the metrics using bootstrapping. Th ⚠️**WARNING**: Some queries may fail due to the endpoint limitations (timeouts, rate limits, etc.). When the LLM performing entailment verification fails, we **set these examples as not-entailed by default**. If this occurs in a significant number of cases, the results will not be reliable. The final metrics dict contains the number of such skipped queries under the key `num_llm_failures`. The script will print the number of skipped queries at the end of the run, and store these in the `skipped` directroy under the run id folder. You will also see a warning message in the logs for each failed query. `WARNING: No response for example {query_id}. Setting as NOT ENTAILED`. ### Supporting Multiple Report Rypes -RadFact supports different report types through the `report_type` field in [`configs/default.yaml`](configs/default.yaml). Currently supported options are: +RadFact supports different report types through the `report_type` field in the `RadFactMetric` class. Currently supported options are: - `cxr` - Chest X-ray reports (default) - `ct` - CT scan reports diff --git a/configs/default.yaml b/configs/default.yaml index abffd5e..28c4869 100644 --- a/configs/default.yaml +++ b/configs/default.yaml @@ -21,8 +21,6 @@ processing: end_index: null output_filename: "outputs.json" -report_type: "cxr" # cxr or ct - # The type of cache that should be set for langchain. This can be either "redis" or "sqlite". 
# Sqlite cache is useful for local development, it will be written to ~/.langchain.db # Redis cache is useful to share state across many evaluation runs in AzureML diff --git a/src/radfact/cli/run_radfact.py b/src/radfact/cli/run_radfact.py index a08a23e..f2140e9 100644 --- a/src/radfact/cli/run_radfact.py +++ b/src/radfact/cli/run_radfact.py @@ -10,6 +10,7 @@ import pandas as pd +from radfact.llm_utils.prompt_tasks import ReportType from radfact.data_utils.grounded_phrase_list import GroundedPhraseList from radfact.llm_utils.report_to_phrases.processor import StudyIdType from radfact.metric.bootstrapping import MetricBootstrapper @@ -66,12 +67,14 @@ def compute_radfact_scores( candidates: InputDict, references: InputDict, is_narrative_text: bool, + report_type: ReportType, bootstrap_samples: int, ) -> dict[str, float]: radfact_metric = RadFactMetric( nli_config_name=radfact_config_name, phrase_config_name=phrases_config_name, is_narrative_text=is_narrative_text, + report_type=report_type, ) if bootstrap_samples == 0: _, results = radfact_metric.compute_metric_score(candidates, references) @@ -131,6 +134,13 @@ def main() -> None: "bootstrapping.", default=500, ) + parser.add_argument( + "--report_type", + type=str, + choices=["cxr", "ct"], + help="Type of report: 'cxr' for chest x-ray reports or 'ct' for CT reports.", + default="cxr", + ) args = parser.parse_args() input_path = Path(args.input_path) @@ -139,6 +149,7 @@ def main() -> None: radfact_config_name = args.radfact_config_name phrases_config_name = args.phrases_config_name bootstrap_samples = args.bootstrap_samples + report_type = ReportType(args.report_type) assert input_path.suffix in [".csv", ".json"], "Input file must be a csv or json file." 
assert input_path.suffix == ".csv" or not is_narrative_text, ( @@ -163,6 +174,7 @@ def main() -> None: references=references, is_narrative_text=is_narrative_text, bootstrap_samples=bootstrap_samples, + report_type=report_type, ) print_fn = print_results if bootstrap_samples == 0 else print_bootstrap_results diff --git a/src/radfact/llm_utils/nli/processor.py b/src/radfact/llm_utils/nli/processor.py index 6228f7e..59c5a0f 100644 --- a/src/radfact/llm_utils/nli/processor.py +++ b/src/radfact/llm_utils/nli/processor.py @@ -188,16 +188,13 @@ def format_row_to_nli_query_sample(row: "pd.Series[Any]") -> NLIQuerySample: def get_report_nli_engine( - cfg: DictConfig, candidates: dict[str, GroundedPhraseList], references: dict[str, GroundedPhraseList] + cfg: DictConfig, + candidates: dict[str, GroundedPhraseList], + references: dict[str, GroundedPhraseList], + report_type: ReportType = ReportType.CXR, ) -> LLMEngine: output_folder = get_subfolder(root=OUTPUT_DIR, subfolder=RADFACT_SUBFOLDER) - report_type_value = cfg.get("report_type") - try: - report_type = ReportType(report_type_value) - except ValueError as e: - raise ValueError( - f"Invalid report_type '{report_type_value}'. 
Valid options are: {[rt.value for rt in ReportType]}" - ) from e + nli_report_processor = ReportGroundingNLIProcessor( report_type=report_type, format_query_fn=format_row_to_nli_query_sample ) diff --git a/src/radfact/llm_utils/report_to_phrases/processor.py b/src/radfact/llm_utils/report_to_phrases/processor.py index b88c5ab..97c0acb 100644 --- a/src/radfact/llm_utils/report_to_phrases/processor.py +++ b/src/radfact/llm_utils/report_to_phrases/processor.py @@ -49,13 +49,15 @@ def get_findings_from_row(row: "pd.Series[Any]") -> str: return findings -def get_report_to_phrases_engine(cfg: DictConfig, dataset_df: pd.DataFrame) -> LLMEngine: +def get_report_to_phrases_engine( + cfg: DictConfig, dataset_df: pd.DataFrame, report_type: ReportType = ReportType.CXR +) -> LLMEngine: """ Create the processing engine for converting reports to phrases. :param cfg: The configuration for the processing engine. :param dataset_df: The dataset DataFrame. - :param subfolder: The subfolder to save the processing output. + :param report_type: The type of report, e.g., CXR or CT. :return: The processing engine. """ subfolder = cfg.dataset.name @@ -63,14 +65,7 @@ def get_report_to_phrases_engine(cfg: DictConfig, dataset_df: pd.DataFrame) -> L output_folder = get_subfolder(root, subfolder) final_output_folder = get_subfolder(root, subfolder) log_dir = get_subfolder(root, "logs") - - report_type_value = cfg.get("report_type") - try: - report_type = ReportType(report_type_value) - except ValueError as e: - raise ValueError( - f"Invalid report_type '{report_type_value}'. 
Valid options are: {[rt.value for rt in ReportType]}" - ) from e + breakpoint() report_to_phrases_processor = get_report_to_phrases_processor(report_type=report_type, log_dir=log_dir) id_col = cfg.processing.index_col diff --git a/src/radfact/metric/radfact.py b/src/radfact/metric/radfact.py index 1da0b18..0dd93e5 100644 --- a/src/radfact/metric/radfact.py +++ b/src/radfact/metric/radfact.py @@ -7,6 +7,7 @@ from dataclasses import asdict, replace from typing import Any, Iterable, Mapping +from radfact.llm_utils.prompt_tasks import ReportType import hydra import numpy as np import pandas as pd @@ -71,6 +72,7 @@ def __init__( image_size: int = 224, box_precision_threshold: float = 0.5, is_narrative_text: bool = False, + report_type: ReportType = ReportType.CXR, ) -> None: """ Initializes the RadFactMetric with the necessary configurations. We need to know the image size so we can @@ -86,9 +88,11 @@ def __init__( findings section. We need to convert this to lists GroundedPhrase before conducting entailment verification. If False, we are running the metric on grounded reports, where the phrases are already in the correct format for entailment verification. + :param report_type: The type of report, e.g. 
CXR or CT """ self.llm_nli_cfg = init_hydra_config(nli_config_name or RADFACT_CONFIG) self.llm_phrase_cfg = init_hydra_config(phrase_config_name or REPORT_TO_PHRASES_CONFIG) + self.report_type = report_type self.image_size = image_size self.box_precision_threshold = box_precision_threshold self.is_narrative_text = is_narrative_text @@ -206,7 +210,7 @@ def convert_narrative_text_to_phrases( texts_as_str_df = pd.DataFrame( {id_col: study_id, FINDINGS_SECTION: texts_as_str[study_id]} for study_id in texts_as_str.keys() ) - engine = get_report_to_phrases_engine(self.llm_phrase_cfg, texts_as_str_df) + engine = get_report_to_phrases_engine(self.llm_phrase_cfg, texts_as_str_df, self.report_type) parsed_reports: list[ParsedReport] = engine.run() processed_texts = { parsed.id: parsed.to_grounded_phrases_list() for parsed in parsed_reports if parsed.id is not None @@ -304,7 +308,9 @@ def compute_results_per_sample(self, candidates: InputDict, references: InputDic candidates_str_ids = {str(study_id): sequence for study_id, sequence in candidates_mm.items()} references_str_ids = {str(study_id): sequence for study_id, sequence in references_mm.items()} - llm_ev_engine = get_report_nli_engine(self.llm_nli_cfg, candidates_str_ids, references_str_ids) + llm_ev_engine = get_report_nli_engine( + self.llm_nli_cfg, candidates_str_ids, references_str_ids, self.report_type + ) processed_samples: list[NLISample] = llm_ev_engine.run() if llm_ev_engine.aggregated_processor_stats: self.meta_metrics.update(llm_ev_engine.aggregated_processor_stats) diff --git a/tests/metric/test_radfact.py b/tests/metric/test_radfact.py index b631d72..776614f 100644 --- a/tests/metric/test_radfact.py +++ b/tests/metric/test_radfact.py @@ -8,8 +8,6 @@ import mock import pandas as pd -from radfact.llm_utils.report_to_phrases.processor import get_report_to_phrases_engine -from radfact.metric.radfact import REPORT_TO_PHRASES_CONFIG, init_hydra_config from radfact.llm_utils.nli.processor import 
get_ev_processor_singlephrase from radfact.paths import OUTPUT_DIR from radfact.llm_utils.prompt_tasks import ReportType @@ -255,7 +253,7 @@ def test_nli_processing_with_endpoint(mock_nli_engine: mock.Mock) -> None: } -def get_mock_phrase_engine(llm_phrase_cfg: DictConfig, df: pd.DataFrame) -> mock.Mock: +def get_mock_phrase_engine(llm_phrase_cfg: DictConfig, df: pd.DataFrame, report_type: ReportType) -> mock.Mock: mock_phrase_engine = mock.Mock() if df["FINDINGS"].values[0] == "The cat The dog The bird The rabbit": mock_phrase_engine.run.return_value = [ @@ -475,10 +473,3 @@ def test_report_type_nli(report_type_value: str) -> None: for single_phrase_sample in one_way_dict.values(): few_shot_examples_single_phrase.extend(single_phrase_sample) assert few_shot_examples_single_phrase == processor.query_template.examples - - -def test_invalid_report_type() -> None: - config = init_hydra_config(REPORT_TO_PHRASES_CONFIG) - config.report_type = "invalid_type" - with pytest.raises(ValueError): - get_report_to_phrases_engine(cfg=config, dataset_df=pd.DataFrame({}, columns=["study_id", "FINDINGS"])) From 9a43812d4ebe43c999301a5ebab2840542e4931a Mon Sep 17 00:00:00 2001 From: Cynthia Lo Date: Tue, 13 Jan 2026 07:57:50 -0800 Subject: [PATCH 07/14] Remove breakpoint --- src/radfact/llm_utils/report_to_phrases/processor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/radfact/llm_utils/report_to_phrases/processor.py b/src/radfact/llm_utils/report_to_phrases/processor.py index 97c0acb..39d623f 100644 --- a/src/radfact/llm_utils/report_to_phrases/processor.py +++ b/src/radfact/llm_utils/report_to_phrases/processor.py @@ -65,7 +65,6 @@ def get_report_to_phrases_engine( output_folder = get_subfolder(root, subfolder) final_output_folder = get_subfolder(root, subfolder) log_dir = get_subfolder(root, "logs") - breakpoint() report_to_phrases_processor = get_report_to_phrases_processor(report_type=report_type, log_dir=log_dir) id_col = cfg.processing.index_col From 
4c811a967026f6fa4e6577e72e16f1136d983af7 Mon Sep 17 00:00:00 2001 From: Cynthia Lo Date: Thu, 15 Jan 2026 06:35:58 -0800 Subject: [PATCH 08/14] Upgrade ubuntu in ci checks --- .github/workflows/pr-checks.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pr-checks.yaml b/.github/workflows/pr-checks.yaml index ba7a8dc..a784c56 100644 --- a/.github/workflows/pr-checks.yaml +++ b/.github/workflows/pr-checks.yaml @@ -21,7 +21,7 @@ permissions: jobs: run_code_quality: - runs-on: ubuntu-20.04 + runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 @@ -50,7 +50,7 @@ jobs: shell: bash -el {0} run_pytest: - runs-on: ubuntu-20.04 + runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 with: From 07640cbb14768d9c1943c3fb087469fbaf7716a7 Mon Sep 17 00:00:00 2001 From: Cynthia Lo Date: Thu, 15 Jan 2026 09:22:39 -0800 Subject: [PATCH 09/14] Rename rephrases to phrase list --- .../llm_utils/negative_filtering/processor.py | 17 +++++++++----- .../prompts/ct/few_shot_examples.json | 22 +++++++++---------- .../llm_utils/report_to_phrases/schema.py | 11 +++++----- tests/metric/test_radfact.py | 4 ++-- 4 files changed, 29 insertions(+), 25 deletions(-) diff --git a/src/radfact/llm_utils/negative_filtering/processor.py b/src/radfact/llm_utils/negative_filtering/processor.py index 4b02913..3308510 100644 --- a/src/radfact/llm_utils/negative_filtering/processor.py +++ b/src/radfact/llm_utils/negative_filtering/processor.py @@ -13,7 +13,12 @@ from radfact.llm_utils.engine.engine import LLMEngine, get_subfolder from radfact.llm_utils.processor.structured_processor import StructuredProcessor, parse_examples_from_json -from radfact.llm_utils.report_to_phrases.schema import ParsedReport, Rephrases, RephrasesExample, SentenceWithRephrases +from radfact.llm_utils.report_to_phrases.schema import ( + ParsedReport, + PhraseList, + PhraseListExample, + SentenceWithRephrases, +) from radfact.paths import OUTPUT_DIR NEGATIVE_FILTERING_SUBFOLDER = 
"negative_report_filtering" @@ -23,7 +28,7 @@ def get_negative_filtering_phrase_processor( report_type: ReportType, log_dir: Path | None = None -) -> StructuredProcessor[list[str], Rephrases]: +) -> StructuredProcessor[list[str], PhraseList]: """Return a processor for filtering negative findings from a list of phrases. :param report_type: The type of report, e.g., "ReportType.CXR" or "ReportType.CT". @@ -32,10 +37,10 @@ def get_negative_filtering_phrase_processor( """ task = NegativeFilteringTaskOptions[report_type.name].value system_prompt = task.system_message_path.read_text() - few_shot_examples = parse_examples_from_json(task.few_shot_examples_path, RephrasesExample) + few_shot_examples = parse_examples_from_json(task.few_shot_examples_path, PhraseListExample) processor = StructuredProcessor( query_type=list[str], - result_type=Rephrases, + result_type=PhraseList, system_prompt=system_prompt, format_query_fn=lambda x: json.dumps(x), few_shot_examples=few_shot_examples, @@ -110,14 +115,14 @@ def process_filtered_reports(engine: LLMEngine, cfg: DictConfig) -> tuple[list[P num_rewritten_sentences = 0 for k in outputs.keys(): - rephrases = outputs[k] + phrase_list = outputs[k] metadata_df = metadata[k].df for idx, row in metadata_df.iterrows(): study_id = row[cfg.processing.index_col].rsplit("_", 1)[0] orig = row[ORIG] unfiltered_phrases = set(row[NEW]) - filtered_phrases = set(rephrases[idx].new) + filtered_phrases = set(phrase_list[idx].phrases) if not filtered_phrases.issubset(unfiltered_phrases): rewritten_phrases = filtered_phrases - unfiltered_phrases diff --git a/src/radfact/llm_utils/negative_filtering/prompts/ct/few_shot_examples.json b/src/radfact/llm_utils/negative_filtering/prompts/ct/few_shot_examples.json index ac55c6a..cfd6fd3 100644 --- a/src/radfact/llm_utils/negative_filtering/prompts/ct/few_shot_examples.json +++ b/src/radfact/llm_utils/negative_filtering/prompts/ct/few_shot_examples.json @@ -8,7 +8,7 @@ "Mild recessions are observed in the upper 
lobe of the left lung." ], "output": { - "new": [ + "phrases": [ "There is a relative opacity observed in the left mid-to-lower lung, possibly located in the lingula.", "Mild recessions are observed in the upper lobe of the left lung." ] @@ -22,11 +22,10 @@ "No signs of pleural effusions." ], "output": { - - "new": [ - "The left side still shows mediastinal shifting and volume loss." - ] - } + "phrases": [ + "The left side still shows mediastinal shifting and volume loss." + ] + } }, { "input": [ "There is a moderate right pleural effusion.", @@ -36,11 +35,10 @@ "The mediastinal contours are unremarkable." ], "output": { - - "new": [ - "There is a moderate right pleural effusion.", - "The radiograph shows linear opacities in the right middle lobe and left lower lobe, indicating atelectasis." - ] - } + "phrases": [ + "There is a moderate right pleural effusion.", + "The radiograph shows linear opacities in the right middle lobe and left lower lobe, indicating atelectasis." + ] + } } ] \ No newline at end of file diff --git a/src/radfact/llm_utils/report_to_phrases/schema.py b/src/radfact/llm_utils/report_to_phrases/schema.py index b185efd..0a9de94 100644 --- a/src/radfact/llm_utils/report_to_phrases/schema.py +++ b/src/radfact/llm_utils/report_to_phrases/schema.py @@ -13,12 +13,13 @@ from radfact.llm_utils.processor.base_processor import BaseModelWithId -class Rephrases(BaseModel): - new: list[str] +class PhraseList(BaseModel): + phrases: list[str] -class SentenceWithRephrases(Rephrases): +class SentenceWithRephrases(BaseModel): orig: str + new: list[str] class ParsedReport(BaseModelWithId): @@ -72,11 +73,11 @@ def to_grounded_phrases_list(self, rephrased: bool = True) -> GroundedPhraseList return sequence -class RephrasesExample(BaseModel): +class PhraseListExample(BaseModel): """A single example of a list of phrases before and after processing""" input: list[str] - output: Rephrases + output: PhraseList class PhraseParsingExample(BaseModel): diff --git 
a/tests/metric/test_radfact.py b/tests/metric/test_radfact.py index 43809b3..ba31f61 100644 --- a/tests/metric/test_radfact.py +++ b/tests/metric/test_radfact.py @@ -10,7 +10,7 @@ import mock import pandas as pd -from radfact.llm_utils.report_to_phrases.schema import Rephrases +from radfact.llm_utils.report_to_phrases.schema import PhraseList from radfact.llm_utils.engine.data_subset import DataSubset from radfact.llm_utils.nli.processor import get_ev_processor_singlephrase from radfact.paths import OUTPUT_DIR @@ -302,7 +302,7 @@ def get_mock_filtering_engine( # Inject a rewritten phrase to simulate filtering mistake new_parsed_reports[0].sentence_list[0].new.append("Dummy filtered phrase") mock_filtering_engine.return_raw_outputs = { - "endpoint_1": [Rephrases(new=report.sentence_list[0].new) for report in new_parsed_reports] + "endpoint_1": [PhraseList(phrases=report.sentence_list[0].new) for report in new_parsed_reports] } if parsed_reports[0].sentence_list[0].orig == "The cat The dog The bird The rabbit": mock_filtering_engine.return_dataset_subsets = { From 3a31d660baa30b891f1719001b399a2b0ac9f866 Mon Sep 17 00:00:00 2001 From: Cynthia Lo Date: Thu, 15 Jan 2026 09:27:03 -0800 Subject: [PATCH 10/14] Minor fixes --- src/radfact/llm_utils/negative_filtering/processor.py | 5 ++--- src/radfact/llm_utils/report_to_phrases/schema.py | 2 ++ 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/radfact/llm_utils/negative_filtering/processor.py b/src/radfact/llm_utils/negative_filtering/processor.py index 3308510..fb990c8 100644 --- a/src/radfact/llm_utils/negative_filtering/processor.py +++ b/src/radfact/llm_utils/negative_filtering/processor.py @@ -8,7 +8,7 @@ from pathlib import Path import pandas as pd -from radfact.llm_utils.prompt_tasks import NegativeFilteringTaskOptions, ReportType +from radfact.llm_utils.prompt_tasks import NEGATIVE_FILTERING_PARSING_TASK, NegativeFilteringTaskOptions, ReportType from omegaconf import DictConfig from 
radfact.llm_utils.engine.engine import LLMEngine, get_subfolder @@ -21,7 +21,6 @@ ) from radfact.paths import OUTPUT_DIR -NEGATIVE_FILTERING_SUBFOLDER = "negative_report_filtering" ORIG = "orig" NEW = "new" @@ -81,7 +80,7 @@ def get_negative_filtering_engine( :param report_type: The type of report, e.g., CT. :return: The processing engine. """ - OUTPUT_FOLDER = OUTPUT_DIR / NEGATIVE_FILTERING_SUBFOLDER + OUTPUT_FOLDER = OUTPUT_DIR / NEGATIVE_FILTERING_PARSING_TASK output_folder = get_subfolder(OUTPUT_FOLDER, subfolder_prefix) final_output_folder = get_subfolder(OUTPUT_FOLDER, subfolder_prefix) log_dir = get_subfolder(OUTPUT_FOLDER, "logs") diff --git a/src/radfact/llm_utils/report_to_phrases/schema.py b/src/radfact/llm_utils/report_to_phrases/schema.py index 0a9de94..baee819 100644 --- a/src/radfact/llm_utils/report_to_phrases/schema.py +++ b/src/radfact/llm_utils/report_to_phrases/schema.py @@ -14,10 +14,12 @@ class PhraseList(BaseModel): + """Dataclass for a list of phrases.""" phrases: list[str] class SentenceWithRephrases(BaseModel): + """Dataclass for a sentence with rephrases. 
The source sentence is 'orig' and the rephrased sentences are 'new'.""" orig: str new: list[str] From 12d7e298c72be0ae2dbc3bd5024b91de847fd000 Mon Sep 17 00:00:00 2001 From: Cynthia Lo Date: Fri, 16 Jan 2026 09:08:16 +0000 Subject: [PATCH 11/14] Update src/radfact/llm_utils/negative_filtering/processor.py Co-authored-by: Kenza Bouzid <37396332+kenza-bouzid@users.noreply.github.com> Signed-off-by: Cynthia Lo --- src/radfact/llm_utils/negative_filtering/processor.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/radfact/llm_utils/negative_filtering/processor.py b/src/radfact/llm_utils/negative_filtering/processor.py index 3308510..f84697c 100644 --- a/src/radfact/llm_utils/negative_filtering/processor.py +++ b/src/radfact/llm_utils/negative_filtering/processor.py @@ -57,6 +57,7 @@ def load_filtering_queries_from_parsed_reports( Load queries for filtering from a list of parsed reports. Queries consist of all the newly parsed phrases from phrasification, along with metadata including the study ID and original phrase. + :param reports: A list of ParsedReport objects. :param index_col: The column containing the index :return: A list of queries. From e081497282ab0f8af97e59e7624a00c8479fe2c1 Mon Sep 17 00:00:00 2001 From: Cynthia Lo Date: Fri, 16 Jan 2026 09:08:26 +0000 Subject: [PATCH 12/14] Update src/radfact/llm_utils/negative_filtering/processor.py Co-authored-by: Kenza Bouzid <37396332+kenza-bouzid@users.noreply.github.com> Signed-off-by: Cynthia Lo --- src/radfact/llm_utils/negative_filtering/processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/radfact/llm_utils/negative_filtering/processor.py b/src/radfact/llm_utils/negative_filtering/processor.py index f84697c..f930523 100644 --- a/src/radfact/llm_utils/negative_filtering/processor.py +++ b/src/radfact/llm_utils/negative_filtering/processor.py @@ -60,7 +60,7 @@ def load_filtering_queries_from_parsed_reports( :param reports: A list of ParsedReport objects. 
:param index_col: The column containing the index - :return: A list of queries. + :return: A dataframe of queries. """ queries = [] for report in reports: From 47ec6af292dded32b4228e9c5bbb47383d3516d1 Mon Sep 17 00:00:00 2001 From: Cynthia Lo Date: Fri, 16 Jan 2026 01:12:31 -0800 Subject: [PATCH 13/14] Update documentation --- README.md | 5 ++++- src/radfact/metric/radfact.py | 3 +++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 8431fd1..03fe2c9 100644 --- a/README.md +++ b/README.md @@ -134,7 +134,8 @@ You can refer to the [getting_started](getting_started.ipynb) notebook to see ho ```bash $ run_radfact --help -usage: run_radfact [-h] [--radfact_config_name RADFACT_CONFIG_NAME] [--phrases_config_name PHRASES_CONFIG_NAME] --input_path INPUT_PATH [--is_narrative_text] [--output_dir OUTPUT_DIR] [--bootstrap_samples BOOTSTRAP_SAMPLES] +usage: run_radfact [-h] --input_path INPUT_PATH [--is_narrative_text] [--radfact_config_name RADFACT_CONFIG_NAME] [--phrases_config_name PHRASES_CONFIG_NAME] [--filtering_config_name FILTERING_CONFIG_NAME] [--output_dir OUTPUT_DIR] +[--bootstrap_samples BOOTSTRAP_SAMPLES] [--report_type {cxr,ct}] [--filter_negatives] Compute RadFact metric for a set of samples and saves the results to a json file. @@ -153,6 +154,8 @@ options: The name of the config file for reports to phrases conversion. We use the default config file but you can provide a custom config. Make sure the config follows the same structure as `configs/report_to_phrases.yaml` and is saved in the `configs` directory. This is necessary for hydra initialization from the `configs` directory. + --filtering_config_name FILTERING_CONFIG_NAME + The name of the config file for negative finding filtering. We use the default config file but you can provide a custom config. Make sure the config follows the same structure as `configs/negative_filtering.yaml` and is saved in the `configs` directory. 
This is necessary for hydra initialization from the `configs` directory. --output_dir OUTPUT_DIR Path to the directory where the results will be saved as a json file. --bootstrap_samples BOOTSTRAP_SAMPLES diff --git a/src/radfact/metric/radfact.py b/src/radfact/metric/radfact.py index ff682ef..0b7f2b7 100644 --- a/src/radfact/metric/radfact.py +++ b/src/radfact/metric/radfact.py @@ -90,6 +90,9 @@ def __init__( different endpoints that the NLI processor will use. If None, the default config will be used. :param phrase_config_name: The name of the phrase processing config file. This is the config file that specifies the different endpoints that the phrase processor will use. If None, the default config will be used. + :param filtering_config_name: The name of the negative filtering processing config file. This is the config file + that specifies the different endpoints that the negative filtering processor will use. If None, the default config + will be used. :param image_size: The size of the images in the reports. :param box_precision_threshold: The threshold for precision computation for boxes. :param is_narrative_text: If True, we are running the metric on data narrative text data, e.g. the original From a0586ebeedf2c10c94c3b29e28ff6cda5ad75dcf Mon Sep 17 00:00:00 2001 From: Cynthia Lo Date: Fri, 16 Jan 2026 01:14:29 -0800 Subject: [PATCH 14/14] Lint --- src/radfact/llm_utils/negative_filtering/processor.py | 2 +- src/radfact/llm_utils/report_to_phrases/schema.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/radfact/llm_utils/negative_filtering/processor.py b/src/radfact/llm_utils/negative_filtering/processor.py index ab32c53..e6e949c 100644 --- a/src/radfact/llm_utils/negative_filtering/processor.py +++ b/src/radfact/llm_utils/negative_filtering/processor.py @@ -56,7 +56,7 @@ def load_filtering_queries_from_parsed_reports( Load queries for filtering from a list of parsed reports. 
Queries consist of all the newly parsed phrases from phrasification, along with metadata including the study ID and original phrase. - + :param reports: A list of ParsedReport objects. :param index_col: The column containing the index :return: A dataframe of queries. diff --git a/src/radfact/llm_utils/report_to_phrases/schema.py b/src/radfact/llm_utils/report_to_phrases/schema.py index baee819..ef9e92e 100644 --- a/src/radfact/llm_utils/report_to_phrases/schema.py +++ b/src/radfact/llm_utils/report_to_phrases/schema.py @@ -15,11 +15,13 @@ class PhraseList(BaseModel): """Dataclass for a list of phrases.""" + phrases: list[str] class SentenceWithRephrases(BaseModel): """Dataclass for a sentence with rephrases. The source sentence is 'orig' and the rephrased sentences are 'new'.""" + orig: str new: list[str]