15 changes: 13 additions & 2 deletions README.md
@@ -134,7 +134,8 @@ You can refer to the [getting_started](getting_started.ipynb) notebook to see ho

```bash
$ run_radfact --help
usage: run_radfact [-h] [--radfact_config_name RADFACT_CONFIG_NAME] [--phrases_config_name PHRASES_CONFIG_NAME] --input_path INPUT_PATH [--is_narrative_text] [--output_dir OUTPUT_DIR] [--bootstrap_samples BOOTSTRAP_SAMPLES]
usage: run_radfact [-h] --input_path INPUT_PATH [--is_narrative_text] [--radfact_config_name RADFACT_CONFIG_NAME] [--phrases_config_name PHRASES_CONFIG_NAME] [--filtering_config_name FILTERING_CONFIG_NAME] [--output_dir OUTPUT_DIR]
[--bootstrap_samples BOOTSTRAP_SAMPLES] [--report_type {cxr,ct}] [--filter_negatives]

Compute RadFact metric for a set of samples and saves the results to a json file.

@@ -153,12 +154,15 @@ options:
The name of the config file for reports to phrases conversion. We use the default config file but you can provide a custom config. Make sure the config follows
the same structure as `configs/report_to_phrases.yaml` and is saved in the `configs` directory. This is necessary for hydra initialization from the `configs`
directory.
--filtering_config_name FILTERING_CONFIG_NAME
The name of the config file for negative finding filtering. We use the default config file but you can provide a custom config. Make sure the config follows the same structure as `configs/negative_filtering.yaml` and is saved in the `configs` directory. This is necessary for hydra initialization from the `configs` directory.
--output_dir OUTPUT_DIR
Path to the directory where the results will be saved as a json file.
--bootstrap_samples BOOTSTRAP_SAMPLES
Number of bootstrap samples to use for computing the confidence intervals. Set to 0 to disable bootstrapping.
--report_type {cxr,ct}
Type of report: 'cxr' for chest x-ray reports or 'ct' for CT reports.
--filter_negatives Whether to filter negative findings from the parsed reports before computing the RadFact score.
```

- for non-grounded reports (findings generation narrative text):
@@ -179,7 +183,7 @@ The script computes confidence intervals for the metrics using bootstrapping. Th

⚠️**WARNING**: Some queries may fail due to endpoint limitations (timeouts, rate limits, etc.). When the LLM performing entailment verification fails, we **set these examples as not-entailed by default**. If this occurs in a significant number of cases, the results will not be reliable. The final metrics dict contains the number of such skipped queries under the key `num_llm_failures`. The script prints the number of skipped queries at the end of the run and stores them in the `skipped` directory under the run ID folder. You will also see a warning message in the logs for each failed query: `WARNING: No response for example {query_id}. Setting as NOT ENTAILED`.

### Supporting Multiple Report Rypes
### Supporting Multiple Report Types
RadFact supports different report types through the `report_type` field in the `RadFactMetric` class. Currently supported options are:

- `cxr` - Chest X-ray reports (default)
@@ -195,6 +199,13 @@ We also provide a script to convert reports to phrases. This is useful when you

This script is configurable using the `report_to_phrases.yaml` config file. You can specify the input file, output file, and the endpoint to use for the conversion.

### Filtering Negative Phrases
Radiology reports can have a disproportionate number of negative findings, and filtering these out can help focus evaluation on clinically relevant positive findings. For non-grounded reports, RadFact can be configured to filter out negative findings once reports have been converted to phrases. Note that this feature is currently only available for CT reports.

```bash
run_radfact --input_path <path_to_input_file.json> --is_narrative_text --filter_negatives
```
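
The same behaviour is available from the Python API. Below is a minimal sketch, not a definitive implementation: the `RadFactMetric` import path and the toy inputs are assumptions, while `ReportType` and the keyword arguments mirror `run_radfact.py` in this PR. A configured LLM endpoint is still required to actually run it.

```python
# Minimal sketch of the Python-API equivalent of the command above.
# Assumptions: the `radfact.metric.radfact` import path and the toy inputs;
# `ReportType` and the keyword arguments mirror run_radfact.py in this PR.
from radfact.llm_utils.prompt_tasks import ReportType
from radfact.metric.radfact import RadFactMetric  # assumed import path

# Toy narrative-text inputs keyed by a hypothetical study ID; the CLI builds
# these dictionaries from the input CSV/JSON instead.
candidates = {"study_001": "Moderate right pleural effusion. The heart size is normal."}
references = {"study_001": "There is a moderate right pleural effusion. No pneumothorax."}

radfact_metric = RadFactMetric(
    is_narrative_text=True,     # narrative findings text, as with --is_narrative_text
    report_type=ReportType.CT,  # negative filtering is currently CT-only
    filter_negatives=True,      # drop negative findings after phrase conversion
)
_, results = radfact_metric.compute_metric_score(candidates, references)
```

A custom filtering config can be supplied via `--filtering_config_name` (or the `filtering_config_name` constructor argument), as long as it follows the structure of `configs/negative_filtering.yaml` and is saved in the `configs` directory.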

## What is RadFact?

![Illustration of RadFact](RadFact.png "Illustration of RadFact")
9 changes: 9 additions & 0 deletions configs/negative_filtering.yaml
@@ -0,0 +1,9 @@
# @package _global_

defaults:
- default
- override endpoints: azure_chat_openai
- _self_

processing:
index_col: sentence_id
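
For context, a minimal sketch of how `processing.index_col` is consumed by the filtering step added in this PR: each parsed sentence is keyed by this column (as `<study_id>_<index>`) before its phrases are sent to the LLM. The study ID and sentences below are hypothetical; in the real pipeline the parsed reports come from the report-to-phrases step and the config is composed by Hydra rather than built by hand.

```python
# Hedged sketch: how `processing.index_col` from negative_filtering.yaml is used.
# The ParsedReport content here is made up; imports mirror processor.py in this PR.
from radfact.llm_utils.negative_filtering.processor import load_filtering_queries_from_parsed_reports
from radfact.llm_utils.report_to_phrases.schema import ParsedReport, SentenceWithRephrases

parsed_reports = [
    ParsedReport(
        id="study_001",  # hypothetical study ID
        sentence_list=[
            SentenceWithRephrases(
                orig="Moderate right pleural effusion. No pneumothorax.",
                new=["There is a moderate right pleural effusion.", "There is no pneumothorax."],
            )
        ],
    )
]

# index_col="sentence_id" matches this config; each row is keyed "<study_id>_<i>".
query_df = load_filtering_queries_from_parsed_reports(parsed_reports, index_col="sentence_id")
print(query_df)  # columns: sentence_id, orig, new
```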
22 changes: 22 additions & 0 deletions src/radfact/cli/run_radfact.py
@@ -64,17 +64,21 @@ def get_candidates_and_references_from_json(
def compute_radfact_scores(
radfact_config_name: str | None,
phrases_config_name: str | None,
filtering_config_name: str | None,
candidates: InputDict,
references: InputDict,
is_narrative_text: bool,
report_type: ReportType,
bootstrap_samples: int,
filter_negatives: bool,
) -> dict[str, float]:
radfact_metric = RadFactMetric(
nli_config_name=radfact_config_name,
phrase_config_name=phrases_config_name,
filtering_config_name=filtering_config_name,
is_narrative_text=is_narrative_text,
report_type=report_type,
filter_negatives=filter_negatives,
)
if bootstrap_samples == 0:
_, results = radfact_metric.compute_metric_score(candidates, references)
@@ -121,6 +125,15 @@ def main() -> None:
"initialization from the `configs` directory.",
default=None,
)
parser.add_argument(
"--filtering_config_name",
type=str,
help="The name of the config file for negative finding filtering. We use the default config file but you can "
"provide a custom config. Make sure the config follows the same structure as `configs/negative_filtering.yaml` "
"and is saved in the `configs` directory. This is necessary for hydra initialization from the `configs` "
"directory.",
default=None,
)
parser.add_argument(
"--output_dir",
type=str,
@@ -141,15 +154,22 @@
help="Type of report: 'cxr' for chest x-ray reports or 'ct' for CT reports.",
default="cxr",
)
parser.add_argument(
"--filter_negatives",
action="store_true",
help="Whether to filter negative findings from the parsed reports before computing the RadFact score.",
)

args = parser.parse_args()
input_path = Path(args.input_path)
output_dir = Path(args.output_dir)
is_narrative_text = args.is_narrative_text
radfact_config_name = args.radfact_config_name
phrases_config_name = args.phrases_config_name
filtering_config_name = args.filtering_config_name
bootstrap_samples = args.bootstrap_samples
report_type = ReportType(args.report_type)
filter_negatives = args.filter_negatives

assert input_path.suffix in [".csv", ".json"], "Input file must be a csv or json file."
assert input_path.suffix == ".csv" or not is_narrative_text, (
@@ -170,11 +190,13 @@
results = compute_radfact_scores(
radfact_config_name=radfact_config_name,
phrases_config_name=phrases_config_name,
filtering_config_name=filtering_config_name,
candidates=candidates,
references=references,
is_narrative_text=is_narrative_text,
bootstrap_samples=bootstrap_samples,
report_type=report_type,
filter_negatives=filter_negatives,
)

print_fn = print_results if bootstrap_samples == 0 else print_bootstrap_results
Empty file.
140 changes: 140 additions & 0 deletions src/radfact/llm_utils/negative_filtering/processor.py
@@ -0,0 +1,140 @@
# ------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
# ------------------------------------------------------------------------------------------

from collections import defaultdict
import json
from pathlib import Path

import pandas as pd
from radfact.llm_utils.prompt_tasks import NEGATIVE_FILTERING_PARSING_TASK, NegativeFilteringTaskOptions, ReportType
from omegaconf import DictConfig

from radfact.llm_utils.engine.engine import LLMEngine, get_subfolder
from radfact.llm_utils.processor.structured_processor import StructuredProcessor, parse_examples_from_json
from radfact.llm_utils.report_to_phrases.schema import (
ParsedReport,
PhraseList,
PhraseListExample,
SentenceWithRephrases,
)
from radfact.paths import OUTPUT_DIR

ORIG = "orig"
NEW = "new"


def get_negative_filtering_phrase_processor(
report_type: ReportType, log_dir: Path | None = None
) -> StructuredProcessor[list[str], PhraseList]:
"""Return a processor for filtering negative findings from a list of phrases.

:param report_type: The type of report, e.g., "ReportType.CXR" or "ReportType.CT".
:param log_dir: The directory to save logs.
:return: The processor for negative finding filtering.
"""
task = NegativeFilteringTaskOptions[report_type.name].value
system_prompt = task.system_message_path.read_text()
few_shot_examples = parse_examples_from_json(task.few_shot_examples_path, PhraseListExample)
processor = StructuredProcessor(
query_type=list[str],
result_type=PhraseList,
system_prompt=system_prompt,
format_query_fn=lambda x: json.dumps(x),
few_shot_examples=few_shot_examples,
log_dir=log_dir,
)
return processor


def load_filtering_queries_from_parsed_reports(
reports: list[ParsedReport],
index_col: str,
) -> pd.DataFrame:
"""
Load queries for filtering from a list of parsed reports. Queries consist of all the
newly parsed phrases from phrasification, along with metadata including the study ID
and original phrase.

:param reports: A list of ParsedReport objects.
:param index_col: The column containing the index
:return: A dataframe of queries.
"""
queries = []
for report in reports:
for i, sentence in enumerate(report.sentence_list):
queries.append([f"{report.id}_{i}", sentence.orig, sentence.new])
query_df = pd.DataFrame(queries, columns=[index_col, ORIG, NEW])
return query_df


def get_negative_filtering_engine(
cfg: DictConfig, parsed_reports: list[ParsedReport], subfolder_prefix: str, report_type: ReportType
) -> LLMEngine:
"""
Create the processing engine for filtering negative findings from parsed reports.

:param cfg: The configuration for the processing engine.
:param parsed_reports: A list of ParsedReport objects to filter.
:param subfolder_prefix: The prefix for the metric folder
:param report_type: The type of report, e.g., CT.
:return: The processing engine.
"""
OUTPUT_FOLDER = OUTPUT_DIR / NEGATIVE_FILTERING_PARSING_TASK
output_folder = get_subfolder(OUTPUT_FOLDER, subfolder_prefix)
final_output_folder = get_subfolder(OUTPUT_FOLDER, subfolder_prefix)
log_dir = get_subfolder(OUTPUT_FOLDER, "logs")

query_df = load_filtering_queries_from_parsed_reports(parsed_reports, cfg.processing.index_col)
negative_filtering_processor = get_negative_filtering_phrase_processor(report_type=report_type, log_dir=log_dir)

engine = LLMEngine(
cfg=cfg,
processor=negative_filtering_processor,
dataset_df=query_df,
row_to_query_fn=lambda row: row[NEW],
progress_output_folder=output_folder,
final_output_folder=final_output_folder,
)
return engine


def process_filtered_reports(engine: LLMEngine, cfg: DictConfig) -> tuple[list[ParsedReport], int]:
"""
Process the filtered reports using the provided engine.

:param engine: The LLMEngine used for processing.
:param cfg: The configuration for negative filtering processing.
:return: A tuple containing a list of ParsedReport objects and the number of rewritten sentences.
"""
outputs = engine.return_raw_outputs
metadata = engine.return_dataset_subsets

parsed_report_dict = defaultdict(list)
num_rewritten_sentences = 0

for k in outputs.keys():
phrase_list = outputs[k]
metadata_df = metadata[k].df

for idx, row in metadata_df.iterrows():
study_id = row[cfg.processing.index_col].rsplit("_", 1)[0]
orig = row[ORIG]
unfiltered_phrases = set(row[NEW])
filtered_phrases = set(phrase_list[idx].phrases)

if not filtered_phrases.issubset(unfiltered_phrases):
rewritten_phrases = filtered_phrases - unfiltered_phrases
print(
f"New phrases {rewritten_phrases} not in original phrases {unfiltered_phrases}. Reverting back to original phrases."
)
filtered_phrases = unfiltered_phrases
num_rewritten_sentences += 1

parsed_report_dict[study_id].append(SentenceWithRephrases(orig=orig, new=list(filtered_phrases)))

parsed_reports = [
ParsedReport(id=study_id, sentence_list=sentences) for study_id, sentences in parsed_report_dict.items()
]
return parsed_reports, num_rewritten_sentences
@@ -0,0 +1,44 @@
[
{
"input": [
"There is a relative opacity observed in the left mid-to-lower lung, possibly located in the lingula.",
"There is no evidence of pneumothorax.",
"The cardiac silhouette is unremarkable.",
"The mediastinal silhouette is unremarkable.",
"Mild recessions are observed in the upper lobe of the left lung."
],
"output": {
"phrases": [
"There is a relative opacity observed in the left mid-to-lower lung, possibly located in the lingula.",
"Mild recessions are observed in the upper lobe of the left lung."
]
}
}, {
"input": [
"The right lung is well aerated.",
"No signs of pulmonary edema.",
"No signs of focal consolidation.",
"The left side still shows mediastinal shifting and volume loss.",
"No signs of pleural effusions."
],
"output": {
"phrases": [
"The left side still shows mediastinal shifting and volume loss."
]
}
}, {
"input": [
"There is a moderate right pleural effusion.",
"There is no pneumothorax.",
"The heart size is within normal limits.",
"The radiograph shows linear opacities in the right middle lobe and left lower lobe, indicating atelectasis.",
"The mediastinal contours are unremarkable."
],
"output": {
"phrases": [
"There is a moderate right pleural effusion.",
"The radiograph shows linear opacities in the right middle lobe and left lower lobe, indicating atelectasis."
]
}
}
]
@@ -0,0 +1,13 @@
You are an AI radiology assistant. You are helping process reports from CT (computed tomography) scans.

You are given a list of phrases from a radiology report which refer to objects, findings, or anatomies visible in a CT scan, or the absence of such.

Your goal is to filter phrases that do not refer to positive radiology findings.

Rules:
- Remove statements describing the absence of pathology (e.g. "No pneumothorax", "No pleural effusion detected")
- Remove statements describing normal anatomical appearance, calibration, or function (e.g. "The liver is normal in size", "Upper abdominal organs are normal", "Thoracic esophageal calibration was normal", "The lungs are well aerated", "Lungs are clear")
- Remove statements describing unremarkable appearances (e.g. "Kidneys appear unremarkable", "The mediastinum is unremarkable")
- Keep statements referring to "mild" observations or conditions, as those are still considered positive radiology findings

The objective is to remove phrases which do not refer to positive radiology findings.
30 changes: 30 additions & 0 deletions src/radfact/llm_utils/processor/structured_processor.py
@@ -3,6 +3,7 @@
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
# ------------------------------------------------------------------------------------------

import json
import logging
from enum import Enum
from functools import partial
@@ -22,6 +23,7 @@

_QUERY_KEY = "query"
ResultT = TypeVar("ResultT", bound=BaseModel)
ExampleClassT = TypeVar("ExampleClassT", bound=BaseModel)
ProcessorStats = dict[str, int]


@@ -55,6 +57,34 @@ class Example(Protocol, Generic[QueryT, ResultT]):
output: ResultT


def parse_examples_from_json(examples_path: Path | None, example_class: type[ExampleClassT]) -> list[ExampleClassT]:
"""
This function returns a list of "parsed" examples from a JSON file.

This JSON file is expected to contain a list of JSON-formatted objects, which should
be parseable by the "example class" (expected to be some Pydantic model).

If no path is provided, an empty list is returned.

This function is especially useful for loading few-shot examples for a structured processor.

:param examples_path: Path to the JSON file containing the examples.
If None, an empty list is returned.
:param example_class: The class of the examples to load. A Pydantic model.
We will attempt to parse each object in the JSON file as an instance of this class.
:return: List of examples, as instances of the provided class.
"""
parsed_examples: list[ExampleClassT] = []
if examples_path is None:
return parsed_examples

with open(examples_path) as f:
unparsed_examples = json.load(f)
for example in unparsed_examples:
parsed_examples.append(example_class.parse_obj(example))
return parsed_examples


class QueryTemplate(BaseChatPromptTemplate, Generic[QueryT, ResultT]):
"""Query template for a structured processor."""
