From 12165034470930f852671ab82a5464679d5e351e Mon Sep 17 00:00:00 2001
From: Wente Geong
Date: Fri, 16 Jan 2026 22:33:33 +0800
Subject: [PATCH] Feat: Add support for OmniDocBench dataset evaluation

---
 .gitignore                                    |   1 +
 config/custom/omnidocbench_qwen_mllm.yaml     |  64 ++++++++
 .../assets/datasets/loaders/jsonl_loader.py   |   2 +
 .../assets/datasets/preprocessors/__init__.py |   3 +-
 .../assets/datasets/preprocessors/builtin.py  |   9 ++
 .../preprocessors/omnidoc_preprocessor.py     | 121 ++++++++++++++
 src/gage_eval/evaluation/sample_loop.py       |   1 -
 src/gage_eval/metrics/aggregators.py          | 151 +++++++++++++++++-
 src/gage_eval/metrics/builtin/__init__.py     |   2 +
 .../metrics/builtin/ominidoc_all_metric.py    |  44 +++++
 src/gage_eval/metrics/registry.py             |   3 +
 .../test_ominidoc_preprocessor.py             |  38 +++++
 12 files changed, 436 insertions(+), 3 deletions(-)
 create mode 100644 config/custom/omnidocbench_qwen_mllm.yaml
 create mode 100644 src/gage_eval/assets/datasets/preprocessors/omnidoc_preprocessor.py
 create mode 100644 src/gage_eval/metrics/builtin/ominidoc_all_metric.py
 create mode 100644 tests/preprocessors/test_ominidoc_preprocessor.py

diff --git a/.gitignore b/.gitignore
index efb7c10..4141fd3 100755
--- a/.gitignore
+++ b/.gitignore
@@ -216,3 +216,4 @@ scripts/oneclick/.env.example
 __pycache__/
 *.pyc
 *.pyo
+run.sh
diff --git a/config/custom/omnidocbench_qwen_mllm.yaml b/config/custom/omnidocbench_qwen_mllm.yaml
new file mode 100644
index 0000000..dbca988
--- /dev/null
+++ b/config/custom/omnidocbench_qwen_mllm.yaml
@@ -0,0 +1,64 @@
+api_version: gage/v1alpha1
+kind: PipelineConfig
+metadata:
+  name: omnidocbench_qwen_mllm
+  description: A multi-modal evaluation for OmniDocBench using a Qwen MLLM.
+
+custom:
+  steps:
+    - step: inference
+    - step: auto_eval
+
+# 1. Dataset Configuration
+datasets:
+  - dataset_id: omnidocbench_val
+    loader: jsonl
+    params:
+      path: /mnt/sdb1/ywt/OmniDocBench-main/OmniDocBench1_5/omnidocbench15_gage_r.jsonl
+    preprocess: omnidoc_image_standardizer
+    preprocess_kwargs:
+      question_field: prompt
+      content_field: image
+      content_root: /mnt/sdb1/ywt/OmniDocBench-main/OmniDocBench1_5/images
+    doc_to_visual: gage_eval.assets.datasets.utils.multimodal:embed_local_message_images
+    # doc_to_visual_kwargs automatically inherits same-named fields from preprocess_kwargs
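+    # For reference, a minimal sketch of one input JSONL record, adapted from
+    # tests/preprocessors/test_ominidoc_preprocessor.py (field names follow
+    # question_field/content_field above; values are illustrative):
+    #   {"id": "omnidoc-123",
+    #    "image": "PPT_esea-app101_page_003.png",
+    #    "prompt": "You are an AI assistant specialized in converting PDF images to Markdown format. ..."}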
+
+# 2. Backend Configuration
+backends:
+  - backend_id: omnidocbench_qwen_mllm_backend
+    type: litellm
+    config:
+      provider: openai
+      api_base: http://127.0.0.1:8685/v1  # Replace with the actual Vision-LLM service address
+      model: Qwen/Qwen3-Omni-30B-A3B-Instruct
+      generation_parameters:
+        max_new_tokens: 4096  # Keep outputs short to limit memory usage and latency
+        temperature: 0.1
+      async_max_concurrency: 2  # Light concurrency for local MacBook runs
+
+# 3. Role Adapter Configuration
+role_adapters:
+  - adapter_id: omnidocbench_qwen_vl
+    role_type: dut_model
+    backend_id: omnidocbench_qwen_mllm_backend
+    # Capability must match what the backend/adapter supports for multi-modal input
+    capabilities:
+      - vision_chat
+
+# 4. Metric Configuration
+metrics:
+  - metric_id: omnidocbench_all_metric
+    implementation: gage_eval.metrics.builtin.ominidoc_all_metric:OmniDocBenchMetric
+    aggregation: omnidoclazycalc
+
+# 5. Task Configuration
+tasks:
+  - task_id: doc_parsing_eval
+    dataset_id: omnidocbench_val
+    max_samples: 20  # Quick smoke run on a MacBook; can be overridden from the CLI
+    reporting:
+      sinks:
+        - type: console
+        - type: file
+          params:
+            output_path: ${GAGE_EVAL_SAVE_DIR:-./runs}/omnidocbench_events.jsonl
diff --git a/src/gage_eval/assets/datasets/loaders/jsonl_loader.py b/src/gage_eval/assets/datasets/loaders/jsonl_loader.py
index 4a7ce66..c4b2cb1 100644
--- a/src/gage_eval/assets/datasets/loaders/jsonl_loader.py
+++ b/src/gage_eval/assets/datasets/loaders/jsonl_loader.py
@@ -54,6 +54,8 @@ def load(self, hub_handle: Optional[DatasetHubHandle], *, trace=None) -> DataSou
         doc_to_text = resolve_doc_to_callable(self.spec, "doc_to_text")
         doc_to_visual = resolve_doc_to_callable(self.spec, "doc_to_visual")
         doc_to_audio = resolve_doc_to_callable(self.spec, "doc_to_audio")
+        # apply_preprocess needs the registered preprocessors, but importing preprocessors.builtin at module load time fails due to a circular registry dependency, so import lazily here
+        from gage_eval.assets.datasets import preprocessors
         records = apply_preprocess(
             raw_records,
             self.spec,
diff --git a/src/gage_eval/assets/datasets/preprocessors/__init__.py b/src/gage_eval/assets/datasets/preprocessors/__init__.py
index 05a8e0d..274d1a4 100644
--- a/src/gage_eval/assets/datasets/preprocessors/__init__.py
+++ b/src/gage_eval/assets/datasets/preprocessors/__init__.py
@@ -1,3 +1,4 @@
 """Preprocessor utilities."""
+from . import builtin
 
-__all__ = []
+__all__ = ["builtin"]
diff --git a/src/gage_eval/assets/datasets/preprocessors/builtin.py b/src/gage_eval/assets/datasets/preprocessors/builtin.py
index c90060f..e9be828 100644
--- a/src/gage_eval/assets/datasets/preprocessors/builtin.py
+++ b/src/gage_eval/assets/datasets/preprocessors/builtin.py
@@ -6,6 +6,7 @@
 from gage_eval.assets.datasets.preprocessors.default_preprocessor import DefaultPreprocessor
 from gage_eval.assets.datasets.preprocessors.multi_choice_preprocessor import MultiChoicePreprocessor as NewMultiChoice
 from gage_eval.assets.datasets.preprocessors.docvqa_preprocessor import DocVQAPreprocessor as NewDocVQA
+from gage_eval.assets.datasets.preprocessors.omnidoc_preprocessor import OmniDocPreprocessor as NewOmniDoc
 from gage_eval.assets.datasets.preprocessors.mathvista_preprocessor import (
     MathVistaPreprocessor as NewMathVista,
     MathVistaStructOnlyPreprocessor as NewMathVistaStructOnly,
@@ -72,6 +73,14 @@ class MultiChoicePreprocessor(NewMultiChoice):
 class DocVQAPreprocessor(NewDocVQA):
     pass
 
+@registry.asset(
+    "dataset_preprocessors",
+    "omnidoc_image_standardizer",
+    desc="OmniDocBench multimodal preprocessor (new)",
+    tags=("prompt", "vision", "omnidoc"),
+)
+class OmniDocPreprocessor(NewOmniDoc):
+    pass
 
 @registry.asset(
     "dataset_preprocessors",
diff --git a/src/gage_eval/assets/datasets/preprocessors/omnidoc_preprocessor.py b/src/gage_eval/assets/datasets/preprocessors/omnidoc_preprocessor.py
new file mode 100644
index 0000000..691d270
--- /dev/null
+++ b/src/gage_eval/assets/datasets/preprocessors/omnidoc_preprocessor.py
@@ -0,0 +1,121 @@
+"""Class-based OmniDocBench preprocessor (new implementation)."""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any, Dict, List
+import base64
+import mimetypes
+
+from gage_eval.assets.datasets.utils.multimodal import (
+    _derive_root,
+    collect_content_fragments,
+    embed_local_image_as_data_url,
+)
+from gage_eval.assets.datasets.preprocessors.base import BasePreprocessor
+from gage_eval.assets.datasets.utils.mapping import extract_field
+from gage_eval.assets.datasets.utils.normalization import ensure_chat_template_flags
+
+
+def encode_image_to_data_uri(image_path):
+    """Read a local image file and return it as a base64 data URI."""
+    mime_type, _ = mimetypes.guess_type(image_path)
+    if mime_type is None:
+        mime_type = "image/jpeg"
+
+    with open(image_path, "rb") as image_file:
+        base64_data = base64.b64encode(image_file.read()).decode("utf-8")
+
+    return f"data:{mime_type};base64,{base64_data}"
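+
+# Illustrative usage (hypothetical path):
+#   encode_image_to_data_uri("/tmp/page.png")
+#   -> "data:image/png;base64,iVBORw0KGgo..."
+# The returned string can be used directly as an OpenAI-style image_url payload.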
+
+
+class OmniDocPreprocessor(BasePreprocessor):
+    """Normalize OmniDocBench samples with text + image content."""
+
+    def to_sample(
+        self,
+        record: Dict[str, Any],
+        *,
+        question_field: str = "question",
+        content_field: str = "image",
+        content_root: str | None = None,
+        data_path: str | None = None,
+        system_prompt: str | None = None,
+        instruction: str | None = None,
+        **kwargs: Any,
+    ) -> Dict[str, Any]:
+
+        sample = dict(record)
+        question = extract_field(sample, question_field)
+        if question is None:
+            raise ValueError(f"OmniDocBench sample missing question field '{question_field}'")
+
+        # 1. Resolve Root Path
+        # Record the dataset path (if provided) so that _derive_root can fall
+        # back to it when no explicit content_root is given.
+        if data_path:
+            sample.setdefault("_dataset_metadata", {})["path"] = data_path
+
+        resolved_root = _derive_root(sample, content_root)
+        if resolved_root and isinstance(resolved_root, str) and not resolved_root.startswith(("http://", "https://", "data:")):
+            try:
+                resolved_root = str(Path(resolved_root).expanduser().resolve())
+            except Exception:
+                resolved_root = str(Path(resolved_root).expanduser())
+
+        # 2. Construct Content
+        text_content = str(question).strip()
+        if instruction:
+            text_content = f"{text_content}\n\n{instruction.strip()}"
+
+        user_content_parts = [{"type": "text", "text": text_content}]
+
+        # 3. Embed Local Images
+        fragments = collect_content_fragments(sample, content_field=content_field, content_root=resolved_root)
+
+        # embed_local_image_as_data_url is expected to rewrite sample[content_field]
+        # into a base64 data URL in place; its return value is not needed here.
+        embed_local_image_as_data_url(
+            sample,
+            image_field=content_field,
+            strict=False,
+            cache_dir=None,
+            content_root=content_root,
+        )
+
+        # 4. Build Messages
+        messages: List[Dict[str, Any]] = []
+        if system_prompt:
+            messages.append({"role": "system", "content": [{"type": "text", "text": system_prompt.strip()}]})
+
+        visual_fragments = [{"type": "image_url", "image_url": sample[content_field]}]
+        user_content_parts.extend(visual_fragments)
+        messages.append({"role": "user", "content": user_content_parts})
+
+        sample["messages"] = messages
+        sample["prompt"] = question
+        sample["chat_template_mode"] = "preprocess"
+        sample["rendered_by"] = "preprocess"
+        sample["template_source"] = "manual"
+        sample["cache_suffix"] = "-converted"
+
+        ensure_chat_template_flags(sample)
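+        # At this point the chat payload has the shape (illustrative, single-image case):
+        #   sample["messages"] == [
+        #       {"role": "user", "content": [
+        #           {"type": "text", "text": "<question text>"},
+        #           {"type": "image_url", "image_url": "data:image/png;base64,..."},
+        #       ]},
+        #   ]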
+
+        # 5. Finalize Metadata
+        final_image_name = fragments
+        metadata = dict(sample.get("metadata") or {})
+        metadata.pop("image_root", None)
+        metadata.update({
+            "question_field": question_field,
+            "content_field": content_field,
+        })
+        if final_image_name:
+            metadata["image_name"] = final_image_name
+        if resolved_root:
+            metadata["content_root"] = resolved_root
+
+        sample["metadata"] = metadata
+        sample["inputs"] = sample.get("inputs") or {"prompt": question}
+        return sample
+
+
+__all__ = ["OmniDocPreprocessor"]
diff --git a/src/gage_eval/evaluation/sample_loop.py b/src/gage_eval/evaluation/sample_loop.py
index 891e9bf..1bb67f3 100644
--- a/src/gage_eval/evaluation/sample_loop.py
+++ b/src/gage_eval/evaluation/sample_loop.py
@@ -123,7 +123,6 @@ def run(self, planner: TaskPlanner, role_manager: RoleManager, trace: Observabil
             daemon=True,
         )
         producer.start()
-
         if ff_mode:
             self._run_fire_and_forget(
                 sample_queue,
diff --git a/src/gage_eval/metrics/aggregators.py b/src/gage_eval/metrics/aggregators.py
index 7be027b..f9a698c 100644
--- a/src/gage_eval/metrics/aggregators.py
+++ b/src/gage_eval/metrics/aggregators.py
@@ -92,7 +92,6 @@ def finalize(self) -> AggregatedMetric:
             count=self._total_samples,
         )
 
-
 class IdentityAggregator(MetricAggregator):
     """Returns all per-sample values without aggregation.
 
@@ -117,6 +116,156 @@ def finalize(self) -> AggregatedMetric:
             metadata={"samples": [res.to_dict() for res in self._results]},
         )
 
+class OmniDocLazyCalcAggregator(MetricAggregator):
+    """Aggregates OmniDocBench results by invoking the official toolkit.
+
+    Mostly useful for computing the benchmark's coupled metrics directly.
+    """
+
+    def __init__(self, spec: MetricSpec) -> None:
+        super().__init__(spec)
+        self._results: List[MetricResult] = []
+
+    def add(self, result: MetricResult) -> None:
+        self._results.append(result)
+
+    def finalize(self) -> AggregatedMetric:
+        import os, shutil
+
+        omnidoc_home = os.getenv("OMNIDOCBENCH_HOME")
+        if not omnidoc_home:
+            raise EnvironmentError(
+                "Environment variable 'OMNIDOCBENCH_HOME' is not set. "
+                "Please set it to the root directory of the OmniDocBench toolkit, "
+                "e.g. 'export OMNIDOCBENCH_HOME=/path/to/OmniDocBench-main'."
+            )
+        write_folder = os.path.join(omnidoc_home, 'prediction', 'gage_run')
+        if os.path.exists(write_folder):
+            shutil.rmtree(write_folder)
+        os.makedirs(write_folder, exist_ok=True)  # exist_ok guards against filesystem latency after rmtree
+
+        values = {str(idx): res.to_dict() for idx, res in enumerate(self._results)}
+        logger.debug("OmniDocLazyCalcAggregator captured {} samples for metric={}", len(self._results), self.spec.metric_id)
+        # Dump each prediction as markdown so the toolkit can match it to its GT page.
+        for idx, value in values.items():
+            basename = os.path.basename(value['img_path'])
+            write_path = os.path.join(write_folder, f'{basename}.md')
+            with open(write_path, 'w', encoding='utf8') as f:
+                f.write(value['prediction'])
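+        # Predictions are matched to GT by file name: each page lands at
+        #   $OMNIDOCBENCH_HOME/prediction/gage_run/<image basename>.md
+        # e.g. PPT_esea-app101_page_003.png.md (the basename keeps its image extension).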
+        logger.debug("Running the OmniDocBench toolkit for the full metrics; this can take 30+ minutes because CDM renders LaTeX formulas")
+        bench_stdout = self.run_pdf_validation(omnidoc_home, write_folder)
+        logger.debug("OmniDocBench toolkit finished.")
+        values = self.get_metric_per_page(values, omnidoc_home, write_folder)
+        overall_dic = self.get_bench_overall(omnidoc_home, write_folder)
+
+        return AggregatedMetric(
+            metric_id=self.spec.metric_id,
+            aggregation=self.spec.aggregation or "omnidoclazycalc",
+            values=values,
+            count=len(self._results),
+            metadata={"overall": overall_dic, "samples": [res.to_dict() for res in self._results]},
+        )
+
+    def get_metric_per_page(self, values, omnidoc_home, write_folder):
+        import os, json
+
+        metric_names = [
+            'quick_match_text_block_per_page_edit',
+            'quick_match_display_formula_per_page_edit',
+            'quick_match_reading_order_per_page_edit',
+            'quick_match_table_per_page_edit',
+        ]
+        pr_folder_name = os.path.basename(write_folder)
+
+        # Load the per-page result JSON files emitted by the toolkit.
+        metric_per_page_dic = {}
+        for metric_name in metric_names:
+            json_path = os.path.join(omnidoc_home, "result", f"{pr_folder_name}_{metric_name}.json")
+            with open(json_path, 'r', encoding='utf8') as f:
+                metric_data = json.load(f)
+            metric_per_page_dic[metric_name] = metric_data
+
+        # Attach each page's scores back onto its sample record.
+        for idx, value in values.items():
+            img_basename = os.path.basename(value['img_path'])
+            this_sample_metrics = {}
+            for metric_name in metric_names:
+                this_sample_metrics[metric_name] = metric_per_page_dic[metric_name].get(img_basename)
+            values[idx]['metrics'] = this_sample_metrics
+        return values
+
+    def get_bench_overall(self, omnidoc_home, write_folder) -> dict:
+        import os, subprocess
+        import pandas as pd
+
+        pr_folder_name = os.path.basename(write_folder)
+        gen_res_table_py = os.path.join(omnidoc_home, "tools/generate_result_tables.py")
+        if not os.path.exists(gen_res_table_py):
+            raise FileNotFoundError(
+                f"Could not find {gen_res_table_py}. Please convert the toolkit's "
+                "generate_result_tables.ipynb into tools/generate_result_tables.py and have it "
+                "write the summary table via df.to_csv('./overall.csv')."
+            )
+        command = ["python", "tools/generate_result_tables.py", pr_folder_name]
+        subprocess.run(command, cwd=omnidoc_home, capture_output=True, text=True, check=True)
+        df = pd.read_csv(os.path.join(omnidoc_home, "overall.csv"), index_col=0)
+        overall_dic = df.to_dict(orient='index')
+        return overall_dic
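+    # Note: get_bench_overall returns overall.csv as a nested dict, roughly
+    #   {"<row label>": {"<metric column>": <value>, ...}, ...}
+    # via pandas df.to_dict(orient='index'); exact labels depend on the
+    # toolkit's generate_result_tables output.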
+
+    def run_pdf_validation(self, omnidoc_home, write_folder) -> str:
+        import os, subprocess
+        import yaml
+
+        gt_json = os.path.join(omnidoc_home, "OmniDocBench.json")
+        if not os.path.exists(gt_json):
+            raise EnvironmentError(
+                "The GT json OmniDocBench.json is missing; please download the dataset (not just the toolkit)"
+                f" and place OmniDocBench.json under OMNIDOCBENCH_HOME={omnidoc_home}"
+            )
+        config_dict = {
+            "end2end_eval": {
+                "metrics": {
+                    "text_block": {
+                        "metric": ["Edit_dist"]
+                    },
+                    "display_formula": {
+                        "metric": ["Edit_dist", "CDM"]
+                    },
+                    "table": {
+                        "metric": ["TEDS", "Edit_dist"]
+                    },
+                    "reading_order": {
+                        "metric": ["Edit_dist"]
+                    }
+                },
+                "dataset": {
+                    "dataset_name": "end2end_dataset",
+                    "ground_truth": {
+                        "data_path": gt_json
+                    },
+                    "prediction": {
+                        "data_path": write_folder
+                    },
+                    "match_method": "quick_match"
+                }
+            }
+        }
+
+        temp_config_path = os.path.join(omnidoc_home, "configs", "end2end_gage.yaml")
+        with open(temp_config_path, "w", encoding="utf-8") as f:
+            yaml.dump(config_dict, f, allow_unicode=True)
+
+        command = [
+            "python",
+            "pdf_validation.py",
+            "--config", temp_config_path
+        ]
+
+        try:
+            # check=True so a failed toolkit run raises CalledProcessError
+            # instead of being silently ignored.
+            result = subprocess.run(
+                command,
+                cwd=omnidoc_home,
+                capture_output=True,
+                text=True,
+                check=True,
+            )
+            return result.stdout
+
+        except subprocess.CalledProcessError as e:
+            logger.error("pdf_validation.py failed with exit code {}", e.returncode)
+            logger.error("{}", e.stdout)
+            logger.error("{}", e.stderr)
+            raise
+
 
 class CategoricalCountAggregator(MetricAggregator):
     """Counts occurrences of a categorical field in per-sample metadata."""
diff --git a/src/gage_eval/metrics/builtin/__init__.py b/src/gage_eval/metrics/builtin/__init__.py
index 2726333..da1e719 100644
--- a/src/gage_eval/metrics/builtin/__init__.py
+++ b/src/gage_eval/metrics/builtin/__init__.py
@@ -7,6 +7,7 @@
 )
 from gage_eval.metrics.builtin.multi_choice import MultiChoiceAccuracyMetric
 from gage_eval.metrics.builtin.docvqa_anls import DocVQAANLSMetric
+from gage_eval.metrics.builtin.ominidoc_all_metric import OmniDocBenchMetric
 from gage_eval.metrics.builtin.mmmu import MMMUAccuracyMetric
 from gage_eval.metrics.builtin.likelihood import LikelihoodMetric
 from gage_eval.metrics.builtin.ranking import RankingMetric
@@ -52,4 +53,5 @@
     "AppWorldDifficultyMetric",
     "LikelihoodMetric",
     "RankingMetric",
+    "OmniDocBenchMetric",
 ]
diff --git a/src/gage_eval/metrics/builtin/ominidoc_all_metric.py b/src/gage_eval/metrics/builtin/ominidoc_all_metric.py
new file mode 100644
index 0000000..9a4c8b4
--- /dev/null
+++ b/src/gage_eval/metrics/builtin/ominidoc_all_metric.py
@@ -0,0 +1,44 @@
+import os
+
+from gage_eval.metrics.base import BaseMetric
+from gage_eval.registry import registry
+
+
+@registry.asset(
+    "metrics",
+    "omnidocbenchallmetric",
+    desc="All metrics for OmniDocBench",
+    tags=("vision", "prompt", "docparsing"),
+    default_aggregation="mean",
+)
+class OmniDocBenchMetric(BaseMetric):
+    """Metric wrapper for the OmniDocBench evaluation."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def compute(self, context):
+        # OmniDocBench metrics rely on complex non-Python tooling, and tables or
+        # formulas are not present in every sample, so no real scoring happens
+        # here; per-sample results are stubbed and the aggregator invokes the
+        # benchmark toolkit over the whole run instead.
+        result = DummyTextResult(context)
+        return result
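+
+# Note: OmniDocLazyCalcAggregator consumes only the 'prediction' and 'img_path'
+# keys from the dict produced below; 'metrics' is filled in later by
+# get_metric_per_page once the toolkit has run.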
+ """ + root = self.context.sample['metadata']['content_root'] + names = self.context.sample['metadata']['image_name'][0] + img_path = os.path.join(root, names) + return { + "sample_id": self.context.sample_id, + "prediction": self.context.model_output['answer'], + "img_path": img_path, + "metrics": None + } + +__all__ = ["OmniDocBenchMetric"] \ No newline at end of file diff --git a/src/gage_eval/metrics/registry.py b/src/gage_eval/metrics/registry.py index ac42ef1..4299795 100644 --- a/src/gage_eval/metrics/registry.py +++ b/src/gage_eval/metrics/registry.py @@ -45,6 +45,7 @@ def __init__(self) -> None: MeanAggregator, WeightedMeanAggregator, CategoricalCountAggregator, + OmniDocLazyCalcAggregator, ) self._aggregators: Dict[str, AggregatorFactory] = {} @@ -52,6 +53,8 @@ def __init__(self) -> None: self.register_aggregator("weighted_mean", lambda spec: WeightedMeanAggregator(spec)) self.register_aggregator("identity", lambda spec: IdentityAggregator(spec)) self.register_aggregator("categorical_count", lambda spec: CategoricalCountAggregator(spec)) + self.register_aggregator("omnidoclazycalc", lambda spec: OmniDocLazyCalcAggregator(spec)) + # Register MME-specific aggregator if available if MMEAccPlusAggregator is not None: self.register_aggregator("mme_acc_plus", lambda spec: MMEAccPlusAggregator(spec)) diff --git a/tests/preprocessors/test_ominidoc_preprocessor.py b/tests/preprocessors/test_ominidoc_preprocessor.py new file mode 100644 index 0000000..16b4522 --- /dev/null +++ b/tests/preprocessors/test_ominidoc_preprocessor.py @@ -0,0 +1,38 @@ +import sys +from pathlib import Path +import unittest + +ROOT = Path(__file__).resolve().parents[2] / "src" +if str(ROOT) not in sys.path: + sys.path.append(str(ROOT)) + +from gage_eval.assets.datasets.preprocessors.omnidoc_preprocessor import OmniDocPreprocessor +from gage_eval.assets.datasets.sample import ( + Sample, +) +from dataclasses import is_dataclass, asdict + +class OmniDocPreprocessorTests(unittest.TestCase): + def test_to_sample(self): + sample_id='omninidoc-123' + prompt = "You are an AI assistant specialized in converting PDF images to Markdown format. Please follow these instructions for the conversion:\n\n 1. Text Processing:\n - Accurately recognize all text content in the PDF image without guessing or inferring.\n - Convert the recognized text into Markdown format.\n - Maintain the original document structure, including headings, paragraphs, lists, etc.\n\n 2. Mathematical Formula Processing:\n - Convert all mathematical formulas to LaTeX format.\n - Enclose inline formulas with \\( \\). For example: This is an inline formula \\( E = mc^2 \\)\n - Enclose block formulas with \\\\[ \\\\]. For example: \\[ \\frac{-b \\pm \\sqrt{b^2 - 4ac}}{2a} \\]\n\n 3. Table Processing:\n - Convert tables to HTML format.\n - Wrap the entire table with and
+
+
+class OmniDocPreprocessorTests(unittest.TestCase):
+    def test_to_sample(self):
+        sample_id = "omnidoc-123"
+        prompt = "You are an AI assistant specialized in converting PDF images to Markdown format. Please follow these instructions for the conversion:\n\n 1. Text Processing:\n - Accurately recognize all text content in the PDF image without guessing or inferring.\n - Convert the recognized text into Markdown format.\n - Maintain the original document structure, including headings, paragraphs, lists, etc.\n\n 2. Mathematical Formula Processing:\n - Convert all mathematical formulas to LaTeX format.\n - Enclose inline formulas with \\( \\). For example: This is an inline formula \\( E = mc^2 \\)\n - Enclose block formulas with \\\\[ \\\\]. For example: \\[ \\frac{-b \\pm \\sqrt{b^2 - 4ac}}{2a} \\]\n\n 3. Table Processing:\n - Convert tables to HTML format.\n - Wrap the entire table with <table> and </table>.\n\n 4. Figure Handling:\n - Ignore figures content in the PDF image. Do not attempt to describe or convert images.\n\n 5. Output Format:\n - Ensure the output Markdown document has a clear structure with appropriate line breaks between elements.\n - For complex layouts, try to maintain the original document's structure and format as closely as possible.\n\n Please strictly follow these guidelines to ensure accuracy and consistency in the conversion. Your task is to accurately convert the content of the PDF image into Markdown format without adding any extra explanations or comments."
+        image = "PPT_esea-app101_page_003.png"
+        _dataset_id = "omnidocbench_val"
+        _dataset_metadata = {"path": "/mnt/sdb1/ywt/OmniDocBench-main/OmniDocBench1_5/omnidocbench15_gage_mini_r.jsonl"}
+        sample = {
+            "id": sample_id,
+            "image": image,
+            "prompt": prompt,
+            "_dataset_id": _dataset_id,
+            "_dataset_metadata": _dataset_metadata,
+        }
+        pre = OmniDocPreprocessor()
+
+        ret = pre.to_sample(sample, question_field="prompt", content_field="image", content_root="/mnt/sdb1/ywt/OmniDocBench-main/OmniDocBench1_5/images")
+
+        self.assertIsNotNone(ret)
+        self.assertIn(prompt, ret["messages"][0]["content"][0]["text"])
+        self.assertIsNotNone(ret["messages"][0]["content"][1]["image_url"])
+        self.assertIsNotNone(ret["id"])
+        self.assertIsNotNone(ret["image"])
+        self.assertIsNotNone(ret["_dataset_id"])
+
+
+if __name__ == "__main__":
+    unittest.main()
\ No newline at end of file