diff --git a/.gitignore b/.gitignore
index efb7c10..4141fd3 100755
--- a/.gitignore
+++ b/.gitignore
@@ -216,3 +216,4 @@ scripts/oneclick/.env.example
 __pycache__/
 *.pyc
 *.pyo
+run.sh
diff --git a/config/custom/omnidocbench_qwen_mllm.yaml b/config/custom/omnidocbench_qwen_mllm.yaml
new file mode 100644
index 0000000..dbca988
--- /dev/null
+++ b/config/custom/omnidocbench_qwen_mllm.yaml
@@ -0,0 +1,64 @@
+api_version: gage/v1alpha1
+kind: PipelineConfig
+metadata:
+  name: omnidocbench_qwen_mllm
+  description: A multi-modal evaluation for OmniDocBench using a Qwen MLLM.
+
+custom:
+  steps:
+    - step: inference
+    - step: auto_eval
+
+# 1. Dataset Configuration
+datasets:
+  - dataset_id: omnidocbench_val
+    loader: jsonl
+    params:
+      path: /mnt/sdb1/ywt/OmniDocBench-main/OmniDocBench1_5/omnidocbench15_gage_r.jsonl
+      preprocess: omnidoc_image_standardizer
+      preprocess_kwargs:
+        question_field: prompt
+        content_field: image
+        content_root: /mnt/sdb1/ywt/OmniDocBench-main/OmniDocBench1_5/images
+      doc_to_visual: gage_eval.assets.datasets.utils.multimodal:embed_local_message_images
+      # doc_to_visual_kwargs automatically inherits fields of the same name from preprocess_kwargs
+
+backends:
+  - backend_id: omnidocbench_qwen_mllm_backend
+    type: litellm
+    config:
+      provider: openai
+      api_base: http://127.0.0.1:8685/v1  # replace with the actual Vision-LLM service address
+      model: Qwen/Qwen3-Omni-30B-A3B-Instruct
+      generation_parameters:
+        max_new_tokens: 4096  # keep outputs as short as possible to limit memory usage and latency
+        temperature: 0.1
+      async_max_concurrency: 2  # lightweight local concurrency for a MacBook
+
+role_adapters:
+  - adapter_id: omnidocbench_qwen_vl
+    role_type: dut_model
+    backend_id: omnidocbench_qwen_mllm_backend
+    # Capability must match what the backend/adapter supports for multi-modal input
+    capabilities:
+      - vision_chat
+
+# 4. Metric Configuration
+metrics:
+  - metric_id: omnidocbench_all_metric
+    # This implementation needs to be created as per test-1113.md
+    implementation: gage_eval.metrics.builtin.ominidoc_all_metric:OmniDocBenchMetric
+    # implementation: OmniDocBenchMetric
+    aggregation: omnidoclazycalc
+
+# 5. Task Configuration
+tasks:
+  - task_id: doc_parsing_eval
+    dataset_id: omnidocbench_val
+    max_samples: 20  # quick smoke run on a MacBook; can be overridden from the CLI
+    reporting:
+      sinks:
+        - type: console
+        - type: file
+          params:
+            output_path: ${GAGE_EVAL_SAVE_DIR:-./runs}/omnidocbench_events.jsonl
diff --git a/src/gage_eval/assets/datasets/loaders/jsonl_loader.py b/src/gage_eval/assets/datasets/loaders/jsonl_loader.py
index 4a7ce66..c4b2cb1 100644
--- a/src/gage_eval/assets/datasets/loaders/jsonl_loader.py
+++ b/src/gage_eval/assets/datasets/loaders/jsonl_loader.py
@@ -54,6 +54,8 @@ def load(self, hub_handle: Optional[DatasetHubHandle], *, trace=None) -> DataSou
         doc_to_text = resolve_doc_to_callable(self.spec, "doc_to_text")
         doc_to_visual = resolve_doc_to_callable(self.spec, "doc_to_visual")
         doc_to_audio = resolve_doc_to_callable(self.spec, "doc_to_audio")
+        from gage_eval.assets.datasets import preprocessors
+        # apply_preprocess needs the registered preprocessors; preprocessors.builtin cannot be imported at module load time because of a circular registry dependency, so import it lazily here.
         records = apply_preprocess(
             raw_records,
             self.spec,
diff --git a/src/gage_eval/assets/datasets/preprocessors/__init__.py b/src/gage_eval/assets/datasets/preprocessors/__init__.py
index 05a8e0d..274d1a4 100644
--- a/src/gage_eval/assets/datasets/preprocessors/__init__.py
+++ b/src/gage_eval/assets/datasets/preprocessors/__init__.py
@@ -1,3 +1,4 @@
 """Preprocessor utilities."""
+from . import builtin
 
-__all__ = []
+__all__ = ["builtin"]
diff --git a/src/gage_eval/assets/datasets/preprocessors/builtin.py b/src/gage_eval/assets/datasets/preprocessors/builtin.py
index c90060f..e9be828 100644
--- a/src/gage_eval/assets/datasets/preprocessors/builtin.py
+++ b/src/gage_eval/assets/datasets/preprocessors/builtin.py
@@ -6,6 +6,7 @@
 from gage_eval.assets.datasets.preprocessors.default_preprocessor import DefaultPreprocessor
 from gage_eval.assets.datasets.preprocessors.multi_choice_preprocessor import MultiChoicePreprocessor as NewMultiChoice
 from gage_eval.assets.datasets.preprocessors.docvqa_preprocessor import DocVQAPreprocessor as NewDocVQA
+from gage_eval.assets.datasets.preprocessors.omnidoc_preprocessor import OmniDocPreprocessor as NewOmniDoc
 from gage_eval.assets.datasets.preprocessors.mathvista_preprocessor import (
     MathVistaPreprocessor as NewMathVista,
     MathVistaStructOnlyPreprocessor as NewMathVistaStructOnly,
@@ -72,6 +73,14 @@ class MultiChoicePreprocessor(NewMultiChoice):
 class DocVQAPreprocessor(NewDocVQA):
     pass
 
+@registry.asset(
+    "dataset_preprocessors",
+    "omnidoc_image_standardizer",
+    desc="OmniDocBench multimodal preprocessor (new)",
+    tags=("prompt", "vision", "omnidoc"),
+)
+class OmniDocPreprocessor(NewOmniDoc):
+    pass
 
 @registry.asset(
     "dataset_preprocessors",
diff --git a/src/gage_eval/assets/datasets/preprocessors/omnidoc_preprocessor.py b/src/gage_eval/assets/datasets/preprocessors/omnidoc_preprocessor.py
new file mode 100644
index 0000000..691d270
--- /dev/null
+++ b/src/gage_eval/assets/datasets/preprocessors/omnidoc_preprocessor.py
@@ -0,0 +1,121 @@
+"""Class-based OmniDocBench preprocessor (new implementation)."""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any, Dict, List
+import base64
+import mimetypes
+
+from gage_eval.assets.datasets.utils.multimodal import (
+    _derive_root,
+    collect_content_fragments,
+    embed_local_message_images,
+    embed_local_image_as_data_url,
+)
+from gage_eval.assets.datasets.preprocessors.base import BasePreprocessor
+from gage_eval.assets.datasets.utils.mapping import extract_field
+from gage_eval.assets.datasets.utils.normalization import list_images, ensure_chat_template_flags
+from gage_eval.assets.datasets.utils.answers import parse_list_from_string, enrich_answer_with_options
+from gage_eval.assets.datasets.utils.rendering import set_render_flags
+
+
+def encode_image_to_data_uri(image_path):
+    """Encode a local image file as a base64 data URI."""
+    mime_type, _ = mimetypes.guess_type(image_path)
+    if mime_type is None:
+        mime_type = 'image/jpeg'
+
+    with open(image_path, "rb") as image_file:
+        base64_data = base64.b64encode(image_file.read()).decode('utf-8')
+
+    return f"data:{mime_type};base64,{base64_data}"
+
+
+class OmniDocPreprocessor(BasePreprocessor):
+    """Normalize OmniDocBench samples with text + image content."""
+
+    def to_sample(
+        self,
+        record: Dict[str, Any],
+        *,
+        question_field: str = "question",
+        content_field: str = "image",
+        content_root: str | None = None,
+        data_path: str | None = None,
+        system_prompt: str | None = None,
+        instruction: str | None = None,
+        **kwargs: Any,
+    ) -> Dict[str, Any]:
+
+        sample = dict(record)
+        question = extract_field(sample, question_field)
+        if question is None:
+            raise ValueError(f"OmniDocBench sample missing question field '{question_field}'")
+
+        # 1. Resolve Root Path
+        # Ensure dataset metadata path is set if data_path is provided, for _derive_root fallback
+        if data_path and "_dataset_metadata" not in sample:
+            sample["_dataset_metadata"] = {"path": data_path}
+        elif data_path:
+            sample.setdefault("_dataset_metadata", {})["path"] = data_path
+
+        resolved_root = _derive_root(sample, content_root)
+        if resolved_root and isinstance(resolved_root, str) and not resolved_root.startswith(("http://", "https://", "data:")):
+            try:
+                resolved_root = str(Path(resolved_root).expanduser().resolve())
+            except Exception:
+                resolved_root = str(Path(resolved_root).expanduser())
+
+        # 2. Construct Content
+        text_content = str(question).strip()
+        if instruction:
+            text_content = f"{text_content}\n\n{instruction.strip()}"
+
+        user_content_parts = [{"type": "text", "text": text_content}]
+
+        # 3. Embed Local Images
+        fragments = collect_content_fragments(sample, content_field=content_field, content_root=resolved_root)
+
+        converted = embed_local_image_as_data_url(
+            sample,
+            image_field=content_field,
+            strict=False,
+            cache_dir=None,
+            content_root=content_root,
+        )
+
+        # 4. Build Messages
+        messages: List[Dict[str, Any]] = []
+        if system_prompt:
+            messages.append({"role": "system", "content": [{"type": "text", "text": system_prompt.strip()}]})
+
+        visual_fragments = [{"type": "image_url", "image_url": sample['image']}]
+        user_content_parts.extend(visual_fragments)
+        messages.append({"role": "user", "content": user_content_parts})
+
+        sample["messages"] = messages
+        sample["prompt"] = question
+        sample["chat_template_mode"] = "preprocess"
+        sample["rendered_by"] = "preprocess"
+        sample["template_source"] = "manual"
+        sample["cache_suffix"] = "-converted"
+
+        ensure_chat_template_flags(sample)
+
+        # 5. Finalize Metadata
+        final_image_name = fragments
+        metadata = dict(sample.get("metadata") or {})
+        metadata.pop("image_root", None)
+        metadata.update({
+            "question_field": question_field,
+            "content_field": content_field,
+        })
+        if final_image_name:
+            metadata["image_name"] = final_image_name
+        if resolved_root:
+            metadata["content_root"] = resolved_root
+
+        sample["metadata"] = metadata
+        sample["inputs"] = sample.get("inputs") or {"prompt": question}
+        return sample
+
+
+__all__ = ["OmniDocPreprocessor"]
diff --git a/src/gage_eval/evaluation/sample_loop.py b/src/gage_eval/evaluation/sample_loop.py
index 891e9bf..1bb67f3 100644
--- a/src/gage_eval/evaluation/sample_loop.py
+++ b/src/gage_eval/evaluation/sample_loop.py
@@ -123,7 +123,6 @@ def run(self, planner: TaskPlanner, role_manager: RoleManager, trace: Observabil
             daemon=True,
         )
         producer.start()
-
         if ff_mode:
             self._run_fire_and_forget(
                 sample_queue,
diff --git a/src/gage_eval/metrics/aggregators.py b/src/gage_eval/metrics/aggregators.py
index 7be027b..f9a698c 100644
--- a/src/gage_eval/metrics/aggregators.py
+++ b/src/gage_eval/metrics/aggregators.py
@@ -92,7 +92,6 @@ def finalize(self) -> AggregatedMetric:
             count=self._total_samples,
         )
 
-
 class IdentityAggregator(MetricAggregator):
     """Returns all per-sample values without aggregation.
 
@@ -117,6 +116,156 @@ def finalize(self) -> AggregatedMetric:
             metadata={"samples": [res.to_dict() for res in self._results]},
         )
 
+class OmniDocLazyCalcAggregator(MetricAggregator):
+    """Returns all OmniDocBench values by calling its toolkit.
+
+    This is mostly useful for computing the coupled OmniDocBench metrics directly.
+    """
+
+    def __init__(self, spec: MetricSpec) -> None:
+        super().__init__(spec)
+        self._results: List[MetricResult] = []
+
+    def add(self, result: MetricResult) -> None:
+        self._results.append(result)
+
+    def finalize(self) -> AggregatedMetric:
+        import os, shutil
+        omnidoc_home = os.getenv("OMNIDOCBENCH_HOME")
+        if not omnidoc_home:
+            raise EnvironmentError(
+                "Environment variable 'OMNIDOCBENCH_HOME' is not set. "
+                "Please set it to the root directory of OmniDocBench, e.g. "
+                "'export OMNIDOCBENCH_HOME=/path/to/OmniDocBench-main' in your terminal."
+            )
+        write_folder = os.path.join(omnidoc_home, 'prediction', 'gage_run')
+        if os.path.exists(write_folder):
+            shutil.rmtree(write_folder)
+        os.makedirs(write_folder, exist_ok=True)  # exist_ok guards against filesystem latency
+
+        values = {str(idx): res.to_dict() for idx, res in enumerate(self._results)}
+        logger.debug("OmniDocLazyCalcAggregator captured {} samples for metric={}", len(self._results), self.spec.metric_id)
+        for idx, value in values.items():
+            basename = os.path.basename(value['img_path'])
+            write_path = os.path.join(write_folder, f'{basename}.md')
+            with open(write_path, 'w', encoding='utf8') as f:
+                f.write(value['prediction'])
+        logger.debug("Now running the OmniDocBench toolkit for the full metrics. This may take 30+ minutes because CDM renders LaTeX formulas.")
+        bench_stdout = self.run_pdf_validation(omnidoc_home, write_folder)
+        logger.debug("OmniDocBench toolkit finished.")
+        values = self.get_metric_per_page(values, omnidoc_home, write_folder)
+        overall_dic = self.get_bench_overall(omnidoc_home, write_folder)
+
+        return AggregatedMetric(
+            metric_id=self.spec.metric_id,
+            aggregation=self.spec.aggregation or "omnidoclazycalc",
+            values=values,
+            count=len(self._results),
+            metadata={"overall": overall_dic, "samples": [res.to_dict() for res in self._results]},
+        )
+
+    def get_metric_per_page(self, values, omnidoc_home, write_folder):
+        import os, json
+        metric_names = [
+            'quick_match_text_block_per_page_edit',
+            'quick_match_display_formula_per_page_edit',
+            'quick_match_reading_order_per_page_edit',
+            'quick_match_table_per_page_edit',
+        ]
+        pr_folder_name = os.path.basename(write_folder)
+
+        metric_per_page_dic = {}
+        for metric_name in metric_names:
+            json_path = os.path.join(omnidoc_home, "result", f"{pr_folder_name}_{metric_name}.json")
+            with open(json_path, 'r', encoding='utf8') as f:
+                metric_data = json.load(f)
+            metric_per_page_dic[metric_name] = metric_data
+
+        for idx, value in values.items():
+            img_basename = os.path.basename(value['img_path'])
+            this_sample_metrics = {}
+            for metric_name in metric_names:
+                this_sample_metrics[metric_name] = metric_per_page_dic[metric_name].get(img_basename)
+            values[idx]['metrics'] = this_sample_metrics
+        return values
+
+    def get_bench_overall(self, omnidoc_home, write_folder) -> dict:
+        import os, csv, subprocess
+        import pandas as pd
+        pr_folder_name = os.path.basename(write_folder)
+        gen_res_table_py = os.path.join(omnidoc_home, "tools/generate_result_tables.py")
+        if not os.path.exists(gen_res_table_py):
+            raise FileNotFoundError(f"Could not find {gen_res_table_py}. Please export generate_result_tables.ipynb as generate_result_tables.py and have it write the overall table with df.to_csv('./overall.csv').")
+        command = ["python", "tools/generate_result_tables.py", pr_folder_name]
+        result = subprocess.run(command, cwd=omnidoc_home, capture_output=True, text=True)
+        df = pd.read_csv(os.path.join(omnidoc_home, "overall.csv"), index_col=0)
+        overall_dic = df.to_dict(orient='index')
+        return overall_dic
+
+    def run_pdf_validation(self, omnidoc_home, write_folder) -> str:
+        import os, yaml, subprocess
+        gt_json = os.path.join(omnidoc_home, "OmniDocBench.json")
+        if not os.path.exists(gt_json):
+            raise EnvironmentError(
+                "The ground-truth file OmniDocBench.json is missing; please download the dataset (not just the toolkit) "
+                f"and place OmniDocBench.json under OMNIDOCBENCH_HOME={omnidoc_home}"
+            )
+        config_dict = {
+            "end2end_eval": {
+                "metrics": {
+                    "text_block": {
+                        "metric": ["Edit_dist"]
+                    },
+                    "display_formula": {
+                        "metric": ["Edit_dist", "CDM"]
+                    },
+                    "table": {
+                        "metric": ["TEDS", "Edit_dist"]
+                    },
+                    "reading_order": {
+                        "metric": ["Edit_dist"]
+                    }
+                },
+                "dataset": {
+                    "dataset_name": "end2end_dataset",
+                    "ground_truth": {
+                        "data_path": os.path.join(omnidoc_home, "OmniDocBench.json")
+                    },
+                    "prediction": {
+                        "data_path": write_folder
+                    },
+                    "match_method": "quick_match"
+                }
+            }
+        }
+
+        temp_config_path = os.path.join(omnidoc_home, "configs", "end2end_gage.yaml")
+        with open(temp_config_path, "w", encoding="utf-8") as f:
+            yaml.dump(config_dict, f, allow_unicode=True)
+
+        command = [
+            "python",
+            "pdf_validation.py",
+            "--config", temp_config_path
+        ]
+
+        try:
+            result = subprocess.run(
+                command,
+                cwd=omnidoc_home,
+                capture_output=True,
+                text=True,
+                # check=True
+            )
+            return result.stdout
+
+        except subprocess.CalledProcessError as e:
+            print(f"ERROR code:{e.returncode}")
+            print(f"{e.stdout}")
+            print(f"{e.stderr}")
+            raise
+
 class CategoricalCountAggregator(MetricAggregator):
     """Counts occurrences of a categorical field in per-sample metadata."""
diff --git a/src/gage_eval/metrics/builtin/__init__.py b/src/gage_eval/metrics/builtin/__init__.py
index 2726333..da1e719 100644
--- a/src/gage_eval/metrics/builtin/__init__.py
+++ b/src/gage_eval/metrics/builtin/__init__.py
@@ -7,6 +7,7 @@
 )
 from gage_eval.metrics.builtin.multi_choice import MultiChoiceAccuracyMetric
 from gage_eval.metrics.builtin.docvqa_anls import DocVQAANLSMetric
+from gage_eval.metrics.builtin.ominidoc_all_metric import OmniDocBenchMetric
 from gage_eval.metrics.builtin.mmmu import MMMUAccuracyMetric
 from gage_eval.metrics.builtin.likelihood import LikelihoodMetric
 from gage_eval.metrics.builtin.ranking import RankingMetric
@@ -52,4 +53,5 @@
     "AppWorldDifficultyMetric",
     "LikelihoodMetric",
     "RankingMetric",
+    "OmniDocBenchMetric",
 ]
diff --git a/src/gage_eval/metrics/builtin/ominidoc_all_metric.py b/src/gage_eval/metrics/builtin/ominidoc_all_metric.py
new file mode 100644
index 0000000..9a4c8b4
--- /dev/null
+++ b/src/gage_eval/metrics/builtin/ominidoc_all_metric.py
@@ -0,0 +1,44 @@
+from gage_eval.metrics.base import BaseMetric
+from gage_eval.registry import registry
+
+import os, json
+
+
+@registry.asset(
+    "metrics",
+    "omnidocbenchallmetric",
+    desc="All Metric for OmniDocBench",
+    tags=("vision", "prompt", "docparsing"),
+    default_aggregation="mean",
+)
+class OmniDocBenchMetric(BaseMetric):
+    """Evaluation metric class for OmniDocBench."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def compute(self, context):
+        # OmniDocBench metrics rely on complex non-Python tooling, and not every sample contains tables or formulas, so per-sample scoring is stubbed out here.
+        # Instead, the aggregator invokes the OmniDocBench toolkit over the full run.
+        result = DummyTextResult(context)
+        return result
+
+class DummyTextResult:
+    def __init__(self, context):
+        self.context = context
+
+    def to_dict(self):
+        """Convert the result into a dictionary matching the output contract of the evaluation framework."""
+        root = self.context.sample['metadata']['content_root']
+        names = self.context.sample['metadata']['image_name'][0]
+        img_path = os.path.join(root, names)
+        return {
+            "sample_id": self.context.sample_id,
+            "prediction": self.context.model_output['answer'],
+            "img_path": img_path,
+            "metrics": None
+        }
+
+__all__ = ["OmniDocBenchMetric"]
\ No newline at end of file
diff --git a/src/gage_eval/metrics/registry.py b/src/gage_eval/metrics/registry.py
index ac42ef1..4299795 100644
--- a/src/gage_eval/metrics/registry.py
+++ b/src/gage_eval/metrics/registry.py
@@ -45,6 +45,7 @@ def __init__(self) -> None:
             MeanAggregator,
             WeightedMeanAggregator,
             CategoricalCountAggregator,
+            OmniDocLazyCalcAggregator,
         )
 
         self._aggregators: Dict[str, AggregatorFactory] = {}
@@ -52,6 +53,8 @@ def __init__(self) -> None:
         self.register_aggregator("weighted_mean", lambda spec: WeightedMeanAggregator(spec))
         self.register_aggregator("identity", lambda spec: IdentityAggregator(spec))
         self.register_aggregator("categorical_count", lambda spec: CategoricalCountAggregator(spec))
+        self.register_aggregator("omnidoclazycalc", lambda spec: OmniDocLazyCalcAggregator(spec))
+
         # Register MME-specific aggregator if available
         if MMEAccPlusAggregator is not None:
             self.register_aggregator("mme_acc_plus", lambda spec: MMEAccPlusAggregator(spec))
diff --git a/tests/preprocessors/test_ominidoc_preprocessor.py b/tests/preprocessors/test_ominidoc_preprocessor.py
new file mode 100644
index 0000000..16b4522
--- /dev/null
+++ b/tests/preprocessors/test_ominidoc_preprocessor.py
@@ -0,0 +1,38 @@
+import sys
+from pathlib import Path
+import unittest
+
+ROOT = Path(__file__).resolve().parents[2] / "src"
+if str(ROOT) not in sys.path:
+    sys.path.append(str(ROOT))
+
+from gage_eval.assets.datasets.preprocessors.omnidoc_preprocessor import OmniDocPreprocessor
+from gage_eval.assets.datasets.sample import (
+    Sample,
+)
+from dataclasses import is_dataclass, asdict
+
+
+class OmniDocPreprocessorTests(unittest.TestCase):
+    def test_to_sample(self):
+        sample_id = 'omninidoc-123'
+        prompt = "You are an AI assistant specialized in converting PDF images to Markdown format. Please follow these instructions for the conversion:\n\n 1. Text Processing:\n - Accurately recognize all text content in the PDF image without guessing or inferring.\n - Convert the recognized text into Markdown format.\n - Maintain the original document structure, including headings, paragraphs, lists, etc.\n\n 2. Mathematical Formula Processing:\n - Convert all mathematical formulas to LaTeX format.\n - Enclose inline formulas with \\( \\). For example: This is an inline formula \\( E = mc^2 \\)\n - Enclose block formulas with \\\\[ \\\\]. For example: \\[ \\frac{-b \\pm \\sqrt{b^2 - 4ac}}{2a} \\]\n\n 3. Table Processing:\n - Convert tables to HTML format.\n - Wrap the entire table with