From 98286fa071cfa3f5450cf9f71c0b3605c8a57fdb Mon Sep 17 00:00:00 2001 From: harumiWeb Date: Sat, 24 Jan 2026 15:00:46 +0900 Subject: [PATCH 01/38] feat: Initialize benchmark project with extraction and evaluation pipelines - Add example environment configuration file for OpenAI API keys. - Create .gitignore to exclude unnecessary files and directories. - Implement Makefile for easy setup and execution of benchmark tasks. - Write README.md with project overview, requirements, setup instructions, and output details. - Define project metadata and dependencies in pyproject.toml. - Set up directory structure and paths for data handling. - Develop CLI for extracting data from Excel files and querying OpenAI's API. - Implement evaluation functions to compare model responses against ground truth. - Create utility functions for file handling and JSON normalization. - Integrate OpenAI API client for text and image processing. - Add support for various extraction methods including OpenPyXL, PDF, HTML, and image rendering. --- .pre-commit-config.yaml | 2 + benchmark/.env.example | 4 + benchmark/.gitignore | 15 + benchmark/Makefile | 19 ++ benchmark/README.md | 45 +++ benchmark/pyproject.toml | 26 ++ benchmark/src/bench/__init__.py | 0 benchmark/src/bench/cli.py | 247 ++++++++++++++ benchmark/src/bench/manifest.py | 30 ++ benchmark/src/bench/paths.py | 14 + benchmark/src/eval/__init__.py | 0 benchmark/src/eval/exact_match.py | 12 + benchmark/src/eval/normalize.py | 26 ++ benchmark/src/eval/report.py | 14 + benchmark/src/llm/__init__.py | 0 benchmark/src/llm/openai_client.py | 105 ++++++ benchmark/src/llm/pricing.py | 19 ++ benchmark/src/pipeline/__init__.py | 0 benchmark/src/pipeline/common.py | 24 ++ benchmark/src/pipeline/exstruct_adapter.py | 0 benchmark/src/pipeline/html_text.py | 55 ++++ benchmark/src/pipeline/image_render.py | 34 ++ benchmark/src/pipeline/openpyxl_pandas.py | 45 +++ benchmark/src/pipeline/pdf_text.py | 48 +++ pyproject.toml | 5 + uv.lock | 363 ++++++++++++++++++++- 26 files changed, 1149 insertions(+), 3 deletions(-) create mode 100644 benchmark/.env.example create mode 100644 benchmark/.gitignore create mode 100644 benchmark/Makefile create mode 100644 benchmark/README.md create mode 100644 benchmark/pyproject.toml create mode 100644 benchmark/src/bench/__init__.py create mode 100644 benchmark/src/bench/cli.py create mode 100644 benchmark/src/bench/manifest.py create mode 100644 benchmark/src/bench/paths.py create mode 100644 benchmark/src/eval/__init__.py create mode 100644 benchmark/src/eval/exact_match.py create mode 100644 benchmark/src/eval/normalize.py create mode 100644 benchmark/src/eval/report.py create mode 100644 benchmark/src/llm/__init__.py create mode 100644 benchmark/src/llm/openai_client.py create mode 100644 benchmark/src/llm/pricing.py create mode 100644 benchmark/src/pipeline/__init__.py create mode 100644 benchmark/src/pipeline/common.py create mode 100644 benchmark/src/pipeline/exstruct_adapter.py create mode 100644 benchmark/src/pipeline/html_text.py create mode 100644 benchmark/src/pipeline/image_render.py create mode 100644 benchmark/src/pipeline/openpyxl_pandas.py create mode 100644 benchmark/src/pipeline/pdf_text.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9d84d26..56b2167 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,6 +3,7 @@ repos: rev: v0.4.5 hooks: - id: ruff + exclude: ^benchmark/ - id: ruff-format - repo: https://github.com/pre-commit/mirrors-mypy @@ -12,3 +13,4 @@ repos: 
additional_dependencies:
        - pydantic>=2.0.0
        - types-PyYAML
+        exclude: ^benchmark/
diff --git a/benchmark/.env.example b/benchmark/.env.example
new file mode 100644
index 0000000..0cd13a4
--- /dev/null
+++ b/benchmark/.env.example
@@ -0,0 +1,4 @@
+OPENAI_API_KEY=your_key_here
+# optional
+OPENAI_ORG=
+OPENAI_PROJECT=
diff --git a/benchmark/.gitignore b/benchmark/.gitignore
new file mode 100644
index 0000000..237bb9c
--- /dev/null
+++ b/benchmark/.gitignore
@@ -0,0 +1,15 @@
+# Python-generated files
+__pycache__/
+*.py[oc]
+build/
+dist/
+drafts/
+wheels/
+*.egg-info
+
+# Virtual environments
+.venv
+data/raw/
+*.log
+outputs/
+.env
\ No newline at end of file
diff --git a/benchmark/Makefile b/benchmark/Makefile
new file mode 100644
index 0000000..3024f35
--- /dev/null
+++ b/benchmark/Makefile
@@ -0,0 +1,19 @@
+.PHONY: setup extract ask eval report all
+
+setup:
+	python -m pip install -U pip
+	pip install -e .
+
+extract:
+	exbench extract --case all --method all
+
+ask:
+	exbench ask --case all --method all --model gpt-4o
+
+eval:
+	exbench eval --case all --method all
+
+report:
+	exbench report
+
+all: extract ask eval report
diff --git a/benchmark/README.md b/benchmark/README.md
new file mode 100644
index 0000000..e7735c6
--- /dev/null
+++ b/benchmark/README.md
@@ -0,0 +1,45 @@
+# ExStruct Benchmark
+
+This benchmark compares methods for answering questions about Excel documents using GPT-4o:
+
+- exstruct
+- openpyxl
+- pdf (xlsx->pdf->text)
+- html (xlsx->html->table text)
+- image_vlm (xlsx->pdf->png->GPT-4o vision)
+
+## Requirements
+
+- Python 3.11+
+- LibreOffice (`soffice` in PATH)
+- OPENAI_API_KEY in `.env`
+
+## Setup
+
+```bash
+cd benchmark
+cp .env.example .env
+pip install -e .
+```
+
+## Run
+
+```bash
+make all
+```
+
+Outputs:
+
+- outputs/extracted/\* : extracted context (text or images)
+- outputs/prompts/\*.jsonl
+- outputs/responses/\*.jsonl
+- outputs/results/results.csv
+- outputs/results/report.md
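+## Manifest
+
+`exbench` reads its cases from `data/manifest.json`, validated by `src/bench/manifest.py`. A minimal sketch of one case — the file names, question, and truth payload below are illustrative, not files shipped with the repo:
+
+```json
+{
+  "cases": [
+    {
+      "id": "sales_q1",
+      "type": "single_table",
+      "xlsx": "data/raw/sales_q1.xlsx",
+      "question": "Return {\"total_sales\": <number>} as JSON.",
+      "truth": "data/truth/sales_q1.json",
+      "sheet_scope": ["Sheet1"],
+      "render": {"dpi": 200, "max_pages": 6}
+    }
+  ]
+}
+```
+
+`sheet_scope` and `render` are optional; they default to all sheets and to `dpi=200`, `max_pages=6`.
+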
+## Notes
+
+- GPT-4o Responses API supports text and image inputs. See docs:
+  - [https://platform.openai.com/docs/api-reference/responses](https://platform.openai.com/docs/api-reference/responses)
+  - [https://platform.openai.com/docs/guides/images-vision](https://platform.openai.com/docs/guides/images-vision)
+- Pricing for gpt-4o used in cost estimation:
+  - https://platform.openai.com/docs/models/compare?model=gpt-4o
diff --git a/benchmark/pyproject.toml b/benchmark/pyproject.toml
new file mode 100644
index 0000000..e390725
--- /dev/null
+++ b/benchmark/pyproject.toml
@@ -0,0 +1,26 @@
+[project]
+name = "benchmark"
+version = "0.1.0"
+description = "Benchmark comparing Excel extraction methods for GPT-4o question answering"
+readme = "README.md"
+requires-python = ">=3.11"
+dependencies = [
+    "beautifulsoup4>=4.14.3",
+    "lxml>=6.0.2",
+    "openai>=2.15.0",
+    "openpyxl>=3.1.5",
+    "pandas>=2.3.3",
+    "pydantic>=2.12.5",
+    "pymupdf>=1.26.7",
+    "python-dotenv>=1.2.1",
+    "rich>=14.2.0",
+    "tabulate>=0.9.0",  # required by DataFrame.to_markdown in `exbench report`
+    "typer>=0.21.1",
+]
+
+[project.scripts]
+exbench = "bench.cli:app"
+
+[dependency-groups]
+dev = [
+    "ruff>=0.14.8",
+]
diff --git a/benchmark/src/bench/__init__.py b/benchmark/src/bench/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/benchmark/src/bench/cli.py b/benchmark/src/bench/cli.py
new file mode 100644
index 0000000..3cd643d
--- /dev/null
+++ b/benchmark/src/bench/cli.py
@@ -0,0 +1,247 @@
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any
+
+from rich import print
+from rich.console import Console
+import typer
+
+# eval/, llm/ and pipeline/ are sibling top-level packages under src/,
+# so they must be imported absolutely, not relative to bench.
+from eval.exact_match import exact_match
+from eval.normalize import normalize_json_text
+from eval.report import write_results_csv
+from llm.openai_client import OpenAIResponsesClient
+from pipeline.common import ensure_dir, sha256_text, write_json
+from pipeline.exstruct_adapter import extract_exstruct
+from pipeline.html_text import html_to_text, xlsx_to_html
+from pipeline.image_render import xlsx_to_pngs_via_pdf
+from pipeline.openpyxl_pandas import extract_openpyxl
+from pipeline.pdf_text import pdf_to_text, xlsx_to_pdf
+
+from .manifest import Case, load_manifest
+from .paths import DATA_DIR, EXTRACTED_DIR, PROMPTS_DIR, RESPONSES_DIR, RESULTS_DIR
+
+app = typer.Typer(add_completion=False)
+console = Console()
+
+METHODS_TEXT = ["exstruct", "openpyxl", "pdf", "html"]
+METHODS_ALL = ["exstruct", "openpyxl", "pdf", "html", "image_vlm"]
+
+
+def _manifest_path() -> Path:
+    return DATA_DIR / "manifest.json"
+
+
+def _select_cases(manifest_cases: list[Case], case: str) -> list[Case]:
+    if case == "all":
+        return manifest_cases
+    ids = {c.strip() for c in case.split(",")}
+    return [c for c in manifest_cases if c.id in ids]
+
+
+def _select_methods(method: str) -> list[str]:
+    if method == "all":
+        return METHODS_ALL
+    return [m.strip() for m in method.split(",")]
+
+
+@app.command()
+def extract(case: str = "all", method: str = "all") -> None:
+    mf = load_manifest(_manifest_path())
+    cases = _select_cases(mf.cases, case)
+    methods = _select_methods(method)
+
+    for c in cases:
+        xlsx = Path(c.xlsx)
+        console.rule(f"EXTRACT {c.id} ({xlsx.name})")
+
+        if "exstruct" in methods:
+            out_txt = EXTRACTED_DIR / "exstruct" / f"{c.id}.txt"
+            extract_exstruct(xlsx, out_txt, c.sheet_scope)
+            print(f"[green]exstruct -> {out_txt}[/green]")
+
+        if "openpyxl" in methods:
+            out_txt = EXTRACTED_DIR / "openpyxl" / f"{c.id}.txt"
+            extract_openpyxl(xlsx, out_txt, c.sheet_scope)
+            print(f"[green]openpyxl -> {out_txt}[/green]")
+
+        if "pdf" in methods:
+            out_pdf = EXTRACTED_DIR / "pdf" / f"{c.id}.pdf"
+            out_txt = EXTRACTED_DIR / "pdf" / f"{c.id}.txt"
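+            # LibreOffice produces the PDF; PyMuPDF then pulls its text layer.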
+            xlsx_to_pdf(xlsx, out_pdf)
+            pdf_to_text(out_pdf, out_txt)
+            print(f"[green]pdf -> {out_txt}[/green]")
+
+        if "html" in methods:
+            out_html = EXTRACTED_DIR / "html" / f"{c.id}.html"
+            out_txt = EXTRACTED_DIR / "html" / f"{c.id}.txt"
+            xlsx_to_html(xlsx, out_html)
+            html_to_text(out_html, out_txt)
+            print(f"[green]html -> {out_txt}[/green]")
+
+        if "image_vlm" in methods:
+            out_dir = EXTRACTED_DIR / "image_vlm" / c.id
+            pngs = xlsx_to_pngs_via_pdf(
+                xlsx, out_dir, dpi=c.render.dpi, max_pages=c.render.max_pages
+            )
+            write_json(out_dir / "images.json", {"images": [str(p) for p in pngs]})
+            print(f"[green]image_vlm -> {len(pngs)} png(s) in {out_dir}[/green]")
+
+
+@app.command()
+def ask(case: str = "all", method: str = "all", model: str = "gpt-4o") -> None:
+    mf = load_manifest(_manifest_path())
+    cases = _select_cases(mf.cases, case)
+    methods = _select_methods(method)
+
+    client = OpenAIResponsesClient()
+    ensure_dir(PROMPTS_DIR)
+    ensure_dir(RESPONSES_DIR)
+
+    for c in cases:
+        console.rule(f"ASK {c.id}")
+        q = c.question
+
+        for m in methods:
+            prompt_rec: dict[str, Any] = {
+                "case_id": c.id,
+                "method": m,
+                "model": model,
+                "question": q,
+            }
+            resp_rec: dict[str, Any] = {"case_id": c.id, "method": m, "model": model}
+
+            if m == "image_vlm":
+                img_dir = EXTRACTED_DIR / "image_vlm" / c.id
+                imgs = json.loads(
+                    (img_dir / "images.json").read_text(encoding="utf-8")
+                )["images"]
+                img_paths = [Path(p) for p in imgs]
+                prompt_hash = sha256_text(
+                    q + "|" + "|".join([p.name for p in img_paths])
+                )
+                prompt_rec["prompt_hash"] = prompt_hash
+                prompt_rec["images"] = [p.name for p in img_paths]
+
+                res = client.ask_images(model=model, question=q, image_paths=img_paths)
+
+            else:
+                txt_path = EXTRACTED_DIR / m / f"{c.id}.txt"
+                context = txt_path.read_text(encoding="utf-8")
+                prompt_hash = sha256_text(q + "|" + context)
+                prompt_rec["prompt_hash"] = prompt_hash
+
+                res = client.ask_text(model=model, question=q, context_text=context)
+
+            # save prompt/response
+            prompt_file = PROMPTS_DIR / f"{c.id}.jsonl"
+            resp_file = RESPONSES_DIR / f"{c.id}.jsonl"
+
+            prompt_rec_line = json.dumps(prompt_rec, ensure_ascii=False)
+            resp_rec.update(
+                {
+                    "prompt_hash": prompt_hash,
+                    "text": res.text,
+                    "input_tokens": res.input_tokens,
+                    "output_tokens": res.output_tokens,
+                    "cost_usd": res.cost_usd,
+                    "raw": res.raw,
+                }
+            )
+            resp_rec_line = json.dumps(resp_rec, ensure_ascii=False)
+
+            with prompt_file.open("a", encoding="utf-8") as f:
+                f.write(prompt_rec_line + "\n")
+            with resp_file.open("a", encoding="utf-8") as f:
+                f.write(resp_rec_line + "\n")
+
+            print(
+                f"[cyan]{c.id} {m}[/cyan] tokens(in/out)={res.input_tokens}/{res.output_tokens} cost=${res.cost_usd:.6f}"
+            )
+
+
+@app.command()
+def eval(case: str = "all", method: str = "all") -> None:
+    mf = load_manifest(_manifest_path())
+    cases = _select_cases(mf.cases, case)
+    methods = _select_methods(method)
+
+    rows: list[dict[str, Any]] = []
+
+    for c in cases:
+        truth = json.loads(Path(c.truth).read_text(encoding="utf-8"))
+        resp_file = RESPONSES_DIR / f"{c.id}.jsonl"
+        if not resp_file.exists():
+            print(f"[yellow]skip: no responses for {c.id}[/yellow]")
+            continue
+
+        # Keep only the latest record per method (the last JSONL line wins).
+        latest: dict[str, dict[str, Any]] = {}
+        for line in resp_file.read_text(encoding="utf-8").splitlines():
+            rec = json.loads(line)
+            if rec["method"] in methods:
+                latest[rec["method"]] = rec
+
+        for m, rec in latest.items():
+            ok = False
+            pred_obj = None
+            err = None
+            try:
+                pred_obj = normalize_json_text(rec["text"])
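+                # exact_match compares canonical JSON dumps; any parse failure
+                # above leaves ok=False and records the error instead.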
+                ok = exact_match(pred_obj, truth)
+            except Exception as e:
+                err = str(e)
+
+            rows.append(
+                {
+                    "case_id": c.id,
+                    "type": c.type,
+                    "method": m,
+                    "model": rec.get("model"),
+                    "ok": ok,
+                    "input_tokens": rec.get("input_tokens", 0),
+                    "output_tokens": rec.get("output_tokens", 0),
+                    "cost_usd": rec.get("cost_usd", 0.0),
+                    "error": err,
+                }
+            )
+
+    out_csv = RESULTS_DIR / "results.csv"
+    write_results_csv(rows, out_csv)
+    print(f"[green]Wrote {out_csv} ({len(rows)} rows)[/green]")
+
+
+@app.command()
+def report() -> None:
+    """
+    Build a quick Markdown report (to be fleshed out later if needed).
+    """
+    csv_path = RESULTS_DIR / "results.csv"
+    if not csv_path.exists():
+        raise typer.Exit(code=1)
+
+    import pandas as pd
+
+    df = pd.read_csv(csv_path)
+    # Aggregate per method: accuracy, mean input tokens, mean cost.
+    g = (
+        df.groupby("method")
+        .agg(
+            acc=("ok", "mean"),
+            avg_in=("input_tokens", "mean"),
+            avg_cost=("cost_usd", "mean"),
+            n=("ok", "count"),
+        )
+        .reset_index()
+    )
+
+    md_lines = []
+    md_lines.append("# Benchmark Report")
+    md_lines.append("")
+    md_lines.append("## Summary by method")
+    md_lines.append("")
+    md_lines.append(g.to_markdown(index=False))
+    md_lines.append("")
+    out_md = RESULTS_DIR / "report.md"
+    out_md.write_text("\n".join(md_lines), encoding="utf-8")
+    print(f"[green]Wrote {out_md}[/green]")
diff --git a/benchmark/src/bench/manifest.py b/benchmark/src/bench/manifest.py
new file mode 100644
index 0000000..19e12e5
--- /dev/null
+++ b/benchmark/src/bench/manifest.py
@@ -0,0 +1,30 @@
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+from pydantic import BaseModel
+
+
+class RenderConfig(BaseModel):
+    dpi: int = 200
+    max_pages: int = 6
+
+
+class Case(BaseModel):
+    id: str
+    type: str
+    xlsx: str
+    question: str
+    truth: str
+    sheet_scope: list[str] | None = None
+    render: RenderConfig = RenderConfig()
+
+
+class Manifest(BaseModel):
+    cases: list[Case]
+
+
+def load_manifest(path: Path) -> Manifest:
+    data = json.loads(path.read_text(encoding="utf-8"))
+    return Manifest(**data)
diff --git a/benchmark/src/bench/paths.py b/benchmark/src/bench/paths.py
new file mode 100644
index 0000000..082db16
--- /dev/null
+++ b/benchmark/src/bench/paths.py
@@ -0,0 +1,14 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+ROOT = Path(__file__).resolve().parents[2]  # benchmark/
+DATA_DIR = ROOT / "data"
+RAW_DIR = DATA_DIR / "raw"
+TRUTH_DIR = DATA_DIR / "truth"
+
+OUT_DIR = ROOT / "outputs"
+EXTRACTED_DIR = OUT_DIR / "extracted"
+PROMPTS_DIR = OUT_DIR / "prompts"
+RESPONSES_DIR = OUT_DIR / "responses"
+RESULTS_DIR = OUT_DIR / "results"
diff --git a/benchmark/src/eval/__init__.py b/benchmark/src/eval/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/benchmark/src/eval/exact_match.py b/benchmark/src/eval/exact_match.py
new file mode 100644
index 0000000..da7dbdc
--- /dev/null
+++ b/benchmark/src/eval/exact_match.py
@@ -0,0 +1,12 @@
+from __future__ import annotations
+
+import json
+from typing import Any
+
+
+def canonical(obj: Any) -> str:
+    return json.dumps(obj, ensure_ascii=False, sort_keys=True, separators=(",", ":"))
+
+
+def exact_match(a: Any, b: Any) -> bool:
+    return canonical(a) == canonical(b)
diff --git a/benchmark/src/eval/normalize.py b/benchmark/src/eval/normalize.py
new file mode 100644
index 0000000..92f8d5f
--- /dev/null
+++ b/benchmark/src/eval/normalize.py
@@ -0,0 +1,26 @@
+from __future__ import annotations
+
+import json
+import re
+from typing import Any
+
+
+def _strip_code_fences(s: str) -> str:
+    s = s.strip()
+    s = re.sub(r"^```(json)?\s*", "", s)
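+    # Drop a trailing closing fence as well, if the model emitted one.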
+    s = re.sub(r"\s*```$", "", s)
+    return s.strip()
+
+
+def normalize_json_text(s: str) -> Any:
+    """
+    Parse LLM output as JSON and return the normalized Python object.
+    """
+    s = _strip_code_fences(s)
+    # Salvage step for stray surrounding text: keep only the first {...} span.
+    if "{" in s and "}" in s:
+        start = s.find("{")
+        end = s.rfind("}")
+        s = s[start : end + 1]
+    obj = json.loads(s)
+    return obj
diff --git a/benchmark/src/eval/report.py b/benchmark/src/eval/report.py
new file mode 100644
index 0000000..3e26836
--- /dev/null
+++ b/benchmark/src/eval/report.py
@@ -0,0 +1,14 @@
+from __future__ import annotations
+
+import csv
+from pathlib import Path
+from typing import Any
+
+
+def write_results_csv(rows: list[dict[str, Any]], out_csv: Path) -> None:
+    out_csv.parent.mkdir(parents=True, exist_ok=True)
+    keys = list(rows[0].keys()) if rows else []
+    with out_csv.open("w", encoding="utf-8", newline="") as f:
+        w = csv.DictWriter(f, fieldnames=keys)
+        w.writeheader()
+        w.writerows(rows)
diff --git a/benchmark/src/llm/__init__.py b/benchmark/src/llm/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/benchmark/src/llm/openai_client.py b/benchmark/src/llm/openai_client.py
new file mode 100644
index 0000000..7ca4f8a
--- /dev/null
+++ b/benchmark/src/llm/openai_client.py
@@ -0,0 +1,105 @@
+from __future__ import annotations
+
+import base64
+from dataclasses import dataclass
+import json
+from pathlib import Path
+from typing import Any
+
+from dotenv import load_dotenv
+from openai import OpenAI
+
+from .pricing import estimate_cost_usd
+
+
+@dataclass
+class LLMResult:
+    text: str
+    input_tokens: int
+    output_tokens: int
+    cost_usd: float
+    raw: dict[str, Any]
+
+
+def _png_to_data_url(png_path: Path) -> str:
+    b = png_path.read_bytes()
+    b64 = base64.b64encode(b).decode("ascii")
+    return f"data:image/png;base64,{b64}"
+
+
+class OpenAIResponsesClient:
+    def __init__(self) -> None:
+        load_dotenv()
+        self.client = OpenAI()
+
+    def ask_text(self, *, model: str, question: str, context_text: str) -> LLMResult:
+        """
+        Responses API: text-only
+        """
+        resp = self.client.responses.create(
+            model=model,
+            input=[
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "input_text",
+                            "text": "You are a strict JSON extraction engine. Output JSON only.",
+                        },
+                        {"type": "input_text", "text": f"[QUESTION]\n{question}"},
+                        {"type": "input_text", "text": f"[CONTEXT]\n{context_text}"},
+                    ],
+                }
+            ],
+        )
+
+        text = resp.output_text  # SDK helper
+        # usage is a pydantic model on the SDK response, not a dict.
+        usage = getattr(resp, "usage", None)
+        in_tok = int(getattr(usage, "input_tokens", 0) or 0)
+        out_tok = int(getattr(usage, "output_tokens", 0) or 0)
+        cost = estimate_cost_usd(model, in_tok, out_tok)
+
+        raw = json.loads(resp.model_dump_json())
+        return LLMResult(
+            text=text,
+            input_tokens=in_tok,
+            output_tokens=out_tok,
+            cost_usd=cost,
+            raw=raw,
+        )
+
+    def ask_images(
+        self, *, model: str, question: str, image_paths: list[Path]
+    ) -> LLMResult:
+        """
+        Responses API: image + text
+        """
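+        # One user message: instruction and question as input_text items,
+        # then each rendered page appended as an input_image data URL.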
+        content: list[dict[str, Any]] = [
+            {
+                "type": "input_text",
+                "text": "You are a strict JSON extraction engine. Output JSON only.",
+            },
+            {"type": "input_text", "text": f"[QUESTION]\n{question}"},
+        ]
+        for p in image_paths:
+            content.append({"type": "input_image", "image_url": _png_to_data_url(p)})
+
+        resp = self.client.responses.create(
+            model=model,
+            input=[{"role": "user", "content": content}],
+        )
+
+        text = resp.output_text
+        # usage is a pydantic model on the SDK response, not a dict.
+        usage = getattr(resp, "usage", None)
+        in_tok = int(getattr(usage, "input_tokens", 0) or 0)
+        out_tok = int(getattr(usage, "output_tokens", 0) or 0)
+        cost = estimate_cost_usd(model, in_tok, out_tok)
+
+        raw = json.loads(resp.model_dump_json())
+        return LLMResult(
+            text=text,
+            input_tokens=in_tok,
+            output_tokens=out_tok,
+            cost_usd=cost,
+            raw=raw,
+        )
diff --git a/benchmark/src/llm/pricing.py b/benchmark/src/llm/pricing.py
new file mode 100644
index 0000000..e409eef
--- /dev/null
+++ b/benchmark/src/llm/pricing.py
@@ -0,0 +1,19 @@
+from __future__ import annotations
+
+# gpt-4o pricing: per 1M tokens
+# Input $2.50 / 1M, Output $10.00 / 1M (cached input not used here)
+# Source: model compare page
+# https://platform.openai.com/docs/models/compare?model=gpt-4o
+# (Cited in the README/report; the code just keeps the constants.)
+GPT4O_INPUT_PER_1M = 2.50
+GPT4O_OUTPUT_PER_1M = 10.00
+
+
+def estimate_cost_usd(model: str, input_tokens: int, output_tokens: int) -> float:
+    if model != "gpt-4o":
+        # The benchmark assumes a single model; turn this into a table when extending.
+        raise ValueError(f"Unsupported model for cost table: {model}")
+
+    # e.g. 10,000 input + 500 output tokens -> 0.025 + 0.005 = $0.03
+    return (input_tokens / 1_000_000) * GPT4O_INPUT_PER_1M + (
+        output_tokens / 1_000_000
+    ) * GPT4O_OUTPUT_PER_1M
diff --git a/benchmark/src/pipeline/__init__.py b/benchmark/src/pipeline/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/benchmark/src/pipeline/common.py b/benchmark/src/pipeline/common.py
new file mode 100644
index 0000000..e08b8ff
--- /dev/null
+++ b/benchmark/src/pipeline/common.py
@@ -0,0 +1,24 @@
+from __future__ import annotations
+
+import hashlib
+import json
+from pathlib import Path
+from typing import Any
+
+
+def ensure_dir(p: Path) -> None:
+    p.mkdir(parents=True, exist_ok=True)
+
+
+def sha256_text(s: str) -> str:
+    return hashlib.sha256(s.encode("utf-8")).hexdigest()
+
+
+def write_text(p: Path, text: str) -> None:
+    ensure_dir(p.parent)
+    p.write_text(text, encoding="utf-8")
+
+
+def write_json(p: Path, obj: Any) -> None:
+    ensure_dir(p.parent)
+    p.write_text(json.dumps(obj, ensure_ascii=False, indent=2), encoding="utf-8")
diff --git a/benchmark/src/pipeline/exstruct_adapter.py b/benchmark/src/pipeline/exstruct_adapter.py
new file mode 100644
index 0000000..e69de29
diff --git a/benchmark/src/pipeline/html_text.py b/benchmark/src/pipeline/html_text.py
new file mode 100644
index 0000000..3734fb0
--- /dev/null
+++ b/benchmark/src/pipeline/html_text.py
@@ -0,0 +1,55 @@
+from __future__ import annotations
+
+from pathlib import Path
+import subprocess
+
+from bs4 import BeautifulSoup
+
+from .common import ensure_dir, write_text
+
+
+def xlsx_to_html(xlsx_path: Path, out_html: Path) -> None:
+    ensure_dir(out_html.parent)
+    cmd = [
+        "soffice",
+        "--headless",
+        "--nologo",
+        "--nolockcheck",
+        "--convert-to",
+        "html",
+        "--outdir",
+        str(out_html.parent),
+        str(xlsx_path),
+    ]
+    subprocess.run(cmd, check=True)
+    produced = out_html.parent / (xlsx_path.stem + ".html")
+    if not produced.exists():
+        produced = out_html.parent / (xlsx_path.stem + ".htm")
+    produced.replace(out_html)
+
+
+def html_to_text(html_path: Path, out_txt: Path) -> None:
+    soup = BeautifulSoup(html_path.read_text(encoding="utf-8", errors="ignore"), "lxml")
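+    # Emit the same [DOC_META]/[CONTENT] framing as the other extractors so
+    # prompts stay comparable across methods.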
+    # Excel-exported HTML is table-centric: enumerate the cell text of every table.
+    tables = soup.find_all("table")
+    lines: list[str] = []
+    lines.append("[DOC_META]")
+    lines.append(f"source={html_path.name}")
+    lines.append("method=html_text")
+    lines.append("")
+    lines.append("[CONTENT]")
+
+    for t_i, table in enumerate(tables, start=1):
+        lines.append(f"\n# TABLE {t_i}")
+        rows = table.find_all("tr")
+        for r in rows:
+            cells = r.find_all(["td", "th"])
+            vals = []
+            for c in cells:
+                txt = " ".join(c.get_text(separator=" ", strip=True).split())
+                vals.append(txt)
+            if any(v for v in vals):
+                lines.append(" | ".join(vals))
+
+    write_text(out_txt, "\n".join(lines).strip() + "\n")
diff --git a/benchmark/src/pipeline/image_render.py b/benchmark/src/pipeline/image_render.py
new file mode 100644
index 0000000..ec99e2e
--- /dev/null
+++ b/benchmark/src/pipeline/image_render.py
@@ -0,0 +1,34 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+import fitz  # PyMuPDF
+
+from .common import ensure_dir
+from .pdf_text import xlsx_to_pdf
+
+
+def xlsx_to_pngs_via_pdf(
+    xlsx_path: Path, out_dir: Path, dpi: int = 200, max_pages: int = 6
+) -> list[Path]:
+    """
+    xlsx -> pdf (LibreOffice) -> png (PyMuPDF render)
+    The PNGs feed the VLM input; no OCR is performed.
+    """
+    ensure_dir(out_dir)
+    tmp_pdf = out_dir / f"{xlsx_path.stem}.pdf"
+    xlsx_to_pdf(xlsx_path, tmp_pdf)
+
+    doc = fitz.open(tmp_pdf)
+    zoom = dpi / 72.0  # PDF points are 72 per inch
+    mat = fitz.Matrix(zoom, zoom)
+
+    paths: list[Path] = []
+    for i in range(min(doc.page_count, max_pages)):
+        page = doc.load_page(i)
+        pix = page.get_pixmap(matrix=mat, alpha=False)
+        p = out_dir / f"page_{i + 1:02d}.png"
+        pix.save(p)
+        paths.append(p)
+
+    return paths
diff --git a/benchmark/src/pipeline/openpyxl_pandas.py b/benchmark/src/pipeline/openpyxl_pandas.py
new file mode 100644
index 0000000..ec73dfe
--- /dev/null
+++ b/benchmark/src/pipeline/openpyxl_pandas.py
@@ -0,0 +1,45 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+import openpyxl
+
+from .common import write_text
+
+
+def extract_openpyxl(
+    xlsx_path: Path, out_txt: Path, sheet_scope: list[str] | None = None
+) -> None:
+    wb = openpyxl.load_workbook(xlsx_path, data_only=True)
+    sheets = sheet_scope or wb.sheetnames
+
+    lines: list[str] = []
+    lines.append("[DOC_META]")
+    lines.append(f"source={xlsx_path.name}")
+    lines.append("method=openpyxl")
+    lines.append("")
+    lines.append("[CONTENT]")
+
+    for sname in sheets:
+        if sname not in wb.sheetnames:
+            continue
+        ws = wb[sname]
+        lines.append(f"\n# SHEET: {sname}")
+        max_row = ws.max_row or 1
+        max_col = ws.max_column or 1
+
+        for r in range(1, max_row + 1):
+            row_cells = []
+            for c in range(1, max_col + 1):
+                v = ws.cell(r, c).value
+                if v is None:
+                    continue
+                txt = str(v).strip()
+                if not txt:
+                    continue
+                # Record with the cell coordinate so a human can verify it later.
+                row_cells.append(f"R{r}C{c}:{txt}")
+            if row_cells:
+                lines.append(" | ".join(row_cells))
+
+    write_text(out_txt, "\n".join(lines).strip() + "\n")
diff --git a/benchmark/src/pipeline/pdf_text.py b/benchmark/src/pipeline/pdf_text.py
new file mode 100644
index 0000000..73c3786
--- /dev/null
+++ b/benchmark/src/pipeline/pdf_text.py
@@ -0,0 +1,48 @@
+from __future__ import annotations
+
+from pathlib import Path
+import subprocess
+
+import fitz  # PyMuPDF
+
+from .common import ensure_dir, write_text
+
+
+def xlsx_to_pdf(xlsx_path: Path, out_pdf: Path) -> None:
+    ensure_dir(out_pdf.parent)
+    # LibreOffice headless convert
+    # soffice --headless --convert-to pdf --outdir
+    cmd = [
+        "soffice",
+        "--headless",
+        "--nologo",
+        "--nolockcheck",
+        "--convert-to",
+        "pdf",
+        "--outdir",
+        
str(out_pdf.parent), + str(xlsx_path), + ] + subprocess.run(cmd, check=True) + produced = out_pdf.parent / (xlsx_path.stem + ".pdf") + produced.replace(out_pdf) + + +def pdf_to_text(pdf_path: Path, out_txt: Path) -> None: + doc = fitz.open(pdf_path) + parts: list[str] = [] + for i in range(doc.page_count): + page = doc.load_page(i) + parts.append(f"\n# PAGE {i + 1}") + parts.append(page.get_text("text")) + text = "\n".join(parts).strip() + + lines: list[str] = [] + lines.append("[DOC_META]") + lines.append(f"source={pdf_path.name}") + lines.append("method=pdf_text") + lines.append("") + lines.append("[CONTENT]") + lines.append(text) + + write_text(out_txt, "\n".join(lines).strip() + "\n") diff --git a/pyproject.toml b/pyproject.toml index 6fdcbeb..535ba96 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -141,3 +141,8 @@ codecov-unit = "codecov-cli upload-process -f coverage.xml -F unit -C %CODECOV_S codecov-com = "codecov-cli upload-process -f coverage.xml -F com -C %CODECOV_SHA% -t %CODECOV_TOKEN%" docs = "mkdocs serve" build-docs = "mkdocs build && python scripts/gen_json_schema.py && python scripts/gen_model_docs.py" + +[tool.uv.workspace] +members = [ + "benchmark", +] diff --git a/uv.lock b/uv.lock index 56c5b3d..843b44e 100644 --- a/uv.lock +++ b/uv.lock @@ -6,6 +6,12 @@ resolution-markers = [ "python_full_version < '3.12'", ] +[manifest] +members = [ + "benchmark", + "exstruct", +] + [[package]] name = "annotated-types" version = "0.7.0" @@ -79,6 +85,58 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/02/e3/a4fa1946722c4c7b063cc25043a12d9ce9b4323777f89643be74cef2993c/backrefs-6.1-py39-none-any.whl", hash = "sha256:a9e99b8a4867852cad177a6430e31b0f6e495d65f8c6c134b68c14c3c95bf4b0", size = 381058, upload-time = "2025-11-15T14:52:06.698Z" }, ] +[[package]] +name = "beautifulsoup4" +version = "4.14.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "soupsieve" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c3/b0/1c6a16426d389813b48d95e26898aff79abbde42ad353958ad95cc8c9b21/beautifulsoup4-4.14.3.tar.gz", hash = "sha256:6292b1c5186d356bba669ef9f7f051757099565ad9ada5dd630bd9de5fa7fb86", size = 627737, upload-time = "2025-11-30T15:08:26.084Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/1a/39/47f9197bdd44df24d67ac8893641e16f386c984a0619ef2ee4c51fbbc019/beautifulsoup4-4.14.3-py3-none-any.whl", hash = "sha256:0918bfe44902e6ad8d57732ba310582e98da931428d231a5ecb9e7c703a735bb", size = 107721, upload-time = "2025-11-30T15:08:24.087Z" }, +] + +[[package]] +name = "benchmark" +version = "0.1.0" +source = { virtual = "benchmark" } +dependencies = [ + { name = "beautifulsoup4" }, + { name = "lxml" }, + { name = "openai" }, + { name = "openpyxl" }, + { name = "pandas" }, + { name = "pydantic" }, + { name = "pymupdf" }, + { name = "python-dotenv" }, + { name = "rich" }, + { name = "typer" }, +] + +[package.dev-dependencies] +dev = [ + { name = "ruff" }, +] + +[package.metadata] +requires-dist = [ + { name = "beautifulsoup4", specifier = ">=4.14.3" }, + { name = "lxml", specifier = ">=6.0.2" }, + { name = "openai", specifier = ">=2.15.0" }, + { name = "openpyxl", specifier = ">=3.1.5" }, + { name = "pandas", specifier = ">=2.3.3" }, + { name = "pydantic", specifier = ">=2.12.5" }, + { name = "pymupdf", specifier = ">=1.26.7" }, + { name = "python-dotenv", specifier = ">=1.2.1" }, + { name = "rich", specifier = ">=14.2.0" }, + { name = "typer", specifier = ">=0.21.1" }, +] + 
+[package.metadata.requires-dev] +dev = [{ name = "ruff", specifier = ">=0.14.8" }] + [[package]] name = "certifi" version = "2025.11.12" @@ -441,6 +499,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/33/6b/e0547afaf41bf2c42e52430072fa5658766e3d65bd4b03a563d1b6336f57/distlib-0.4.0-py2.py3-none-any.whl", hash = "sha256:9659f7d87e46584a30b5780e43ac7a2143098441670ff0a49d5f9034c54a6c16", size = 469047, upload-time = "2025-07-17T16:51:58.613Z" }, ] +[[package]] +name = "distro" +version = "1.9.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/fc/f8/98eea607f65de6527f8a2e8885fc8015d3e6f5775df186e443e0964a11c3/distro-1.9.0.tar.gz", hash = "sha256:2fa77c6fd8940f116ee1d6b94a2f90b13b5ea8d019b98bc8bafdcabcdd9bdbed", size = 60722, upload-time = "2023-12-24T09:54:32.31Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/12/b3/231ffd4ab1fc9d679809f356cebee130ac7daa00d6d6f3206dd4fd137e9e/distro-1.9.0-py3-none-any.whl", hash = "sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2", size = 20277, upload-time = "2023-12-24T09:54:30.421Z" }, +] + [[package]] name = "et-xmlfile" version = "2.0.0" @@ -452,7 +519,7 @@ wheels = [ [[package]] name = "exstruct" -version = "0.4.1" +version = "0.4.2" source = { editable = "." } dependencies = [ { name = "numpy" }, @@ -505,8 +572,8 @@ dev = [ requires-dist = [ { name = "httpx", marker = "extra == 'all'", specifier = ">=0.27,<1.0" }, { name = "httpx", marker = "extra == 'mcp'", specifier = ">=0.27,<1.0" }, - { name = "mcp", marker = "extra == 'all'", specifier = ">=1.6.0,<2.0.0" }, - { name = "mcp", marker = "extra == 'mcp'", specifier = ">=1.6.0,<2.0.0" }, + { name = "mcp", marker = "extra == 'all'", specifier = ">=1.25.0,<2.0.0" }, + { name = "mcp", marker = "extra == 'mcp'", specifier = ">=1.25.0,<2.0.0" }, { name = "numpy", specifier = ">=2.3.5" }, { name = "openpyxl", specifier = ">=3.1.5" }, { name = "pandas", specifier = ">=2.3.3" }, @@ -736,6 +803,91 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/62/a1/3d680cbfd5f4b8f15abc1d571870c5fc3e594bb582bc3b64ea099db13e56/jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67", size = 134899, upload-time = "2025-03-05T20:05:00.369Z" }, ] +[[package]] +name = "jiter" +version = "0.12.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/45/9d/e0660989c1370e25848bb4c52d061c71837239738ad937e83edca174c273/jiter-0.12.0.tar.gz", hash = "sha256:64dfcd7d5c168b38d3f9f8bba7fc639edb3418abcc74f22fdbe6b8938293f30b", size = 168294, upload-time = "2025-11-09T20:49:23.302Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/32/f9/eaca4633486b527ebe7e681c431f529b63fe2709e7c5242fc0f43f77ce63/jiter-0.12.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:d8f8a7e317190b2c2d60eb2e8aa835270b008139562d70fe732e1c0020ec53c9", size = 316435, upload-time = "2025-11-09T20:47:02.087Z" }, + { url = "https://files.pythonhosted.org/packages/10/c1/40c9f7c22f5e6ff715f28113ebaba27ab85f9af2660ad6e1dd6425d14c19/jiter-0.12.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2218228a077e784c6c8f1a8e5d6b8cb1dea62ce25811c356364848554b2056cd", size = 320548, upload-time = "2025-11-09T20:47:03.409Z" }, + { url = "https://files.pythonhosted.org/packages/6b/1b/efbb68fe87e7711b00d2cfd1f26bb4bfc25a10539aefeaa7727329ffb9cb/jiter-0.12.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:9354ccaa2982bf2188fd5f57f79f800ef622ec67beb8329903abf6b10da7d423", size = 351915, upload-time = "2025-11-09T20:47:05.171Z" }, + { url = "https://files.pythonhosted.org/packages/15/2d/c06e659888c128ad1e838123d0638f0efad90cc30860cb5f74dd3f2fc0b3/jiter-0.12.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:8f2607185ea89b4af9a604d4c7ec40e45d3ad03ee66998b031134bc510232bb7", size = 368966, upload-time = "2025-11-09T20:47:06.508Z" }, + { url = "https://files.pythonhosted.org/packages/6b/20/058db4ae5fb07cf6a4ab2e9b9294416f606d8e467fb74c2184b2a1eeacba/jiter-0.12.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3a585a5e42d25f2e71db5f10b171f5e5ea641d3aa44f7df745aa965606111cc2", size = 482047, upload-time = "2025-11-09T20:47:08.382Z" }, + { url = "https://files.pythonhosted.org/packages/49/bb/dc2b1c122275e1de2eb12905015d61e8316b2f888bdaac34221c301495d6/jiter-0.12.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:bd9e21d34edff5a663c631f850edcb786719c960ce887a5661e9c828a53a95d9", size = 380835, upload-time = "2025-11-09T20:47:09.81Z" }, + { url = "https://files.pythonhosted.org/packages/23/7d/38f9cd337575349de16da575ee57ddb2d5a64d425c9367f5ef9e4612e32e/jiter-0.12.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4a612534770470686cd5431478dc5a1b660eceb410abade6b1b74e320ca98de6", size = 364587, upload-time = "2025-11-09T20:47:11.529Z" }, + { url = "https://files.pythonhosted.org/packages/f0/a3/b13e8e61e70f0bb06085099c4e2462647f53cc2ca97614f7fedcaa2bb9f3/jiter-0.12.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:3985aea37d40a908f887b34d05111e0aae822943796ebf8338877fee2ab67725", size = 390492, upload-time = "2025-11-09T20:47:12.993Z" }, + { url = "https://files.pythonhosted.org/packages/07/71/e0d11422ed027e21422f7bc1883c61deba2d9752b720538430c1deadfbca/jiter-0.12.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:b1207af186495f48f72529f8d86671903c8c10127cac6381b11dddc4aaa52df6", size = 522046, upload-time = "2025-11-09T20:47:14.6Z" }, + { url = "https://files.pythonhosted.org/packages/9f/59/b968a9aa7102a8375dbbdfbd2aeebe563c7e5dddf0f47c9ef1588a97e224/jiter-0.12.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:ef2fb241de583934c9915a33120ecc06d94aa3381a134570f59eed784e87001e", size = 513392, upload-time = "2025-11-09T20:47:16.011Z" }, + { url = "https://files.pythonhosted.org/packages/ca/e4/7df62002499080dbd61b505c5cb351aa09e9959d176cac2aa8da6f93b13b/jiter-0.12.0-cp311-cp311-win32.whl", hash = "sha256:453b6035672fecce8007465896a25b28a6b59cfe8fbc974b2563a92f5a92a67c", size = 206096, upload-time = "2025-11-09T20:47:17.344Z" }, + { url = "https://files.pythonhosted.org/packages/bb/60/1032b30ae0572196b0de0e87dce3b6c26a1eff71aad5fe43dee3082d32e0/jiter-0.12.0-cp311-cp311-win_amd64.whl", hash = "sha256:ca264b9603973c2ad9435c71a8ec8b49f8f715ab5ba421c85a51cde9887e421f", size = 204899, upload-time = "2025-11-09T20:47:19.365Z" }, + { url = "https://files.pythonhosted.org/packages/49/d5/c145e526fccdb834063fb45c071df78b0cc426bbaf6de38b0781f45d956f/jiter-0.12.0-cp311-cp311-win_arm64.whl", hash = "sha256:cb00ef392e7d684f2754598c02c409f376ddcef857aae796d559e6cacc2d78a5", size = 188070, upload-time = "2025-11-09T20:47:20.75Z" }, + { url = "https://files.pythonhosted.org/packages/92/c9/5b9f7b4983f1b542c64e84165075335e8a236fa9e2ea03a0c79780062be8/jiter-0.12.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = 
"sha256:305e061fa82f4680607a775b2e8e0bcb071cd2205ac38e6ef48c8dd5ebe1cf37", size = 314449, upload-time = "2025-11-09T20:47:22.999Z" }, + { url = "https://files.pythonhosted.org/packages/98/6e/e8efa0e78de00db0aee82c0cf9e8b3f2027efd7f8a71f859d8f4be8e98ef/jiter-0.12.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:5c1860627048e302a528333c9307c818c547f214d8659b0705d2195e1a94b274", size = 319855, upload-time = "2025-11-09T20:47:24.779Z" }, + { url = "https://files.pythonhosted.org/packages/20/26/894cd88e60b5d58af53bec5c6759d1292bd0b37a8b5f60f07abf7a63ae5f/jiter-0.12.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:df37577a4f8408f7e0ec3205d2a8f87672af8f17008358063a4d6425b6081ce3", size = 350171, upload-time = "2025-11-09T20:47:26.469Z" }, + { url = "https://files.pythonhosted.org/packages/f5/27/a7b818b9979ac31b3763d25f3653ec3a954044d5e9f5d87f2f247d679fd1/jiter-0.12.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:75fdd787356c1c13a4f40b43c2156276ef7a71eb487d98472476476d803fb2cf", size = 365590, upload-time = "2025-11-09T20:47:27.918Z" }, + { url = "https://files.pythonhosted.org/packages/ba/7e/e46195801a97673a83746170b17984aa8ac4a455746354516d02ca5541b4/jiter-0.12.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1eb5db8d9c65b112aacf14fcd0faae9913d07a8afea5ed06ccdd12b724e966a1", size = 479462, upload-time = "2025-11-09T20:47:29.654Z" }, + { url = "https://files.pythonhosted.org/packages/ca/75/f833bfb009ab4bd11b1c9406d333e3b4357709ed0570bb48c7c06d78c7dd/jiter-0.12.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:73c568cc27c473f82480abc15d1301adf333a7ea4f2e813d6a2c7d8b6ba8d0df", size = 378983, upload-time = "2025-11-09T20:47:31.026Z" }, + { url = "https://files.pythonhosted.org/packages/71/b3/7a69d77943cc837d30165643db753471aff5df39692d598da880a6e51c24/jiter-0.12.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4321e8a3d868919bcb1abb1db550d41f2b5b326f72df29e53b2df8b006eb9403", size = 361328, upload-time = "2025-11-09T20:47:33.286Z" }, + { url = "https://files.pythonhosted.org/packages/b0/ac/a78f90caf48d65ba70d8c6efc6f23150bc39dc3389d65bbec2a95c7bc628/jiter-0.12.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:0a51bad79f8cc9cac2b4b705039f814049142e0050f30d91695a2d9a6611f126", size = 386740, upload-time = "2025-11-09T20:47:34.703Z" }, + { url = "https://files.pythonhosted.org/packages/39/b6/5d31c2cc8e1b6a6bcf3c5721e4ca0a3633d1ab4754b09bc7084f6c4f5327/jiter-0.12.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:2a67b678f6a5f1dd6c36d642d7db83e456bc8b104788262aaefc11a22339f5a9", size = 520875, upload-time = "2025-11-09T20:47:36.058Z" }, + { url = "https://files.pythonhosted.org/packages/30/b5/4df540fae4e9f68c54b8dab004bd8c943a752f0b00efd6e7d64aa3850339/jiter-0.12.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:efe1a211fe1fd14762adea941e3cfd6c611a136e28da6c39272dbb7a1bbe6a86", size = 511457, upload-time = "2025-11-09T20:47:37.932Z" }, + { url = "https://files.pythonhosted.org/packages/07/65/86b74010e450a1a77b2c1aabb91d4a91dd3cd5afce99f34d75fd1ac64b19/jiter-0.12.0-cp312-cp312-win32.whl", hash = "sha256:d779d97c834b4278276ec703dc3fc1735fca50af63eb7262f05bdb4e62203d44", size = 204546, upload-time = "2025-11-09T20:47:40.47Z" }, + { url = "https://files.pythonhosted.org/packages/1c/c7/6659f537f9562d963488e3e55573498a442503ced01f7e169e96a6110383/jiter-0.12.0-cp312-cp312-win_amd64.whl", hash = 
"sha256:e8269062060212b373316fe69236096aaf4c49022d267c6736eebd66bbbc60bb", size = 205196, upload-time = "2025-11-09T20:47:41.794Z" }, + { url = "https://files.pythonhosted.org/packages/21/f4/935304f5169edadfec7f9c01eacbce4c90bb9a82035ac1de1f3bd2d40be6/jiter-0.12.0-cp312-cp312-win_arm64.whl", hash = "sha256:06cb970936c65de926d648af0ed3d21857f026b1cf5525cb2947aa5e01e05789", size = 186100, upload-time = "2025-11-09T20:47:43.007Z" }, + { url = "https://files.pythonhosted.org/packages/3d/a6/97209693b177716e22576ee1161674d1d58029eb178e01866a0422b69224/jiter-0.12.0-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:6cc49d5130a14b732e0612bc76ae8db3b49898732223ef8b7599aa8d9810683e", size = 313658, upload-time = "2025-11-09T20:47:44.424Z" }, + { url = "https://files.pythonhosted.org/packages/06/4d/125c5c1537c7d8ee73ad3d530a442d6c619714b95027143f1b61c0b4dfe0/jiter-0.12.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:37f27a32ce36364d2fa4f7fdc507279db604d27d239ea2e044c8f148410defe1", size = 318605, upload-time = "2025-11-09T20:47:45.973Z" }, + { url = "https://files.pythonhosted.org/packages/99/bf/a840b89847885064c41a5f52de6e312e91fa84a520848ee56c97e4fa0205/jiter-0.12.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bbc0944aa3d4b4773e348cda635252824a78f4ba44328e042ef1ff3f6080d1cf", size = 349803, upload-time = "2025-11-09T20:47:47.535Z" }, + { url = "https://files.pythonhosted.org/packages/8a/88/e63441c28e0db50e305ae23e19c1d8fae012d78ed55365da392c1f34b09c/jiter-0.12.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:da25c62d4ee1ffbacb97fac6dfe4dcd6759ebdc9015991e92a6eae5816287f44", size = 365120, upload-time = "2025-11-09T20:47:49.284Z" }, + { url = "https://files.pythonhosted.org/packages/0a/7c/49b02714af4343970eb8aca63396bc1c82fa01197dbb1e9b0d274b550d4e/jiter-0.12.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:048485c654b838140b007390b8182ba9774621103bd4d77c9c3f6f117474ba45", size = 479918, upload-time = "2025-11-09T20:47:50.807Z" }, + { url = "https://files.pythonhosted.org/packages/69/ba/0a809817fdd5a1db80490b9150645f3aae16afad166960bcd562be194f3b/jiter-0.12.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:635e737fbb7315bef0037c19b88b799143d2d7d3507e61a76751025226b3ac87", size = 379008, upload-time = "2025-11-09T20:47:52.211Z" }, + { url = "https://files.pythonhosted.org/packages/5f/c3/c9fc0232e736c8877d9e6d83d6eeb0ba4e90c6c073835cc2e8f73fdeef51/jiter-0.12.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4e017c417b1ebda911bd13b1e40612704b1f5420e30695112efdbed8a4b389ed", size = 361785, upload-time = "2025-11-09T20:47:53.512Z" }, + { url = "https://files.pythonhosted.org/packages/96/61/61f69b7e442e97ca6cd53086ddc1cf59fb830549bc72c0a293713a60c525/jiter-0.12.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:89b0bfb8b2bf2351fba36bb211ef8bfceba73ef58e7f0c68fb67b5a2795ca2f9", size = 386108, upload-time = "2025-11-09T20:47:54.893Z" }, + { url = "https://files.pythonhosted.org/packages/e9/2e/76bb3332f28550c8f1eba3bf6e5efe211efda0ddbbaf24976bc7078d42a5/jiter-0.12.0-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:f5aa5427a629a824a543672778c9ce0c5e556550d1569bb6ea28a85015287626", size = 519937, upload-time = "2025-11-09T20:47:56.253Z" }, + { url = "https://files.pythonhosted.org/packages/84/d6/fa96efa87dc8bff2094fb947f51f66368fa56d8d4fc9e77b25d7fbb23375/jiter-0.12.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = 
"sha256:ed53b3d6acbcb0fd0b90f20c7cb3b24c357fe82a3518934d4edfa8c6898e498c", size = 510853, upload-time = "2025-11-09T20:47:58.32Z" }, + { url = "https://files.pythonhosted.org/packages/8a/28/93f67fdb4d5904a708119a6ab58a8f1ec226ff10a94a282e0215402a8462/jiter-0.12.0-cp313-cp313-win32.whl", hash = "sha256:4747de73d6b8c78f2e253a2787930f4fffc68da7fa319739f57437f95963c4de", size = 204699, upload-time = "2025-11-09T20:47:59.686Z" }, + { url = "https://files.pythonhosted.org/packages/c4/1f/30b0eb087045a0abe2a5c9c0c0c8da110875a1d3be83afd4a9a4e548be3c/jiter-0.12.0-cp313-cp313-win_amd64.whl", hash = "sha256:e25012eb0c456fcc13354255d0338cd5397cce26c77b2832b3c4e2e255ea5d9a", size = 204258, upload-time = "2025-11-09T20:48:01.01Z" }, + { url = "https://files.pythonhosted.org/packages/2c/f4/2b4daf99b96bce6fc47971890b14b2a36aef88d7beb9f057fafa032c6141/jiter-0.12.0-cp313-cp313-win_arm64.whl", hash = "sha256:c97b92c54fe6110138c872add030a1f99aea2401ddcdaa21edf74705a646dd60", size = 185503, upload-time = "2025-11-09T20:48:02.35Z" }, + { url = "https://files.pythonhosted.org/packages/39/ca/67bb15a7061d6fe20b9b2a2fd783e296a1e0f93468252c093481a2f00efa/jiter-0.12.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:53839b35a38f56b8be26a7851a48b89bc47e5d88e900929df10ed93b95fea3d6", size = 317965, upload-time = "2025-11-09T20:48:03.783Z" }, + { url = "https://files.pythonhosted.org/packages/18/af/1788031cd22e29c3b14bc6ca80b16a39a0b10e611367ffd480c06a259831/jiter-0.12.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:94f669548e55c91ab47fef8bddd9c954dab1938644e715ea49d7e117015110a4", size = 345831, upload-time = "2025-11-09T20:48:05.55Z" }, + { url = "https://files.pythonhosted.org/packages/05/17/710bf8472d1dff0d3caf4ced6031060091c1320f84ee7d5dcbed1f352417/jiter-0.12.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:351d54f2b09a41600ffea43d081522d792e81dcfb915f6d2d242744c1cc48beb", size = 361272, upload-time = "2025-11-09T20:48:06.951Z" }, + { url = "https://files.pythonhosted.org/packages/fb/f1/1dcc4618b59761fef92d10bcbb0b038b5160be653b003651566a185f1a5c/jiter-0.12.0-cp313-cp313t-win_amd64.whl", hash = "sha256:2a5e90604620f94bf62264e7c2c038704d38217b7465b863896c6d7c902b06c7", size = 204604, upload-time = "2025-11-09T20:48:08.328Z" }, + { url = "https://files.pythonhosted.org/packages/d9/32/63cb1d9f1c5c6632a783c0052cde9ef7ba82688f7065e2f0d5f10a7e3edb/jiter-0.12.0-cp313-cp313t-win_arm64.whl", hash = "sha256:88ef757017e78d2860f96250f9393b7b577b06a956ad102c29c8237554380db3", size = 185628, upload-time = "2025-11-09T20:48:09.572Z" }, + { url = "https://files.pythonhosted.org/packages/a8/99/45c9f0dbe4a1416b2b9a8a6d1236459540f43d7fb8883cff769a8db0612d/jiter-0.12.0-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:c46d927acd09c67a9fb1416df45c5a04c27e83aae969267e98fba35b74e99525", size = 312478, upload-time = "2025-11-09T20:48:10.898Z" }, + { url = "https://files.pythonhosted.org/packages/4c/a7/54ae75613ba9e0f55fcb0bc5d1f807823b5167cc944e9333ff322e9f07dd/jiter-0.12.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:774ff60b27a84a85b27b88cd5583899c59940bcc126caca97eb2a9df6aa00c49", size = 318706, upload-time = "2025-11-09T20:48:12.266Z" }, + { url = "https://files.pythonhosted.org/packages/59/31/2aa241ad2c10774baf6c37f8b8e1f39c07db358f1329f4eb40eba179c2a2/jiter-0.12.0-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c5433fab222fb072237df3f637d01b81f040a07dcac1cb4a5c75c7aa9ed0bef1", size = 351894, upload-time = 
"2025-11-09T20:48:13.673Z" }, + { url = "https://files.pythonhosted.org/packages/54/4f/0f2759522719133a9042781b18cc94e335b6d290f5e2d3e6899d6af933e3/jiter-0.12.0-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f8c593c6e71c07866ec6bfb790e202a833eeec885022296aff6b9e0b92d6a70e", size = 365714, upload-time = "2025-11-09T20:48:15.083Z" }, + { url = "https://files.pythonhosted.org/packages/dc/6f/806b895f476582c62a2f52c453151edd8a0fde5411b0497baaa41018e878/jiter-0.12.0-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:90d32894d4c6877a87ae00c6b915b609406819dce8bc0d4e962e4de2784e567e", size = 478989, upload-time = "2025-11-09T20:48:16.706Z" }, + { url = "https://files.pythonhosted.org/packages/86/6c/012d894dc6e1033acd8db2b8346add33e413ec1c7c002598915278a37f79/jiter-0.12.0-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:798e46eed9eb10c3adbbacbd3bdb5ecd4cf7064e453d00dbef08802dae6937ff", size = 378615, upload-time = "2025-11-09T20:48:18.614Z" }, + { url = "https://files.pythonhosted.org/packages/87/30/d718d599f6700163e28e2c71c0bbaf6dace692e7df2592fd793ac9276717/jiter-0.12.0-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b3f1368f0a6719ea80013a4eb90ba72e75d7ea67cfc7846db2ca504f3df0169a", size = 364745, upload-time = "2025-11-09T20:48:20.117Z" }, + { url = "https://files.pythonhosted.org/packages/8f/85/315b45ce4b6ddc7d7fceca24068543b02bdc8782942f4ee49d652e2cc89f/jiter-0.12.0-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:65f04a9d0b4406f7e51279710b27484af411896246200e461d80d3ba0caa901a", size = 386502, upload-time = "2025-11-09T20:48:21.543Z" }, + { url = "https://files.pythonhosted.org/packages/74/0b/ce0434fb40c5b24b368fe81b17074d2840748b4952256bab451b72290a49/jiter-0.12.0-cp314-cp314-musllinux_1_1_aarch64.whl", hash = "sha256:fd990541982a24281d12b67a335e44f117e4c6cbad3c3b75c7dea68bf4ce3a67", size = 519845, upload-time = "2025-11-09T20:48:22.964Z" }, + { url = "https://files.pythonhosted.org/packages/e8/a3/7a7a4488ba052767846b9c916d208b3ed114e3eb670ee984e4c565b9cf0d/jiter-0.12.0-cp314-cp314-musllinux_1_1_x86_64.whl", hash = "sha256:b111b0e9152fa7df870ecaebb0bd30240d9f7fff1f2003bcb4ed0f519941820b", size = 510701, upload-time = "2025-11-09T20:48:24.483Z" }, + { url = "https://files.pythonhosted.org/packages/c3/16/052ffbf9d0467b70af24e30f91e0579e13ded0c17bb4a8eb2aed3cb60131/jiter-0.12.0-cp314-cp314-win32.whl", hash = "sha256:a78befb9cc0a45b5a5a0d537b06f8544c2ebb60d19d02c41ff15da28a9e22d42", size = 205029, upload-time = "2025-11-09T20:48:25.749Z" }, + { url = "https://files.pythonhosted.org/packages/e4/18/3cf1f3f0ccc789f76b9a754bdb7a6977e5d1d671ee97a9e14f7eb728d80e/jiter-0.12.0-cp314-cp314-win_amd64.whl", hash = "sha256:e1fe01c082f6aafbe5c8faf0ff074f38dfb911d53f07ec333ca03f8f6226debf", size = 204960, upload-time = "2025-11-09T20:48:27.415Z" }, + { url = "https://files.pythonhosted.org/packages/02/68/736821e52ecfdeeb0f024b8ab01b5a229f6b9293bbdb444c27efade50b0f/jiter-0.12.0-cp314-cp314-win_arm64.whl", hash = "sha256:d72f3b5a432a4c546ea4bedc84cce0c3404874f1d1676260b9c7f048a9855451", size = 185529, upload-time = "2025-11-09T20:48:29.125Z" }, + { url = "https://files.pythonhosted.org/packages/30/61/12ed8ee7a643cce29ac97c2281f9ce3956eb76b037e88d290f4ed0d41480/jiter-0.12.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:e6ded41aeba3603f9728ed2b6196e4df875348ab97b28fc8afff115ed42ba7a7", size = 318974, upload-time = "2025-11-09T20:48:30.87Z" }, + { url = 
"https://files.pythonhosted.org/packages/2d/c6/f3041ede6d0ed5e0e79ff0de4c8f14f401bbf196f2ef3971cdbe5fd08d1d/jiter-0.12.0-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a947920902420a6ada6ad51892082521978e9dd44a802663b001436e4b771684", size = 345932, upload-time = "2025-11-09T20:48:32.658Z" }, + { url = "https://files.pythonhosted.org/packages/d5/5d/4d94835889edd01ad0e2dbfc05f7bdfaed46292e7b504a6ac7839aa00edb/jiter-0.12.0-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:add5e227e0554d3a52cf390a7635edaffdf4f8fce4fdbcef3cc2055bb396a30c", size = 367243, upload-time = "2025-11-09T20:48:34.093Z" }, + { url = "https://files.pythonhosted.org/packages/fd/76/0051b0ac2816253a99d27baf3dda198663aff882fa6ea7deeb94046da24e/jiter-0.12.0-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3f9b1cda8fcb736250d7e8711d4580ebf004a46771432be0ae4796944b5dfa5d", size = 479315, upload-time = "2025-11-09T20:48:35.507Z" }, + { url = "https://files.pythonhosted.org/packages/70/ae/83f793acd68e5cb24e483f44f482a1a15601848b9b6f199dacb970098f77/jiter-0.12.0-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:deeb12a2223fe0135c7ff1356a143d57f95bbf1f4a66584f1fc74df21d86b993", size = 380714, upload-time = "2025-11-09T20:48:40.014Z" }, + { url = "https://files.pythonhosted.org/packages/b1/5e/4808a88338ad2c228b1126b93fcd8ba145e919e886fe910d578230dabe3b/jiter-0.12.0-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c596cc0f4cb574877550ce4ecd51f8037469146addd676d7c1a30ebe6391923f", size = 365168, upload-time = "2025-11-09T20:48:41.462Z" }, + { url = "https://files.pythonhosted.org/packages/0c/d4/04619a9e8095b42aef436b5aeb4c0282b4ff1b27d1db1508df9f5dc82750/jiter-0.12.0-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:5ab4c823b216a4aeab3fdbf579c5843165756bd9ad87cc6b1c65919c4715f783", size = 387893, upload-time = "2025-11-09T20:48:42.921Z" }, + { url = "https://files.pythonhosted.org/packages/17/ea/d3c7e62e4546fdc39197fa4a4315a563a89b95b6d54c0d25373842a59cbe/jiter-0.12.0-cp314-cp314t-musllinux_1_1_aarch64.whl", hash = "sha256:e427eee51149edf962203ff8db75a7514ab89be5cb623fb9cea1f20b54f1107b", size = 520828, upload-time = "2025-11-09T20:48:44.278Z" }, + { url = "https://files.pythonhosted.org/packages/cc/0b/c6d3562a03fd767e31cb119d9041ea7958c3c80cb3d753eafb19b3b18349/jiter-0.12.0-cp314-cp314t-musllinux_1_1_x86_64.whl", hash = "sha256:edb868841f84c111255ba5e80339d386d937ec1fdce419518ce1bd9370fac5b6", size = 511009, upload-time = "2025-11-09T20:48:45.726Z" }, + { url = "https://files.pythonhosted.org/packages/aa/51/2cb4468b3448a8385ebcd15059d325c9ce67df4e2758d133ab9442b19834/jiter-0.12.0-cp314-cp314t-win32.whl", hash = "sha256:8bbcfe2791dfdb7c5e48baf646d37a6a3dcb5a97a032017741dea9f817dca183", size = 205110, upload-time = "2025-11-09T20:48:47.033Z" }, + { url = "https://files.pythonhosted.org/packages/b2/c5/ae5ec83dec9c2d1af805fd5fe8f74ebded9c8670c5210ec7820ce0dbeb1e/jiter-0.12.0-cp314-cp314t-win_amd64.whl", hash = "sha256:2fa940963bf02e1d8226027ef461e36af472dea85d36054ff835aeed944dd873", size = 205223, upload-time = "2025-11-09T20:48:49.076Z" }, + { url = "https://files.pythonhosted.org/packages/97/9a/3c5391907277f0e55195550cf3fa8e293ae9ee0c00fb402fec1e38c0c82f/jiter-0.12.0-cp314-cp314t-win_arm64.whl", hash = "sha256:506c9708dd29b27288f9f8f1140c3cb0e3d8ddb045956d7757b1fa0e0f39a473", size = 185564, upload-time = "2025-11-09T20:48:50.376Z" }, + { url = 
"https://files.pythonhosted.org/packages/fe/54/5339ef1ecaa881c6948669956567a64d2670941925f245c434f494ffb0e5/jiter-0.12.0-graalpy311-graalpy242_311_native-macosx_10_12_x86_64.whl", hash = "sha256:4739a4657179ebf08f85914ce50332495811004cc1747852e8b2041ed2aab9b8", size = 311144, upload-time = "2025-11-09T20:49:10.503Z" }, + { url = "https://files.pythonhosted.org/packages/27/74/3446c652bffbd5e81ab354e388b1b5fc1d20daac34ee0ed11ff096b1b01a/jiter-0.12.0-graalpy311-graalpy242_311_native-macosx_11_0_arm64.whl", hash = "sha256:41da8def934bf7bec16cb24bd33c0ca62126d2d45d81d17b864bd5ad721393c3", size = 305877, upload-time = "2025-11-09T20:49:12.269Z" }, + { url = "https://files.pythonhosted.org/packages/a1/f4/ed76ef9043450f57aac2d4fbeb27175aa0eb9c38f833be6ef6379b3b9a86/jiter-0.12.0-graalpy311-graalpy242_311_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9c44ee814f499c082e69872d426b624987dbc5943ab06e9bbaa4f81989fdb79e", size = 340419, upload-time = "2025-11-09T20:49:13.803Z" }, + { url = "https://files.pythonhosted.org/packages/21/01/857d4608f5edb0664aa791a3d45702e1a5bcfff9934da74035e7b9803846/jiter-0.12.0-graalpy311-graalpy242_311_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cd2097de91cf03eaa27b3cbdb969addf83f0179c6afc41bbc4513705e013c65d", size = 347212, upload-time = "2025-11-09T20:49:15.643Z" }, + { url = "https://files.pythonhosted.org/packages/cb/f5/12efb8ada5f5c9edc1d4555fe383c1fb2eac05ac5859258a72d61981d999/jiter-0.12.0-graalpy312-graalpy250_312_native-macosx_10_12_x86_64.whl", hash = "sha256:e8547883d7b96ef2e5fe22b88f8a4c8725a56e7f4abafff20fd5272d634c7ecb", size = 309974, upload-time = "2025-11-09T20:49:17.187Z" }, + { url = "https://files.pythonhosted.org/packages/85/15/d6eb3b770f6a0d332675141ab3962fd4a7c270ede3515d9f3583e1d28276/jiter-0.12.0-graalpy312-graalpy250_312_native-macosx_11_0_arm64.whl", hash = "sha256:89163163c0934854a668ed783a2546a0617f71706a2551a4a0666d91ab365d6b", size = 304233, upload-time = "2025-11-09T20:49:18.734Z" }, + { url = "https://files.pythonhosted.org/packages/8c/3e/e7e06743294eea2cf02ced6aa0ff2ad237367394e37a0e2b4a1108c67a36/jiter-0.12.0-graalpy312-graalpy250_312_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d96b264ab7d34bbb2312dedc47ce07cd53f06835eacbc16dde3761f47c3a9e7f", size = 338537, upload-time = "2025-11-09T20:49:20.317Z" }, + { url = "https://files.pythonhosted.org/packages/2f/9c/6753e6522b8d0ef07d3a3d239426669e984fb0eba15a315cdbc1253904e4/jiter-0.12.0-graalpy312-graalpy250_312_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c24e864cb30ab82311c6425655b0cdab0a98c5d973b065c66a3f020740c2324c", size = 346110, upload-time = "2025-11-09T20:49:21.817Z" }, +] + [[package]] name = "jsonschema" version = "4.26.0" @@ -834,15 +986,98 @@ sdist = { url = "https://files.pythonhosted.org/packages/aa/88/262177de60548e5a2 wheels = [ { url = "https://files.pythonhosted.org/packages/77/d5/becbe1e2569b474a23f0c672ead8a29ac50b2dc1d5b9de184831bda8d14c/lxml-6.0.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:13e35cbc684aadf05d8711a5d1b5857c92e5e580efa9a0d2be197199c8def607", size = 8634365, upload-time = "2025-09-22T04:00:45.672Z" }, { url = "https://files.pythonhosted.org/packages/28/66/1ced58f12e804644426b85d0bb8a4478ca77bc1761455da310505f1a3526/lxml-6.0.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3b1675e096e17c6fe9c0e8c81434f5736c0739ff9ac6123c87c2d452f48fc938", size = 4650793, upload-time = "2025-09-22T04:00:47.783Z" }, + { url = 
"https://files.pythonhosted.org/packages/11/84/549098ffea39dfd167e3f174b4ce983d0eed61f9d8d25b7bf2a57c3247fc/lxml-6.0.2-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:8ac6e5811ae2870953390452e3476694196f98d447573234592d30488147404d", size = 4944362, upload-time = "2025-09-22T04:00:49.845Z" }, + { url = "https://files.pythonhosted.org/packages/ac/bd/f207f16abf9749d2037453d56b643a7471d8fde855a231a12d1e095c4f01/lxml-6.0.2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5aa0fc67ae19d7a64c3fe725dc9a1bb11f80e01f78289d05c6f62545affec438", size = 5083152, upload-time = "2025-09-22T04:00:51.709Z" }, + { url = "https://files.pythonhosted.org/packages/15/ae/bd813e87d8941d52ad5b65071b1affb48da01c4ed3c9c99e40abb266fbff/lxml-6.0.2-cp311-cp311-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:de496365750cc472b4e7902a485d3f152ecf57bd3ba03ddd5578ed8ceb4c5964", size = 5023539, upload-time = "2025-09-22T04:00:53.593Z" }, + { url = "https://files.pythonhosted.org/packages/02/cd/9bfef16bd1d874fbe0cb51afb00329540f30a3283beb9f0780adbb7eec03/lxml-6.0.2-cp311-cp311-manylinux_2_26_i686.manylinux_2_28_i686.whl", hash = "sha256:200069a593c5e40b8f6fc0d84d86d970ba43138c3e68619ffa234bc9bb806a4d", size = 5344853, upload-time = "2025-09-22T04:00:55.524Z" }, + { url = "https://files.pythonhosted.org/packages/b8/89/ea8f91594bc5dbb879734d35a6f2b0ad50605d7fb419de2b63d4211765cc/lxml-6.0.2-cp311-cp311-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7d2de809c2ee3b888b59f995625385f74629707c9355e0ff856445cdcae682b7", size = 5225133, upload-time = "2025-09-22T04:00:57.269Z" }, + { url = "https://files.pythonhosted.org/packages/b9/37/9c735274f5dbec726b2db99b98a43950395ba3d4a1043083dba2ad814170/lxml-6.0.2-cp311-cp311-manylinux_2_31_armv7l.whl", hash = "sha256:b2c3da8d93cf5db60e8858c17684c47d01fee6405e554fb55018dd85fc23b178", size = 4677944, upload-time = "2025-09-22T04:00:59.052Z" }, + { url = "https://files.pythonhosted.org/packages/20/28/7dfe1ba3475d8bfca3878365075abe002e05d40dfaaeb7ec01b4c587d533/lxml-6.0.2-cp311-cp311-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:442de7530296ef5e188373a1ea5789a46ce90c4847e597856570439621d9c553", size = 5284535, upload-time = "2025-09-22T04:01:01.335Z" }, + { url = "https://files.pythonhosted.org/packages/e7/cf/5f14bc0de763498fc29510e3532bf2b4b3a1c1d5d0dff2e900c16ba021ef/lxml-6.0.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:2593c77efde7bfea7f6389f1ab249b15ed4aa5bc5cb5131faa3b843c429fbedb", size = 5067343, upload-time = "2025-09-22T04:01:03.13Z" }, + { url = "https://files.pythonhosted.org/packages/1c/b0/bb8275ab5472f32b28cfbbcc6db7c9d092482d3439ca279d8d6fa02f7025/lxml-6.0.2-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:3e3cb08855967a20f553ff32d147e14329b3ae70ced6edc2f282b94afbc74b2a", size = 4725419, upload-time = "2025-09-22T04:01:05.013Z" }, + { url = "https://files.pythonhosted.org/packages/25/4c/7c222753bc72edca3b99dbadba1b064209bc8ed4ad448af990e60dcce462/lxml-6.0.2-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:2ed6c667fcbb8c19c6791bbf40b7268ef8ddf5a96940ba9404b9f9a304832f6c", size = 5275008, upload-time = "2025-09-22T04:01:07.327Z" }, + { url = "https://files.pythonhosted.org/packages/6c/8c/478a0dc6b6ed661451379447cdbec77c05741a75736d97e5b2b729687828/lxml-6.0.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:b8f18914faec94132e5b91e69d76a5c1d7b0c73e2489ea8929c4aaa10b76bbf7", size = 5248906, upload-time = "2025-09-22T04:01:09.452Z" }, + { url = 
"https://files.pythonhosted.org/packages/2d/d9/5be3a6ab2784cdf9accb0703b65e1b64fcdd9311c9f007630c7db0cfcce1/lxml-6.0.2-cp311-cp311-win32.whl", hash = "sha256:6605c604e6daa9e0d7f0a2137bdc47a2e93b59c60a65466353e37f8272f47c46", size = 3610357, upload-time = "2025-09-22T04:01:11.102Z" }, + { url = "https://files.pythonhosted.org/packages/e2/7d/ca6fb13349b473d5732fb0ee3eec8f6c80fc0688e76b7d79c1008481bf1f/lxml-6.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:e5867f2651016a3afd8dd2c8238baa66f1e2802f44bc17e236f547ace6647078", size = 4036583, upload-time = "2025-09-22T04:01:12.766Z" }, + { url = "https://files.pythonhosted.org/packages/ab/a2/51363b5ecd3eab46563645f3a2c3836a2fc67d01a1b87c5017040f39f567/lxml-6.0.2-cp311-cp311-win_arm64.whl", hash = "sha256:4197fb2534ee05fd3e7afaab5d8bfd6c2e186f65ea7f9cd6a82809c887bd1285", size = 3680591, upload-time = "2025-09-22T04:01:14.874Z" }, { url = "https://files.pythonhosted.org/packages/f3/c8/8ff2bc6b920c84355146cd1ab7d181bc543b89241cfb1ebee824a7c81457/lxml-6.0.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:a59f5448ba2ceccd06995c95ea59a7674a10de0810f2ce90c9006f3cbc044456", size = 8661887, upload-time = "2025-09-22T04:01:17.265Z" }, { url = "https://files.pythonhosted.org/packages/37/6f/9aae1008083bb501ef63284220ce81638332f9ccbfa53765b2b7502203cf/lxml-6.0.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:e8113639f3296706fbac34a30813929e29247718e88173ad849f57ca59754924", size = 4667818, upload-time = "2025-09-22T04:01:19.688Z" }, + { url = "https://files.pythonhosted.org/packages/f1/ca/31fb37f99f37f1536c133476674c10b577e409c0a624384147653e38baf2/lxml-6.0.2-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:a8bef9b9825fa8bc816a6e641bb67219489229ebc648be422af695f6e7a4fa7f", size = 4950807, upload-time = "2025-09-22T04:01:21.487Z" }, + { url = "https://files.pythonhosted.org/packages/da/87/f6cb9442e4bada8aab5ae7e1046264f62fdbeaa6e3f6211b93f4c0dd97f1/lxml-6.0.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:65ea18d710fd14e0186c2f973dc60bb52039a275f82d3c44a0e42b43440ea534", size = 5109179, upload-time = "2025-09-22T04:01:23.32Z" }, + { url = "https://files.pythonhosted.org/packages/c8/20/a7760713e65888db79bbae4f6146a6ae5c04e4a204a3c48896c408cd6ed2/lxml-6.0.2-cp312-cp312-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c371aa98126a0d4c739ca93ceffa0fd7a5d732e3ac66a46e74339acd4d334564", size = 5023044, upload-time = "2025-09-22T04:01:25.118Z" }, + { url = "https://files.pythonhosted.org/packages/a2/b0/7e64e0460fcb36471899f75831509098f3fd7cd02a3833ac517433cb4f8f/lxml-6.0.2-cp312-cp312-manylinux_2_26_i686.manylinux_2_28_i686.whl", hash = "sha256:700efd30c0fa1a3581d80a748157397559396090a51d306ea59a70020223d16f", size = 5359685, upload-time = "2025-09-22T04:01:27.398Z" }, + { url = "https://files.pythonhosted.org/packages/b9/e1/e5df362e9ca4e2f48ed6411bd4b3a0ae737cc842e96877f5bf9428055ab4/lxml-6.0.2-cp312-cp312-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c33e66d44fe60e72397b487ee92e01da0d09ba2d66df8eae42d77b6d06e5eba0", size = 5654127, upload-time = "2025-09-22T04:01:29.629Z" }, + { url = "https://files.pythonhosted.org/packages/c6/d1/232b3309a02d60f11e71857778bfcd4acbdb86c07db8260caf7d008b08f8/lxml-6.0.2-cp312-cp312-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:90a345bbeaf9d0587a3aaffb7006aa39ccb6ff0e96a57286c0cb2fd1520ea192", size = 5253958, upload-time = "2025-09-22T04:01:31.535Z" }, + { url = 
"https://files.pythonhosted.org/packages/35/35/d955a070994725c4f7d80583a96cab9c107c57a125b20bb5f708fe941011/lxml-6.0.2-cp312-cp312-manylinux_2_31_armv7l.whl", hash = "sha256:064fdadaf7a21af3ed1dcaa106b854077fbeada827c18f72aec9346847cd65d0", size = 4711541, upload-time = "2025-09-22T04:01:33.801Z" }, + { url = "https://files.pythonhosted.org/packages/1e/be/667d17363b38a78c4bd63cfd4b4632029fd68d2c2dc81f25ce9eb5224dd5/lxml-6.0.2-cp312-cp312-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:fbc74f42c3525ac4ffa4b89cbdd00057b6196bcefe8bce794abd42d33a018092", size = 5267426, upload-time = "2025-09-22T04:01:35.639Z" }, + { url = "https://files.pythonhosted.org/packages/ea/47/62c70aa4a1c26569bc958c9ca86af2bb4e1f614e8c04fb2989833874f7ae/lxml-6.0.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6ddff43f702905a4e32bc24f3f2e2edfe0f8fde3277d481bffb709a4cced7a1f", size = 5064917, upload-time = "2025-09-22T04:01:37.448Z" }, + { url = "https://files.pythonhosted.org/packages/bd/55/6ceddaca353ebd0f1908ef712c597f8570cc9c58130dbb89903198e441fd/lxml-6.0.2-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:6da5185951d72e6f5352166e3da7b0dc27aa70bd1090b0eb3f7f7212b53f1bb8", size = 4788795, upload-time = "2025-09-22T04:01:39.165Z" }, + { url = "https://files.pythonhosted.org/packages/cf/e8/fd63e15da5e3fd4c2146f8bbb3c14e94ab850589beab88e547b2dbce22e1/lxml-6.0.2-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:57a86e1ebb4020a38d295c04fc79603c7899e0df71588043eb218722dabc087f", size = 5676759, upload-time = "2025-09-22T04:01:41.506Z" }, + { url = "https://files.pythonhosted.org/packages/76/47/b3ec58dc5c374697f5ba37412cd2728f427d056315d124dd4b61da381877/lxml-6.0.2-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:2047d8234fe735ab77802ce5f2297e410ff40f5238aec569ad7c8e163d7b19a6", size = 5255666, upload-time = "2025-09-22T04:01:43.363Z" }, + { url = "https://files.pythonhosted.org/packages/19/93/03ba725df4c3d72afd9596eef4a37a837ce8e4806010569bedfcd2cb68fd/lxml-6.0.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:6f91fd2b2ea15a6800c8e24418c0775a1694eefc011392da73bc6cef2623b322", size = 5277989, upload-time = "2025-09-22T04:01:45.215Z" }, + { url = "https://files.pythonhosted.org/packages/c6/80/c06de80bfce881d0ad738576f243911fccf992687ae09fd80b734712b39c/lxml-6.0.2-cp312-cp312-win32.whl", hash = "sha256:3ae2ce7d6fedfb3414a2b6c5e20b249c4c607f72cb8d2bb7cc9c6ec7c6f4e849", size = 3611456, upload-time = "2025-09-22T04:01:48.243Z" }, + { url = "https://files.pythonhosted.org/packages/f7/d7/0cdfb6c3e30893463fb3d1e52bc5f5f99684a03c29a0b6b605cfae879cd5/lxml-6.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:72c87e5ee4e58a8354fb9c7c84cbf95a1c8236c127a5d1b7683f04bed8361e1f", size = 4011793, upload-time = "2025-09-22T04:01:50.042Z" }, + { url = "https://files.pythonhosted.org/packages/ea/7b/93c73c67db235931527301ed3785f849c78991e2e34f3fd9a6663ffda4c5/lxml-6.0.2-cp312-cp312-win_arm64.whl", hash = "sha256:61cb10eeb95570153e0c0e554f58df92ecf5109f75eacad4a95baa709e26c3d6", size = 3672836, upload-time = "2025-09-22T04:01:52.145Z" }, { url = "https://files.pythonhosted.org/packages/53/fd/4e8f0540608977aea078bf6d79f128e0e2c2bba8af1acf775c30baa70460/lxml-6.0.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:9b33d21594afab46f37ae58dfadd06636f154923c4e8a4d754b0127554eb2e77", size = 8648494, upload-time = "2025-09-22T04:01:54.242Z" }, { url = 
"https://files.pythonhosted.org/packages/5d/f4/2a94a3d3dfd6c6b433501b8d470a1960a20ecce93245cf2db1706adf6c19/lxml-6.0.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:6c8963287d7a4c5c9a432ff487c52e9c5618667179c18a204bdedb27310f022f", size = 4661146, upload-time = "2025-09-22T04:01:56.282Z" }, + { url = "https://files.pythonhosted.org/packages/25/2e/4efa677fa6b322013035d38016f6ae859d06cac67437ca7dc708a6af7028/lxml-6.0.2-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:1941354d92699fb5ffe6ed7b32f9649e43c2feb4b97205f75866f7d21aa91452", size = 4946932, upload-time = "2025-09-22T04:01:58.989Z" }, + { url = "https://files.pythonhosted.org/packages/ce/0f/526e78a6d38d109fdbaa5049c62e1d32fdd70c75fb61c4eadf3045d3d124/lxml-6.0.2-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:bb2f6ca0ae2d983ded09357b84af659c954722bbf04dea98030064996d156048", size = 5100060, upload-time = "2025-09-22T04:02:00.812Z" }, + { url = "https://files.pythonhosted.org/packages/81/76/99de58d81fa702cc0ea7edae4f4640416c2062813a00ff24bd70ac1d9c9b/lxml-6.0.2-cp313-cp313-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:eb2a12d704f180a902d7fa778c6d71f36ceb7b0d317f34cdc76a5d05aa1dd1df", size = 5019000, upload-time = "2025-09-22T04:02:02.671Z" }, + { url = "https://files.pythonhosted.org/packages/b5/35/9e57d25482bc9a9882cb0037fdb9cc18f4b79d85df94fa9d2a89562f1d25/lxml-6.0.2-cp313-cp313-manylinux_2_26_i686.manylinux_2_28_i686.whl", hash = "sha256:6ec0e3f745021bfed19c456647f0298d60a24c9ff86d9d051f52b509663feeb1", size = 5348496, upload-time = "2025-09-22T04:02:04.904Z" }, + { url = "https://files.pythonhosted.org/packages/a6/8e/cb99bd0b83ccc3e8f0f528e9aa1f7a9965dfec08c617070c5db8d63a87ce/lxml-6.0.2-cp313-cp313-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:846ae9a12d54e368933b9759052d6206a9e8b250291109c48e350c1f1f49d916", size = 5643779, upload-time = "2025-09-22T04:02:06.689Z" }, + { url = "https://files.pythonhosted.org/packages/d0/34/9e591954939276bb679b73773836c6684c22e56d05980e31d52a9a8deb18/lxml-6.0.2-cp313-cp313-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ef9266d2aa545d7374938fb5c484531ef5a2ec7f2d573e62f8ce722c735685fd", size = 5244072, upload-time = "2025-09-22T04:02:08.587Z" }, + { url = "https://files.pythonhosted.org/packages/8d/27/b29ff065f9aaca443ee377aff699714fcbffb371b4fce5ac4ca759e436d5/lxml-6.0.2-cp313-cp313-manylinux_2_31_armv7l.whl", hash = "sha256:4077b7c79f31755df33b795dc12119cb557a0106bfdab0d2c2d97bd3cf3dffa6", size = 4718675, upload-time = "2025-09-22T04:02:10.783Z" }, + { url = "https://files.pythonhosted.org/packages/2b/9f/f756f9c2cd27caa1a6ef8c32ae47aadea697f5c2c6d07b0dae133c244fbe/lxml-6.0.2-cp313-cp313-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:a7c5d5e5f1081955358533be077166ee97ed2571d6a66bdba6ec2f609a715d1a", size = 5255171, upload-time = "2025-09-22T04:02:12.631Z" }, + { url = "https://files.pythonhosted.org/packages/61/46/bb85ea42d2cb1bd8395484fd72f38e3389611aa496ac7772da9205bbda0e/lxml-6.0.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:8f8d0cbd0674ee89863a523e6994ac25fd5be9c8486acfc3e5ccea679bad2679", size = 5057175, upload-time = "2025-09-22T04:02:14.718Z" }, + { url = "https://files.pythonhosted.org/packages/95/0c/443fc476dcc8e41577f0af70458c50fe299a97bb6b7505bb1ae09aa7f9ac/lxml-6.0.2-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:2cbcbf6d6e924c28f04a43f3b6f6e272312a090f269eff68a2982e13e5d57659", size = 4785688, upload-time = 
"2025-09-22T04:02:16.957Z" }, + { url = "https://files.pythonhosted.org/packages/48/78/6ef0b359d45bb9697bc5a626e1992fa5d27aa3f8004b137b2314793b50a0/lxml-6.0.2-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:dfb874cfa53340009af6bdd7e54ebc0d21012a60a4e65d927c2e477112e63484", size = 5660655, upload-time = "2025-09-22T04:02:18.815Z" }, + { url = "https://files.pythonhosted.org/packages/ff/ea/e1d33808f386bc1339d08c0dcada6e4712d4ed8e93fcad5f057070b7988a/lxml-6.0.2-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:fb8dae0b6b8b7f9e96c26fdd8121522ce5de9bb5538010870bd538683d30e9a2", size = 5247695, upload-time = "2025-09-22T04:02:20.593Z" }, + { url = "https://files.pythonhosted.org/packages/4f/47/eba75dfd8183673725255247a603b4ad606f4ae657b60c6c145b381697da/lxml-6.0.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:358d9adae670b63e95bc59747c72f4dc97c9ec58881d4627fe0120da0f90d314", size = 5269841, upload-time = "2025-09-22T04:02:22.489Z" }, + { url = "https://files.pythonhosted.org/packages/76/04/5c5e2b8577bc936e219becb2e98cdb1aca14a4921a12995b9d0c523502ae/lxml-6.0.2-cp313-cp313-win32.whl", hash = "sha256:e8cd2415f372e7e5a789d743d133ae474290a90b9023197fd78f32e2dc6873e2", size = 3610700, upload-time = "2025-09-22T04:02:24.465Z" }, + { url = "https://files.pythonhosted.org/packages/fe/0a/4643ccc6bb8b143e9f9640aa54e38255f9d3b45feb2cbe7ae2ca47e8782e/lxml-6.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:b30d46379644fbfc3ab81f8f82ae4de55179414651f110a1514f0b1f8f6cb2d7", size = 4010347, upload-time = "2025-09-22T04:02:26.286Z" }, + { url = "https://files.pythonhosted.org/packages/31/ef/dcf1d29c3f530577f61e5fe2f1bd72929acf779953668a8a47a479ae6f26/lxml-6.0.2-cp313-cp313-win_arm64.whl", hash = "sha256:13dcecc9946dca97b11b7c40d29fba63b55ab4170d3c0cf8c0c164343b9bfdcf", size = 3671248, upload-time = "2025-09-22T04:02:27.918Z" }, { url = "https://files.pythonhosted.org/packages/03/15/d4a377b385ab693ce97b472fe0c77c2b16ec79590e688b3ccc71fba19884/lxml-6.0.2-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:b0c732aa23de8f8aec23f4b580d1e52905ef468afb4abeafd3fec77042abb6fe", size = 8659801, upload-time = "2025-09-22T04:02:30.113Z" }, { url = "https://files.pythonhosted.org/packages/c8/e8/c128e37589463668794d503afaeb003987373c5f94d667124ffd8078bbd9/lxml-6.0.2-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:4468e3b83e10e0317a89a33d28f7aeba1caa4d1a6fd457d115dd4ffe90c5931d", size = 4659403, upload-time = "2025-09-22T04:02:32.119Z" }, + { url = "https://files.pythonhosted.org/packages/00/ce/74903904339decdf7da7847bb5741fc98a5451b42fc419a86c0c13d26fe2/lxml-6.0.2-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:abd44571493973bad4598a3be7e1d807ed45aa2adaf7ab92ab7c62609569b17d", size = 4966974, upload-time = "2025-09-22T04:02:34.155Z" }, + { url = "https://files.pythonhosted.org/packages/1f/d3/131dec79ce61c5567fecf82515bd9bc36395df42501b50f7f7f3bd065df0/lxml-6.0.2-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:370cd78d5855cfbffd57c422851f7d3864e6ae72d0da615fca4dad8c45d375a5", size = 5102953, upload-time = "2025-09-22T04:02:36.054Z" }, + { url = "https://files.pythonhosted.org/packages/3a/ea/a43ba9bb750d4ffdd885f2cd333572f5bb900cd2408b67fdda07e85978a0/lxml-6.0.2-cp314-cp314-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:901e3b4219fa04ef766885fb40fa516a71662a4c61b80c94d25336b4934b71c0", size = 5055054, upload-time = "2025-09-22T04:02:38.154Z" }, + { url = 
"https://files.pythonhosted.org/packages/60/23/6885b451636ae286c34628f70a7ed1fcc759f8d9ad382d132e1c8d3d9bfd/lxml-6.0.2-cp314-cp314-manylinux_2_26_i686.manylinux_2_28_i686.whl", hash = "sha256:a4bf42d2e4cf52c28cc1812d62426b9503cdb0c87a6de81442626aa7d69707ba", size = 5352421, upload-time = "2025-09-22T04:02:40.413Z" }, + { url = "https://files.pythonhosted.org/packages/48/5b/fc2ddfc94ddbe3eebb8e9af6e3fd65e2feba4967f6a4e9683875c394c2d8/lxml-6.0.2-cp314-cp314-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:b2c7fdaa4d7c3d886a42534adec7cfac73860b89b4e5298752f60aa5984641a0", size = 5673684, upload-time = "2025-09-22T04:02:42.288Z" }, + { url = "https://files.pythonhosted.org/packages/29/9c/47293c58cc91769130fbf85531280e8cc7868f7fbb6d92f4670071b9cb3e/lxml-6.0.2-cp314-cp314-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:98a5e1660dc7de2200b00d53fa00bcd3c35a3608c305d45a7bbcaf29fa16e83d", size = 5252463, upload-time = "2025-09-22T04:02:44.165Z" }, + { url = "https://files.pythonhosted.org/packages/9b/da/ba6eceb830c762b48e711ded880d7e3e89fc6c7323e587c36540b6b23c6b/lxml-6.0.2-cp314-cp314-manylinux_2_31_armv7l.whl", hash = "sha256:dc051506c30b609238d79eda75ee9cab3e520570ec8219844a72a46020901e37", size = 4698437, upload-time = "2025-09-22T04:02:46.524Z" }, + { url = "https://files.pythonhosted.org/packages/a5/24/7be3f82cb7990b89118d944b619e53c656c97dc89c28cfb143fdb7cd6f4d/lxml-6.0.2-cp314-cp314-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:8799481bbdd212470d17513a54d568f44416db01250f49449647b5ab5b5dccb9", size = 5269890, upload-time = "2025-09-22T04:02:48.812Z" }, + { url = "https://files.pythonhosted.org/packages/1b/bd/dcfb9ea1e16c665efd7538fc5d5c34071276ce9220e234217682e7d2c4a5/lxml-6.0.2-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:9261bb77c2dab42f3ecd9103951aeca2c40277701eb7e912c545c1b16e0e4917", size = 5097185, upload-time = "2025-09-22T04:02:50.746Z" }, + { url = "https://files.pythonhosted.org/packages/21/04/a60b0ff9314736316f28316b694bccbbabe100f8483ad83852d77fc7468e/lxml-6.0.2-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:65ac4a01aba353cfa6d5725b95d7aed6356ddc0a3cd734de00124d285b04b64f", size = 4745895, upload-time = "2025-09-22T04:02:52.968Z" }, + { url = "https://files.pythonhosted.org/packages/d6/bd/7d54bd1846e5a310d9c715921c5faa71cf5c0853372adf78aee70c8d7aa2/lxml-6.0.2-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:b22a07cbb82fea98f8a2fd814f3d1811ff9ed76d0fc6abc84eb21527596e7cc8", size = 5695246, upload-time = "2025-09-22T04:02:54.798Z" }, + { url = "https://files.pythonhosted.org/packages/fd/32/5643d6ab947bc371da21323acb2a6e603cedbe71cb4c99c8254289ab6f4e/lxml-6.0.2-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:d759cdd7f3e055d6bc8d9bec3ad905227b2e4c785dc16c372eb5b5e83123f48a", size = 5260797, upload-time = "2025-09-22T04:02:57.058Z" }, + { url = "https://files.pythonhosted.org/packages/33/da/34c1ec4cff1eea7d0b4cd44af8411806ed943141804ac9c5d565302afb78/lxml-6.0.2-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:945da35a48d193d27c188037a05fec5492937f66fb1958c24fc761fb9d40d43c", size = 5277404, upload-time = "2025-09-22T04:02:58.966Z" }, + { url = "https://files.pythonhosted.org/packages/82/57/4eca3e31e54dc89e2c3507e1cd411074a17565fa5ffc437c4ae0a00d439e/lxml-6.0.2-cp314-cp314-win32.whl", hash = "sha256:be3aaa60da67e6153eb15715cc2e19091af5dc75faef8b8a585aea372507384b", size = 3670072, upload-time = "2025-09-22T04:03:38.05Z" }, + { url = 
"https://files.pythonhosted.org/packages/e3/e0/c96cf13eccd20c9421ba910304dae0f619724dcf1702864fd59dd386404d/lxml-6.0.2-cp314-cp314-win_amd64.whl", hash = "sha256:fa25afbadead523f7001caf0c2382afd272c315a033a7b06336da2637d92d6ed", size = 4080617, upload-time = "2025-09-22T04:03:39.835Z" }, + { url = "https://files.pythonhosted.org/packages/d5/5d/b3f03e22b3d38d6f188ef044900a9b29b2fe0aebb94625ce9fe244011d34/lxml-6.0.2-cp314-cp314-win_arm64.whl", hash = "sha256:063eccf89df5b24e361b123e257e437f9e9878f425ee9aae3144c77faf6da6d8", size = 3754930, upload-time = "2025-09-22T04:03:41.565Z" }, { url = "https://files.pythonhosted.org/packages/5e/5c/42c2c4c03554580708fc738d13414801f340c04c3eff90d8d2d227145275/lxml-6.0.2-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:6162a86d86893d63084faaf4ff937b3daea233e3682fb4474db07395794fa80d", size = 8910380, upload-time = "2025-09-22T04:03:01.645Z" }, { url = "https://files.pythonhosted.org/packages/bf/4f/12df843e3e10d18d468a7557058f8d3733e8b6e12401f30b1ef29360740f/lxml-6.0.2-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:414aaa94e974e23a3e92e7ca5b97d10c0cf37b6481f50911032c69eeb3991bba", size = 4775632, upload-time = "2025-09-22T04:03:03.814Z" }, + { url = "https://files.pythonhosted.org/packages/e4/0c/9dc31e6c2d0d418483cbcb469d1f5a582a1cd00a1f4081953d44051f3c50/lxml-6.0.2-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:48461bd21625458dd01e14e2c38dd0aea69addc3c4f960c30d9f59d7f93be601", size = 4975171, upload-time = "2025-09-22T04:03:05.651Z" }, + { url = "https://files.pythonhosted.org/packages/e7/2b/9b870c6ca24c841bdd887504808f0417aa9d8d564114689266f19ddf29c8/lxml-6.0.2-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:25fcc59afc57d527cfc78a58f40ab4c9b8fd096a9a3f964d2781ffb6eb33f4ed", size = 5110109, upload-time = "2025-09-22T04:03:07.452Z" }, + { url = "https://files.pythonhosted.org/packages/bf/0c/4f5f2a4dd319a178912751564471355d9019e220c20d7db3fb8307ed8582/lxml-6.0.2-cp314-cp314t-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5179c60288204e6ddde3f774a93350177e08876eaf3ab78aa3a3649d43eb7d37", size = 5041061, upload-time = "2025-09-22T04:03:09.297Z" }, + { url = "https://files.pythonhosted.org/packages/12/64/554eed290365267671fe001a20d72d14f468ae4e6acef1e179b039436967/lxml-6.0.2-cp314-cp314t-manylinux_2_26_i686.manylinux_2_28_i686.whl", hash = "sha256:967aab75434de148ec80597b75062d8123cadf2943fb4281f385141e18b21338", size = 5306233, upload-time = "2025-09-22T04:03:11.651Z" }, + { url = "https://files.pythonhosted.org/packages/7a/31/1d748aa275e71802ad9722df32a7a35034246b42c0ecdd8235412c3396ef/lxml-6.0.2-cp314-cp314t-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:d100fcc8930d697c6561156c6810ab4a508fb264c8b6779e6e61e2ed5e7558f9", size = 5604739, upload-time = "2025-09-22T04:03:13.592Z" }, + { url = "https://files.pythonhosted.org/packages/8f/41/2c11916bcac09ed561adccacceaedd2bf0e0b25b297ea92aab99fd03d0fa/lxml-6.0.2-cp314-cp314t-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2ca59e7e13e5981175b8b3e4ab84d7da57993eeff53c07764dcebda0d0e64ecd", size = 5225119, upload-time = "2025-09-22T04:03:15.408Z" }, + { url = "https://files.pythonhosted.org/packages/99/05/4e5c2873d8f17aa018e6afde417c80cc5d0c33be4854cce3ef5670c49367/lxml-6.0.2-cp314-cp314t-manylinux_2_31_armv7l.whl", hash = "sha256:957448ac63a42e2e49531b9d6c0fa449a1970dbc32467aaad46f11545be9af1d", size = 4633665, upload-time = "2025-09-22T04:03:17.262Z" }, + { url = 
"https://files.pythonhosted.org/packages/0f/c9/dcc2da1bebd6275cdc723b515f93edf548b82f36a5458cca3578bc899332/lxml-6.0.2-cp314-cp314t-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:b7fc49c37f1786284b12af63152fe1d0990722497e2d5817acfe7a877522f9a9", size = 5234997, upload-time = "2025-09-22T04:03:19.14Z" }, + { url = "https://files.pythonhosted.org/packages/9c/e2/5172e4e7468afca64a37b81dba152fc5d90e30f9c83c7c3213d6a02a5ce4/lxml-6.0.2-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:e19e0643cc936a22e837f79d01a550678da8377d7d801a14487c10c34ee49c7e", size = 5090957, upload-time = "2025-09-22T04:03:21.436Z" }, + { url = "https://files.pythonhosted.org/packages/a5/b3/15461fd3e5cd4ddcb7938b87fc20b14ab113b92312fc97afe65cd7c85de1/lxml-6.0.2-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:1db01e5cf14345628e0cbe71067204db658e2fb8e51e7f33631f5f4735fefd8d", size = 4764372, upload-time = "2025-09-22T04:03:23.27Z" }, + { url = "https://files.pythonhosted.org/packages/05/33/f310b987c8bf9e61c4dd8e8035c416bd3230098f5e3cfa69fc4232de7059/lxml-6.0.2-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:875c6b5ab39ad5291588aed6925fac99d0097af0dd62f33c7b43736043d4a2ec", size = 5634653, upload-time = "2025-09-22T04:03:25.767Z" }, + { url = "https://files.pythonhosted.org/packages/70/ff/51c80e75e0bc9382158133bdcf4e339b5886c6ee2418b5199b3f1a61ed6d/lxml-6.0.2-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:cdcbed9ad19da81c480dfd6dd161886db6096083c9938ead313d94b30aadf272", size = 5233795, upload-time = "2025-09-22T04:03:27.62Z" }, + { url = "https://files.pythonhosted.org/packages/56/4d/4856e897df0d588789dd844dbed9d91782c4ef0b327f96ce53c807e13128/lxml-6.0.2-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:80dadc234ebc532e09be1975ff538d154a7fa61ea5031c03d25178855544728f", size = 5257023, upload-time = "2025-09-22T04:03:30.056Z" }, + { url = "https://files.pythonhosted.org/packages/0f/85/86766dfebfa87bea0ab78e9ff7a4b4b45225df4b4d3b8cc3c03c5cd68464/lxml-6.0.2-cp314-cp314t-win32.whl", hash = "sha256:da08e7bb297b04e893d91087df19638dc7a6bb858a954b0cc2b9f5053c922312", size = 3911420, upload-time = "2025-09-22T04:03:32.198Z" }, + { url = "https://files.pythonhosted.org/packages/fe/1a/b248b355834c8e32614650b8008c69ffeb0ceb149c793961dd8c0b991bb3/lxml-6.0.2-cp314-cp314t-win_amd64.whl", hash = "sha256:252a22982dca42f6155125ac76d3432e548a7625d56f5a273ee78a5057216eca", size = 4406837, upload-time = "2025-09-22T04:03:34.027Z" }, + { url = "https://files.pythonhosted.org/packages/92/aa/df863bcc39c5e0946263454aba394de8a9084dbaff8ad143846b0d844739/lxml-6.0.2-cp314-cp314t-win_arm64.whl", hash = "sha256:bb4c1847b303835d89d785a18801a883436cdfd5dc3d62947f9c49e24f0f5a2c", size = 3822205, upload-time = "2025-09-22T04:03:36.249Z" }, { url = "https://files.pythonhosted.org/packages/0b/11/29d08bc103a62c0eba8016e7ed5aeebbf1e4312e83b0b1648dd203b0e87d/lxml-6.0.2-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:1c06035eafa8404b5cf475bb37a9f6088b0aca288d4ccc9d69389750d5543700", size = 3949829, upload-time = "2025-09-22T04:04:45.608Z" }, + { url = "https://files.pythonhosted.org/packages/12/b3/52ab9a3b31e5ab8238da241baa19eec44d2ab426532441ee607165aebb52/lxml-6.0.2-pp311-pypy311_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:c7d13103045de1bdd6fe5d61802565f1a3537d70cd3abf596aa0af62761921ee", size = 4226277, upload-time = "2025-09-22T04:04:47.754Z" }, + { url = 
"https://files.pythonhosted.org/packages/a0/33/1eaf780c1baad88224611df13b1c2a9dfa460b526cacfe769103ff50d845/lxml-6.0.2-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:0a3c150a95fbe5ac91de323aa756219ef9cf7fde5a3f00e2281e30f33fa5fa4f", size = 4330433, upload-time = "2025-09-22T04:04:49.907Z" }, + { url = "https://files.pythonhosted.org/packages/7a/c1/27428a2ff348e994ab4f8777d3a0ad510b6b92d37718e5887d2da99952a2/lxml-6.0.2-pp311-pypy311_pp73-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:60fa43be34f78bebb27812ed90f1925ec99560b0fa1decdb7d12b84d857d31e9", size = 4272119, upload-time = "2025-09-22T04:04:51.801Z" }, + { url = "https://files.pythonhosted.org/packages/f0/d0/3020fa12bcec4ab62f97aab026d57c2f0cfd480a558758d9ca233bb6a79d/lxml-6.0.2-pp311-pypy311_pp73-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:21c73b476d3cfe836be731225ec3421fa2f048d84f6df6a8e70433dff1376d5a", size = 4417314, upload-time = "2025-09-22T04:04:55.024Z" }, + { url = "https://files.pythonhosted.org/packages/6c/77/d7f491cbc05303ac6801651aabeb262d43f319288c1ea96c66b1d2692ff3/lxml-6.0.2-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:27220da5be049e936c3aca06f174e8827ca6445a4353a1995584311487fc4e3e", size = 3518768, upload-time = "2025-09-22T04:04:57.097Z" }, ] [[package]] @@ -854,6 +1089,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/70/81/54e3ce63502cd085a0c556652a4e1b919c45a446bd1e5300e10c44c8c521/markdown-3.10-py3-none-any.whl", hash = "sha256:b5b99d6951e2e4948d939255596523444c0e677c669700b1d17aa4a8a464cb7c", size = 107678, upload-time = "2025-11-03T19:51:13.887Z" }, ] +[[package]] +name = "markdown-it-py" +version = "4.0.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "mdurl" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/5b/f5/4ec618ed16cc4f8fb3b701563655a69816155e79e24a17b651541804721d/markdown_it_py-4.0.0.tar.gz", hash = "sha256:cb0a2b4aa34f932c007117b194e945bd74e0ec24133ceb5bac59009cda1cb9f3", size = 73070, upload-time = "2025-08-11T12:57:52.854Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/94/54/e7d793b573f298e1c9013b8c4dade17d481164aa517d1d7148619c2cedbf/markdown_it_py-4.0.0-py3-none-any.whl", hash = "sha256:87327c59b172c5011896038353a81343b6754500a08cd7a4973bb48c6d578147", size = 87321, upload-time = "2025-08-11T12:57:51.923Z" }, +] + [[package]] name = "markupsafe" version = "3.0.3" @@ -953,6 +1200,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e2/fc/6dc7659c2ae5ddf280477011f4213a74f806862856b796ef08f028e664bf/mcp-1.25.0-py3-none-any.whl", hash = "sha256:b37c38144a666add0862614cc79ec276e97d72aa8ca26d622818d4e278b9721a", size = 233076, upload-time = "2025-12-19T10:19:55.416Z" }, ] +[[package]] +name = "mdurl" +version = "0.1.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d6/54/cfe61301667036ec958cb99bd3efefba235e65cdeb9c84d24a8293ba1d90/mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba", size = 8729, upload-time = "2022-08-14T12:40:10.846Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979, upload-time = "2022-08-14T12:40:09.779Z" }, +] + [[package]] name = "mergedeep" version = "1.3.4" @@ -1223,6 +1479,25 @@ wheels 
= [ { url = "https://files.pythonhosted.org/packages/2d/ee/346fa473e666fe14c52fcdd19ec2424157290a032d4c41f98127bfb31ac7/numpy-2.3.5-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:f16417ec91f12f814b10bafe79ef77e70113a2f5f7018640e7425ff979253425", size = 12967213, upload-time = "2025-11-16T22:52:39.38Z" }, ] +[[package]] +name = "openai" +version = "2.15.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, + { name = "distro" }, + { name = "httpx" }, + { name = "jiter" }, + { name = "pydantic" }, + { name = "sniffio" }, + { name = "tqdm" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/94/f4/4690ecb5d70023ce6bfcfeabfe717020f654bde59a775058ec6ac4692463/openai-2.15.0.tar.gz", hash = "sha256:42eb8cbb407d84770633f31bf727d4ffb4138711c670565a41663d9439174fba", size = 627383, upload-time = "2026-01-09T22:10:08.603Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b5/df/c306f7375d42bafb379934c2df4c2fa3964656c8c782bac75ee10c102818/openai-2.15.0-py3-none-any.whl", hash = "sha256:6ae23b932cd7230f7244e52954daa6602716d6b9bf235401a107af731baea6c3", size = 1067879, upload-time = "2026-01-09T22:10:06.446Z" }, +] + [[package]] name = "openpyxl" version = "3.1.5" @@ -1623,6 +1898,21 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/93/78/b93cb80bd673bdc9f6ede63d8eb5b4646366953df15667eb3603be57a2b1/pymdown_extensions-10.17.2-py3-none-any.whl", hash = "sha256:bffae79a2e8b9e44aef0d813583a8fea63457b7a23643a43988055b7b79b4992", size = 266556, upload-time = "2025-11-26T15:43:55.162Z" }, ] +[[package]] +name = "pymupdf" +version = "1.26.7" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/48/d6/09b28f027b510838559f7748807192149c419b30cb90e6d5f0cf916dc9dc/pymupdf-1.26.7.tar.gz", hash = "sha256:71add8bdc8eb1aaa207c69a13400693f06ad9b927bea976f5d5ab9df0bb489c3", size = 84327033, upload-time = "2025-12-11T21:48:50.694Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/94/35/cd74cea1787b2247702ef8522186bdef32e9cb30a099e6bb864627ef6045/pymupdf-1.26.7-cp310-abi3-macosx_10_9_x86_64.whl", hash = "sha256:07085718dfdae5ab83b05eb5eb397f863bcc538fe05135318a01ea353e7a1353", size = 23179369, upload-time = "2025-12-11T21:47:21.587Z" }, + { url = "https://files.pythonhosted.org/packages/72/74/448b6172927c829c6a3fba80078d7b0a016ebbe2c9ee528821f5ea21677a/pymupdf-1.26.7-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:31aa9c8377ea1eea02934b92f4dcf79fb2abba0bf41f8a46d64c3e31546a3c02", size = 22470101, upload-time = "2025-12-11T21:47:37.105Z" }, + { url = "https://files.pythonhosted.org/packages/65/e7/47af26f3ac76be7ac3dd4d6cc7ee105948a8355d774e5ca39857bf91c11c/pymupdf-1.26.7-cp310-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:e419b609996434a14a80fa060adec72c434a1cca6a511ec54db9841bc5d51b3c", size = 23502486, upload-time = "2025-12-12T09:51:25.824Z" }, + { url = "https://files.pythonhosted.org/packages/2a/6b/3de1714d734ff949be1e90a22375d0598d3540b22ae73eb85c2d7d1f36a9/pymupdf-1.26.7-cp310-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:69dfc78f206a96e5b3ac22741263ebab945fdf51f0dbe7c5757c3511b23d9d72", size = 24115727, upload-time = "2025-12-11T21:47:51.274Z" }, + { url = "https://files.pythonhosted.org/packages/62/9b/f86224847949577a523be2207315ae0fd3155b5d909cd66c274d095349a3/pymupdf-1.26.7-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:1d5106f46e1ca0d64d46bd51892372a4f82076bdc14a9678d33d630702abca36", size = 24324386, upload-time = 
"2025-12-12T14:58:45.483Z" }, + { url = "https://files.pythonhosted.org/packages/85/8e/a117d39092ca645fde8b903f4a941d9aa75b370a67b4f1f435f56393dc5a/pymupdf-1.26.7-cp310-abi3-win32.whl", hash = "sha256:7c9645b6f5452629c747690190350213d3e5bbdb6b2eca227d82702b327f6eee", size = 17203888, upload-time = "2025-12-12T13:59:57.613Z" }, + { url = "https://files.pythonhosted.org/packages/dd/c3/d0047678146c294469c33bae167c8ace337deafb736b0bf97b9bc481aa65/pymupdf-1.26.7-cp310-abi3-win_amd64.whl", hash = "sha256:425b1befe40d41b72eb0fe211711c7ae334db5eb60307e9dd09066ed060cceba", size = 18405952, upload-time = "2025-12-11T21:48:02.947Z" }, +] + [[package]] name = "pypdfium2" version = "5.1.0" @@ -1861,6 +2151,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/bb/ad/fdd56219f0e320293c513ef0b3cdd018802a1bcfdb29ed9bc0c3bcb97f31/responses-0.21.0-py3-none-any.whl", hash = "sha256:2dcc863ba63963c0c3d9ee3fa9507cbe36b7d7b0fccb4f0bdfd9e96c539b1487", size = 45987, upload-time = "2022-05-25T14:20:48.508Z" }, ] +[[package]] +name = "rich" +version = "14.2.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "markdown-it-py" }, + { name = "pygments" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/fb/d2/8920e102050a0de7bfabeb4c4614a49248cf8d5d7a8d01885fbb24dc767a/rich-14.2.0.tar.gz", hash = "sha256:73ff50c7c0c1c77c8243079283f4edb376f0f6442433aecb8ce7e6d0b92d1fe4", size = 219990, upload-time = "2025-10-09T14:16:53.064Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/25/7a/b0178788f8dc6cafce37a212c99565fa1fe7872c70c6c9c1e1a372d9d88f/rich-14.2.0-py3-none-any.whl", hash = "sha256:76bc51fe2e57d2b1be1f96c524b890b816e334ab4c1e45888799bfaab0021edd", size = 243393, upload-time = "2025-10-09T14:16:51.245Z" }, +] + [[package]] name = "rpds-py" version = "0.30.0" @@ -2079,6 +2382,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/4d/19/8d77f9992e5cbfcaa9133c3bf63b4fbbb051248802e1e803fed5c552fbb2/sentry_sdk-2.48.0-py2.py3-none-any.whl", hash = "sha256:6b12ac256769d41825d9b7518444e57fa35b5642df4c7c5e322af4d2c8721172", size = 414555, upload-time = "2025-12-16T14:55:40.152Z" }, ] +[[package]] +name = "shellingham" +version = "1.5.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/58/15/8b3609fd3830ef7b27b655beb4b4e9c62313a4e8da8c676e142cc210d58e/shellingham-1.5.4.tar.gz", hash = "sha256:8dbca0739d487e5bd35ab3ca4b36e11c4078f3a234bfce294b0a0291363404de", size = 10310, upload-time = "2023-10-24T04:13:40.426Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e0/f9/0595336914c5619e5f28a1fb793285925a8cd4b432c9da0a987836c7f822/shellingham-1.5.4-py2.py3-none-any.whl", hash = "sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686", size = 9755, upload-time = "2023-10-24T04:13:38.866Z" }, +] + [[package]] name = "six" version = "1.17.0" @@ -2088,6 +2400,24 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050, upload-time = "2024-12-04T17:35:26.475Z" }, ] +[[package]] +name = "sniffio" +version = "1.3.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a2/87/a6771e1546d97e7e041b6ae58d80074f81b7d5121207425c964ddf5cfdbd/sniffio-1.3.1.tar.gz", hash = 
"sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc", size = 20372, upload-time = "2024-02-25T23:20:04.057Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" }, +] + +[[package]] +name = "soupsieve" +version = "2.8.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/7b/ae/2d9c981590ed9999a0d91755b47fc74f74de286b0f5cee14c9269041e6c4/soupsieve-2.8.3.tar.gz", hash = "sha256:3267f1eeea4251fb42728b6dfb746edc9acaffc4a45b27e19450b676586e8349", size = 118627, upload-time = "2026-01-20T04:27:02.457Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/46/2c/1462b1d0a634697ae9e55b3cecdcb64788e8b7d63f54d923fcd0bb140aed/soupsieve-2.8.3-py3-none-any.whl", hash = "sha256:ed64f2ba4eebeab06cc4962affce381647455978ffc1e36bb79a545b91f45a95", size = 37016, upload-time = "2026-01-20T04:27:01.012Z" }, +] + [[package]] name = "sse-starlette" version = "3.2.0" @@ -2238,6 +2568,33 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/77/b8/0135fadc89e73be292b473cb820b4f5a08197779206b33191e801feeae40/tomli-2.3.0-py3-none-any.whl", hash = "sha256:e95b1af3c5b07d9e643909b5abbec77cd9f1217e6d0bca72b0234736b9fb1f1b", size = 14408, upload-time = "2025-10-08T22:01:46.04Z" }, ] +[[package]] +name = "tqdm" +version = "4.67.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a8/4b/29b4ef32e036bb34e4ab51796dd745cdba7ed47ad142a9f4a1eb8e0c744d/tqdm-4.67.1.tar.gz", hash = "sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2", size = 169737, upload-time = "2024-11-24T20:12:22.481Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d0/30/dc54f88dd4a2b5dc8a0279bdd7270e735851848b762aeb1c1184ed1f6b14/tqdm-4.67.1-py3-none-any.whl", hash = "sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2", size = 78540, upload-time = "2024-11-24T20:12:19.698Z" }, +] + +[[package]] +name = "typer" +version = "0.21.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "click" }, + { name = "rich" }, + { name = "shellingham" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/36/bf/8825b5929afd84d0dabd606c67cd57b8388cb3ec385f7ef19c5cc2202069/typer-0.21.1.tar.gz", hash = "sha256:ea835607cd752343b6b2b7ce676893e5a0324082268b48f27aa058bdb7d2145d", size = 110371, upload-time = "2026-01-06T11:21:10.989Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a0/1d/d9257dd49ff2ca23ea5f132edf1281a0c4f9de8a762b9ae399b670a59235/typer-0.21.1-py3-none-any.whl", hash = "sha256:7985e89081c636b88d172c2ee0cfe33c253160994d47bdfdc302defd7d1f1d01", size = 47381, upload-time = "2026-01-06T11:21:09.824Z" }, +] + [[package]] name = "typing-extensions" version = "4.15.0" From f51515370479545c7e416b72d1808fef1765830d Mon Sep 17 00:00:00 2001 From: harumiWeb Date: Sat, 24 Jan 2026 15:17:08 +0900 Subject: [PATCH 02/38] feat: Add manifest and truth data for application forms, flowcharts, and Gantt charts --- benchmark/data/manifest.json | 37 ++++++++++++++++++++++++++ benchmark/data/truth/ffr_425_01.json | 12 +++++++++ 
 benchmark/data/truth/flowchart_01.json | 14 ++++++++++
 benchmark/data/truth/gantt_01.json     | 24 +++++++++++++++++
 4 files changed, 87 insertions(+)
 create mode 100644 benchmark/data/manifest.json
 create mode 100644 benchmark/data/truth/ffr_425_01.json
 create mode 100644 benchmark/data/truth/flowchart_01.json
 create mode 100644 benchmark/data/truth/gantt_01.json

diff --git a/benchmark/data/manifest.json b/benchmark/data/manifest.json
new file mode 100644
index 0000000..2e34660
--- /dev/null
+++ b/benchmark/data/manifest.json
@@ -0,0 +1,37 @@
+{
+  "cases": [
+    {
+      "id": "ffr_425_01",
+      "type": "application_form",
+      "xlsx": "data/raw/ffr_425_01.xlsx",
+      "question": "このExcel申請書(Federal Financial Report / SF-425)から、次の情報を抽出し、JSONのみで返してください。\n\n抽出項目:\n1. item_1_agency: 「1. Federal Agency and Organizational Element」に記載されている提出先機関名\n2. item_6_report_type: チェックされている Report Type(Quarterly / Semi-Annual / Annual / Final のいずれか)\n3. item_7_accounting_basis: Basis of Accounting(Cash または Accrual)\n4. item_9_reporting_period_end: Reporting Period End Date(YYYY-MM-DD形式、未記入の場合は null)\n5. not_required_by_epa_items: \"Not Required by EPA\" と明示されている取引項目の一覧(項目名の配列)\n\n出力形式(厳守):\n{\n \"item_1_agency\": \"...\",\n \"item_6_report_type\": \"...\",\n \"item_7_accounting_basis\": \"...\",\n \"item_9_reporting_period_end\": \"YYYY-MM-DD\" | null,\n \"not_required_by_epa_items\": [\"...\", \"...\"]\n}",
+      "truth": "data/truth/ffr_425_01.json",
+      "sheet_scope": null,
+      "render": {
+        "dpi": 220,
+        "max_pages": 2
+      }
+    },
+    {
+      "id": "flowchart_image_01",
+      "type": "flowchart",
+      "xlsx": "data/raw/flowchart_image_01.xlsx",
+      "question": "このフローチャートの開始から終了までの主要な処理ステップを順番に抽出し、次のJSON形式のみで返してください。\n\n出力形式(厳守):\n{\n \"steps\": [\"step1\", \"step2\", \"step3\", ...]\n}\n\n注意事項:\n- 開始ノードと終了ノードも含めてください\n- 分岐やループがある場合は、代表的な主経路として線形化してください\n- ステップ名は図中のラベル文字列をそのまま使用してください",
+      "truth": "data/truth/flowchart_01.json",
+      "sheet_scope": null,
+      "render": {
+        "dpi": 220,
+        "max_pages": 1
+      }
+    },
+    {
+      "id": "gantt_01",
+      "type": "gantt",
+      "xlsx": "data/raw/gantt_01.xlsx",
+      "question": "このガントチャートのPhase3のタスク名とその開始日、終了日を抽出し、次のJSON形式のみで返してください: {\"tasks\":[{\"name\":\"...\",\"start_date\":\"YYYY-MM-DD\",\"end_date\":\"YYYY-MM-DD\"}, ...]}",
+      "truth": "data/truth/gantt_01.json",
+      "sheet_scope": null,
+      "render": { "dpi": 200, "max_pages": 4 }
+    }
+  ]
+}
diff --git a/benchmark/data/truth/ffr_425_01.json b/benchmark/data/truth/ffr_425_01.json
new file mode 100644
index 0000000..e356d9d
--- /dev/null
+++ b/benchmark/data/truth/ffr_425_01.json
@@ -0,0 +1,12 @@
+{
+  "item_1_agency": "United States Environmental Protection Agency",
+  "item_6_report_type": "Quarterly",
+  "item_7_accounting_basis": "Cash",
+  "item_9_reporting_period_end": null,
+  "not_required_by_epa_items": [
+    "Federal Cash",
+    "Cash Receipts",
+    "Cash Disbursements",
+    "Cash on Hand"
+  ]
+}
diff --git a/benchmark/data/truth/flowchart_01.json b/benchmark/data/truth/flowchart_01.json
new file mode 100644
index 0000000..deff5bb
--- /dev/null
+++ b/benchmark/data/truth/flowchart_01.json
@@ -0,0 +1,14 @@
+{
+  "steps": [
+    "開始",
+    "要件抽出",
+    "ヒアリング",
+    "非機能要件",
+    "思考実験",
+    "再検証",
+    "まとめ",
+    "文書作成",
+    "締結",
+    "終了"
+  ]
+}
diff --git a/benchmark/data/truth/gantt_01.json b/benchmark/data/truth/gantt_01.json
new file mode 100644
index 0000000..aebdd03
--- /dev/null
+++ b/benchmark/data/truth/gantt_01.json
@@ -0,0 +1,24 @@
+{
+  "tasks": [
+    {
+      "name": "Core Feature Dev",
+      "start_date": "2026-01-26",
+      "end_date": "2026-02-03"
+    },
+    {
+      "name": "Edge Case Handling",
+      "start_date": "2026-01-27",
"2026-01-27", + "end_date": "2026-02-03" + }, + { + "name": "Integration Work", + "start_date": "2026-01-29", + "end_date": "2026-02-04" + }, + { + "name": "Internal Review", + "start_date": "2026-02-01", + "end_date": "2026-02-04" + } + ] +} From 09164a4bc28881ef5a18a90a1e08d61a98d8609f Mon Sep 17 00:00:00 2001 From: harumiWeb Date: Sat, 24 Jan 2026 15:32:58 +0900 Subject: [PATCH 03/38] fix --- benchmark/src/bench/cli.py | 8 ++++++++ benchmark/src/{ => bench}/eval/__init__.py | 0 benchmark/src/{ => bench}/eval/exact_match.py | 0 benchmark/src/{ => bench}/eval/normalize.py | 0 benchmark/src/{ => bench}/eval/report.py | 0 benchmark/src/{ => bench}/llm/__init__.py | 0 benchmark/src/{ => bench}/llm/openai_client.py | 0 benchmark/src/{ => bench}/llm/pricing.py | 0 benchmark/src/{ => bench}/pipeline/__init__.py | 0 benchmark/src/{ => bench}/pipeline/common.py | 0 benchmark/src/{ => bench}/pipeline/exstruct_adapter.py | 0 benchmark/src/{ => bench}/pipeline/html_text.py | 0 benchmark/src/{ => bench}/pipeline/image_render.py | 0 benchmark/src/{ => bench}/pipeline/openpyxl_pandas.py | 0 benchmark/src/{ => bench}/pipeline/pdf_text.py | 0 15 files changed, 8 insertions(+) rename benchmark/src/{ => bench}/eval/__init__.py (100%) rename benchmark/src/{ => bench}/eval/exact_match.py (100%) rename benchmark/src/{ => bench}/eval/normalize.py (100%) rename benchmark/src/{ => bench}/eval/report.py (100%) rename benchmark/src/{ => bench}/llm/__init__.py (100%) rename benchmark/src/{ => bench}/llm/openai_client.py (100%) rename benchmark/src/{ => bench}/llm/pricing.py (100%) rename benchmark/src/{ => bench}/pipeline/__init__.py (100%) rename benchmark/src/{ => bench}/pipeline/common.py (100%) rename benchmark/src/{ => bench}/pipeline/exstruct_adapter.py (100%) rename benchmark/src/{ => bench}/pipeline/html_text.py (100%) rename benchmark/src/{ => bench}/pipeline/image_render.py (100%) rename benchmark/src/{ => bench}/pipeline/openpyxl_pandas.py (100%) rename benchmark/src/{ => bench}/pipeline/pdf_text.py (100%) diff --git a/benchmark/src/bench/cli.py b/benchmark/src/bench/cli.py index 3cd643d..6341287 100644 --- a/benchmark/src/bench/cli.py +++ b/benchmark/src/bench/cli.py @@ -245,3 +245,11 @@ def report() -> None: out_md = RESULTS_DIR / "report.md" out_md.write_text("\n".join(md_lines), encoding="utf-8") print(f"[green]Wrote {out_md}[/green]") + + +import typer + +app = typer.Typer() + +if __name__ == "__main__": + app() diff --git a/benchmark/src/eval/__init__.py b/benchmark/src/bench/eval/__init__.py similarity index 100% rename from benchmark/src/eval/__init__.py rename to benchmark/src/bench/eval/__init__.py diff --git a/benchmark/src/eval/exact_match.py b/benchmark/src/bench/eval/exact_match.py similarity index 100% rename from benchmark/src/eval/exact_match.py rename to benchmark/src/bench/eval/exact_match.py diff --git a/benchmark/src/eval/normalize.py b/benchmark/src/bench/eval/normalize.py similarity index 100% rename from benchmark/src/eval/normalize.py rename to benchmark/src/bench/eval/normalize.py diff --git a/benchmark/src/eval/report.py b/benchmark/src/bench/eval/report.py similarity index 100% rename from benchmark/src/eval/report.py rename to benchmark/src/bench/eval/report.py diff --git a/benchmark/src/llm/__init__.py b/benchmark/src/bench/llm/__init__.py similarity index 100% rename from benchmark/src/llm/__init__.py rename to benchmark/src/bench/llm/__init__.py diff --git a/benchmark/src/llm/openai_client.py b/benchmark/src/bench/llm/openai_client.py similarity index 100% rename from 
rename to benchmark/src/bench/llm/openai_client.py
diff --git a/benchmark/src/llm/pricing.py b/benchmark/src/bench/llm/pricing.py
similarity index 100%
rename from benchmark/src/llm/pricing.py
rename to benchmark/src/bench/llm/pricing.py
diff --git a/benchmark/src/pipeline/__init__.py b/benchmark/src/bench/pipeline/__init__.py
similarity index 100%
rename from benchmark/src/pipeline/__init__.py
rename to benchmark/src/bench/pipeline/__init__.py
diff --git a/benchmark/src/pipeline/common.py b/benchmark/src/bench/pipeline/common.py
similarity index 100%
rename from benchmark/src/pipeline/common.py
rename to benchmark/src/bench/pipeline/common.py
diff --git a/benchmark/src/pipeline/exstruct_adapter.py b/benchmark/src/bench/pipeline/exstruct_adapter.py
similarity index 100%
rename from benchmark/src/pipeline/exstruct_adapter.py
rename to benchmark/src/bench/pipeline/exstruct_adapter.py
diff --git a/benchmark/src/pipeline/html_text.py b/benchmark/src/bench/pipeline/html_text.py
similarity index 100%
rename from benchmark/src/pipeline/html_text.py
rename to benchmark/src/bench/pipeline/html_text.py
diff --git a/benchmark/src/pipeline/image_render.py b/benchmark/src/bench/pipeline/image_render.py
similarity index 100%
rename from benchmark/src/pipeline/image_render.py
rename to benchmark/src/bench/pipeline/image_render.py
diff --git a/benchmark/src/pipeline/openpyxl_pandas.py b/benchmark/src/bench/pipeline/openpyxl_pandas.py
similarity index 100%
rename from benchmark/src/pipeline/openpyxl_pandas.py
rename to benchmark/src/bench/pipeline/openpyxl_pandas.py
diff --git a/benchmark/src/pipeline/pdf_text.py b/benchmark/src/bench/pipeline/pdf_text.py
similarity index 100%
rename from benchmark/src/pipeline/pdf_text.py
rename to benchmark/src/bench/pipeline/pdf_text.py

From 1f0924db4593d8060dbf26d9763cbcfe86b6c1 Mon Sep 17 00:00:00 2001
From: harumiWeb
Date: Sat, 24 Jan 2026 16:03:18 +0900
Subject: [PATCH 04/38] feat: Update Makefile and README for exstruct installation; enhance paths and extraction logic

---
 benchmark/Makefile                          |   1 +
 benchmark/README.md                         |   1 +
 benchmark/pyproject.toml                    |   5 +
 benchmark/src/bench/cli.py                  | 266 +++++++++++++-----
 benchmark/src/bench/llm/openai_client.py    |  70 ++++-
 benchmark/src/bench/paths.py                |  15 +
 .../src/bench/pipeline/exstruct_adapter.py  |  80 ++++++
 uv.lock                                     |  13 +
 8 files changed, 370 insertions(+), 81 deletions(-)

diff --git a/benchmark/Makefile b/benchmark/Makefile
index 3024f35..8b33e84 100644
--- a/benchmark/Makefile
+++ b/benchmark/Makefile
@@ -2,6 +2,7 @@
 
 setup:
 	python -m pip install -U pip
+	pip install -e ..
 	pip install -e .
 
 extract:
diff --git a/benchmark/README.md b/benchmark/README.md
index e7735c6..a2d1bdc 100644
--- a/benchmark/README.md
+++ b/benchmark/README.md
@@ -19,6 +19,7 @@ This benchmark compares methods for answering questions about Excel documents us
 ```bash
 cd benchmark
 cp .env.example .env
+pip install -e .. # install exstruct from repo root
 pip install -e .
 ```
diff --git a/benchmark/pyproject.toml b/benchmark/pyproject.toml
index e390725..ee2dd34 100644
--- a/benchmark/pyproject.toml
+++ b/benchmark/pyproject.toml
@@ -6,6 +6,7 @@ readme = "README.md"
 requires-python = ">=3.11"
 dependencies = [
     "beautifulsoup4>=4.14.3",
+    "exstruct",
     "lxml>=6.0.2",
     "openai>=2.15.0",
     "openpyxl>=3.1.5",
@@ -14,6 +15,7 @@ dependencies = [
     "pymupdf>=1.26.7",
     "python-dotenv>=1.2.1",
     "rich>=14.2.0",
+    "tabulate>=0.9.0",
     "typer>=0.21.1",
 ]
 
@@ -24,3 +26,6 @@ exbench = "bench.cli:app"
 dev = [
     "ruff>=0.14.8",
 ]
+
+[tool.uv.sources]
+exstruct = { workspace = true }
diff --git a/benchmark/src/bench/cli.py b/benchmark/src/bench/cli.py
index 6341287..d0398e1 100644
--- a/benchmark/src/bench/cli.py
+++ b/benchmark/src/bench/cli.py
@@ -4,6 +4,7 @@
 from pathlib import Path
 from typing import Any
 
+from pydantic import BaseModel
 from rich import print
 from rich.console import Console
 import typer
@@ -13,7 +14,14 @@
 from .eval.report import write_results_csv
 from .llm.openai_client import OpenAIResponsesClient
 from .manifest import Case, load_manifest
-from .paths import DATA_DIR, EXTRACTED_DIR, PROMPTS_DIR, RESPONSES_DIR, RESULTS_DIR
+from .paths import (
+    DATA_DIR,
+    EXTRACTED_DIR,
+    PROMPTS_DIR,
+    RESPONSES_DIR,
+    RESULTS_DIR,
+    resolve_path,
+)
 from .pipeline.common import ensure_dir, sha256_text, write_json
 from .pipeline.exstruct_adapter import extract_exstruct
 from .pipeline.html_text import html_to_text, xlsx_to_html
@@ -24,35 +32,136 @@
 app = typer.Typer(add_completion=False)
 console = Console()
 
-METHODS_TEXT = ["exstruct", "openpyxl", "pdf", "html"]
-METHODS_ALL = ["exstruct", "openpyxl", "pdf", "html", "image_vlm"]
+METHODS_TEXT = ("exstruct", "openpyxl", "pdf", "html")
+METHODS_ALL = METHODS_TEXT + ("image_vlm",)
+
+
+class PromptRecord(BaseModel):
+    """Prompt metadata saved for each request."""
+
+    case_id: str
+    method: str
+    model: str
+    question: str
+    prompt_hash: str
+    images: list[str] | None = None
+
+
+class ResponseRecord(BaseModel):
+    """Response metadata saved for each request."""
+
+    case_id: str
+    method: str
+    model: str
+    prompt_hash: str
+    text: str
+    input_tokens: int
+    output_tokens: int
+    cost_usd: float
+    raw: dict[str, Any]
+
+
+class ResultRow(BaseModel):
+    """Evaluation row for CSV output."""
+
+    case_id: str
+    type: str
+    method: str
+    model: str | None
+    ok: bool
+    input_tokens: int
+    output_tokens: int
+    cost_usd: float
+    error: str | None
 
 
 def _manifest_path() -> Path:
+    """Return the path to the benchmark manifest.
+
+    Returns:
+        Path to manifest.json.
+    """
     return DATA_DIR / "manifest.json"
 
 
 def _select_cases(manifest_cases: list[Case], case: str) -> list[Case]:
+    """Select benchmark cases by id list or all.
+
+    Args:
+        manifest_cases: List of cases from the manifest.
+        case: Comma-separated case ids or "all".
+
+    Returns:
+        Filtered list of cases.
+    """
     if case == "all":
         return manifest_cases
-    ids = {c.strip() for c in case.split(",")}
+    ids = {c.strip() for c in case.split(",") if c.strip()}
    return [c for c in manifest_cases if c.id in ids]
 
 
 def _select_methods(method: str) -> list[str]:
+    """Select methods by list or all, validating against known methods.
+
+    Args:
+        method: Comma-separated method names or "all".
+
+    Returns:
+        Ordered list of validated methods.
+    """
     if method == "all":
-        return METHODS_ALL
-    return [m.strip() for m in method.split(",")]
+        selected = list(METHODS_ALL)
+    else:
+        selected = [m.strip() for m in method.split(",") if m.strip()]
+
+    seen: set[str] = set()
+    deduped = [m for m in selected if not (m in seen or seen.add(m))]
+    invalid = [m for m in deduped if m not in METHODS_ALL]
+    if invalid:
+        raise typer.BadParameter(
+            f"Unknown method(s): {', '.join(invalid)}. Allowed: {', '.join(METHODS_ALL)}"
+        )
+    if not deduped:
+        raise typer.BadParameter("No methods selected.")
+    return deduped
+
+
+def _resolve_case_path(path_str: str, *, case_id: str, label: str) -> Path | None:
+    """Resolve a manifest path, warning if missing.
+
+    Args:
+        path_str: Path string from the manifest.
+        case_id: Case identifier for log messages.
+        label: Label for the path type (e.g., "xlsx", "truth").
+
+    Returns:
+        Resolved Path if it exists, otherwise None.
+    """
+    resolved = resolve_path(path_str)
+    if resolved.exists():
+        return resolved
+    print(f"[yellow]skip: missing {label} for {case_id}: {resolved}[/yellow]")
+    return None
 
 
 @app.command()
 def extract(case: str = "all", method: str = "all") -> None:
+    """Extract contexts for selected cases and methods.
+
+    Args:
+        case: Comma-separated case ids or "all".
+        method: Comma-separated method names or "all".
+    """
     mf = load_manifest(_manifest_path())
     cases = _select_cases(mf.cases, case)
+    if not cases:
+        raise typer.BadParameter(f"No cases matched: {case}")
     methods = _select_methods(method)
 
     for c in cases:
-        xlsx = Path(c.xlsx)
+        xlsx = _resolve_case_path(c.xlsx, case_id=c.id, label="xlsx")
+        if not xlsx:
+            continue
         console.rule(f"EXTRACT {c.id} ({xlsx.name})")
 
         if "exstruct" in methods:
@@ -90,8 +199,17 @@
 def ask(case: str = "all", method: str = "all", model: str = "gpt-4o") -> None:
+    """Run LLM extraction against prepared contexts.
+
+    Args:
+        case: Comma-separated case ids or "all".
+        method: Comma-separated method names or "all".
+        model: OpenAI model name.
+    """
     mf = load_manifest(_manifest_path())
     cases = _select_cases(mf.cases, case)
+    if not cases:
+        raise typer.BadParameter(f"No cases matched: {case}")
     methods = _select_methods(method)
 
     client = OpenAIResponsesClient()
@@ -103,57 +221,69 @@
         q = c.question
 
         for m in methods:
-            prompt_rec: dict[str, Any] = {
-                "case_id": c.id,
-                "method": m,
-                "model": model,
-                "question": q,
-            }
-            resp_rec: dict[str, Any] = {"case_id": c.id, "method": m, "model": model}
-
             if m == "image_vlm":
                 img_dir = EXTRACTED_DIR / "image_vlm" / c.id
-                imgs = json.loads(
-                    (img_dir / "images.json").read_text(encoding="utf-8")
-                )["images"]
+                images_json = img_dir / "images.json"
+                if not images_json.exists():
+                    print(f"[yellow]skip: missing images for {c.id}[/yellow]")
+                    continue
+                imgs = json.loads(images_json.read_text(encoding="utf-8"))["images"]
                 img_paths = [Path(p) for p in imgs]
+                if not img_paths:
+                    print(f"[yellow]skip: no images for {c.id}[/yellow]")
+                    continue
                 prompt_hash = sha256_text(
                     q + "|" + "|".join([p.name for p in img_paths])
                 )
-                prompt_rec["prompt_hash"] = prompt_hash
-                prompt_rec["images"] = [p.name for p in img_paths]
-
+                prompt_rec = PromptRecord(
+                    case_id=c.id,
+                    method=m,
+                    model=model,
+                    question=q,
+                    prompt_hash=prompt_hash,
+                    images=[p.name for p in img_paths],
+                )
                 res = client.ask_images(model=model, question=q, image_paths=img_paths)
-
             else:
                 txt_path = EXTRACTED_DIR / m / f"{c.id}.txt"
+                if not txt_path.exists():
+                    print(f"[yellow]skip: missing context for {c.id} ({m})[/yellow]")
+                    continue
                 context = txt_path.read_text(encoding="utf-8")
                 prompt_hash = sha256_text(q + "|" + context)
-                prompt_rec["prompt_hash"] = prompt_hash
-
+                prompt_rec = PromptRecord(
+                    case_id=c.id,
+                    method=m,
+                    model=model,
+                    question=q,
+                    prompt_hash=prompt_hash,
+                )
                 res = client.ask_text(model=model, question=q, context_text=context)
 
-            # save prompt/response
             prompt_file = PROMPTS_DIR / f"{c.id}.jsonl"
             resp_file = RESPONSES_DIR / f"{c.id}.jsonl"
-
-            prompt_rec_line = json.dumps(prompt_rec, ensure_ascii=False)
-            resp_rec.update(
-                {
-                    "prompt_hash": prompt_hash,
-                    "text": res.text,
-                    "input_tokens": res.input_tokens,
-                    "output_tokens": res.output_tokens,
-                    "cost_usd": res.cost_usd,
-                    "raw": res.raw,
-                }
+            resp_rec = ResponseRecord(
+                case_id=c.id,
+                method=m,
+                model=model,
+                prompt_hash=prompt_hash,
+                text=res.text,
+                input_tokens=res.input_tokens,
+                output_tokens=res.output_tokens,
+                cost_usd=res.cost_usd,
+                raw=res.raw,
             )
-            resp_rec_line = json.dumps(resp_rec, ensure_ascii=False)
 
+            prompt_line = json.dumps(
+                prompt_rec.model_dump(exclude_none=True), ensure_ascii=False
+            )
+            resp_line = json.dumps(
+                resp_rec.model_dump(exclude_none=True), ensure_ascii=False
+            )
             with prompt_file.open("a", encoding="utf-8") as f:
-                f.write(prompt_rec_line + "\n")
+                f.write(prompt_line + "\n")
             with resp_file.open("a", encoding="utf-8") as f:
-                f.write(resp_rec_line + "\n")
+                f.write(resp_line + "\n")
 
             print(
                 f"[cyan]{c.id} {m}[/cyan] tokens(in/out)={res.input_tokens}/{res.output_tokens} cost=${res.cost_usd:.6f}"
@@ -162,60 +292,67 @@
 def eval(case: str = "all", method: str = "all") -> None:
+    """Evaluate the latest responses and write results CSV.
+
+    Args:
+        case: Comma-separated case ids or "all".
+        method: Comma-separated method names or "all".
+    """
     mf = load_manifest(_manifest_path())
     cases = _select_cases(mf.cases, case)
+    if not cases:
+        raise typer.BadParameter(f"No cases matched: {case}")
     methods = _select_methods(method)
 
-    rows: list[dict[str, Any]] = []
+    rows: list[ResultRow] = []
     for c in cases:
-        truth = json.loads(Path(c.truth).read_text(encoding="utf-8"))
+        truth_path = _resolve_case_path(c.truth, case_id=c.id, label="truth")
+        if not truth_path:
+            continue
+        truth = json.loads(truth_path.read_text(encoding="utf-8"))
         resp_file = RESPONSES_DIR / f"{c.id}.jsonl"
         if not resp_file.exists():
            print(f"[yellow]skip: no responses for {c.id}[/yellow]")
            continue
 
-        # 最新の各method結果を採用(同じmethodが複数行ある場合、最後の行が最新)
         latest: dict[str, dict[str, Any]] = {}
         for line in resp_file.read_text(encoding="utf-8").splitlines():
             rec = json.loads(line)
-            if rec["method"] in methods:
+            if rec.get("method") in methods:
                 latest[rec["method"]] = rec
 
         for m, rec in latest.items():
             ok = False
-            pred_obj = None
-            err = None
+            err: str | None = None
             try:
                 pred_obj = normalize_json_text(rec["text"])
                 ok = exact_match(pred_obj, truth)
-            except Exception as e:
-                err = str(e)
+            except Exception as exc:
+                err = str(exc)
 
             rows.append(
-                {
-                    "case_id": c.id,
-                    "type": c.type,
-                    "method": m,
-                    "model": rec.get("model"),
-                    "ok": ok,
-                    "input_tokens": rec.get("input_tokens", 0),
-                    "output_tokens": rec.get("output_tokens", 0),
-                    "cost_usd": rec.get("cost_usd", 0.0),
-                    "error": err,
-                }
+                ResultRow(
+                    case_id=c.id,
+                    type=c.type,
+                    method=m,
+                    model=rec.get("model"),
+                    ok=ok,
+                    input_tokens=int(rec.get("input_tokens", 0)),
+                    output_tokens=int(rec.get("output_tokens", 0)),
+                    cost_usd=float(rec.get("cost_usd", 0.0)),
+                    error=err,
+                )
             )
 
     out_csv = RESULTS_DIR / "results.csv"
-    write_results_csv(rows, out_csv)
+    write_results_csv([row.model_dump() for row in rows], out_csv)
     print(f"[green]Wrote {out_csv} ({len(rows)} rows)[/green]")
 
 
 @app.command()
 def report() -> None:
-    """
-    雑に Markdown レポートを作る(必要なら後で強化)
-    """
+    """Generate a Markdown report from the results CSV."""
     csv_path = RESULTS_DIR / "results.csv"
     if not csv_path.exists():
         raise typer.Exit(code=1)
@@ -223,7 +360,6 @@
     import pandas as pd
 
     df = pd.read_csv(csv_path)
-    # 集計: method別の正解率/平均トークン/平均コスト
     g = (
         df.groupby("method")
         .agg(
@@ -247,9 +383,5 @@
     out_md.write_text("\n".join(md_lines), encoding="utf-8")
     print(f"[green]Wrote {out_md}[/green]")
 
-
-import typer
-
-app = typer.Typer()
-
 if __name__ == "__main__":
     app()
diff --git a/benchmark/src/bench/llm/openai_client.py b/benchmark/src/bench/llm/openai_client.py
index 7ca4f8a..e8e8a32 100644
--- a/benchmark/src/bench/llm/openai_client.py
+++ b/benchmark/src/bench/llm/openai_client.py
@@ -1,19 +1,21 @@
 from __future__ import annotations
 
 import base64
-from dataclasses import dataclass
 import json
 from pathlib import Path
 from typing import Any
 
 from dotenv import load_dotenv
 from openai import OpenAI
+from pydantic import BaseModel
 
+from ..paths import ROOT
 from .pricing import estimate_cost_usd
 
 
-@dataclass
-class LLMResult:
+class LLMResult(BaseModel):
+    """Structured response data from the LLM call."""
+
     text: str
     input_tokens: int
     output_tokens: int
@@ -22,19 +24,54 @@
 
 
 def _png_to_data_url(png_path: Path) -> str:
+    """Encode a PNG image as a data URL.
+
+    Args:
+        png_path: PNG file path.
+
+    Returns:
+        Base64 data URL string.
+    """
     b = png_path.read_bytes()
     b64 = base64.b64encode(b).decode("ascii")
     return f"data:image/png;base64,{b64}"
 
 
+def _extract_usage_tokens(usage: object | None) -> tuple[int, int]:
+    """Extract input/output tokens from the OpenAI usage payload.
+ + Args: + usage: Usage payload from the OpenAI SDK (object or dict). + + Returns: + Tuple of (input_tokens, output_tokens). + """ + if usage is None: + return 0, 0 + if isinstance(usage, dict): + return int(usage.get("input_tokens", 0)), int(usage.get("output_tokens", 0)) + input_tokens = int(getattr(usage, "input_tokens", 0)) + output_tokens = int(getattr(usage, "output_tokens", 0)) + return input_tokens, output_tokens + + class OpenAIResponsesClient: + """Thin wrapper around the OpenAI Responses API for this benchmark.""" + def __init__(self) -> None: - load_dotenv() + load_dotenv(dotenv_path=ROOT / ".env") self.client = OpenAI() def ask_text(self, *, model: str, question: str, context_text: str) -> LLMResult: - """ - Responses API: text-only + """Call Responses API with text-only input. + + Args: + model: OpenAI model name (e.g., "gpt-4o"). + question: User question to answer. + context_text: Extracted context text from the workbook. + + Returns: + LLMResult containing the model output and usage metadata. """ resp = self.client.responses.create( model=model, @@ -54,9 +91,8 @@ def ask_text(self, *, model: str, question: str, context_text: str) -> LLMResult ) text = resp.output_text # SDK helper - usage = getattr(resp, "usage", None) or {} - in_tok = int(usage.get("input_tokens", 0)) - out_tok = int(usage.get("output_tokens", 0)) + usage = getattr(resp, "usage", None) + in_tok, out_tok = _extract_usage_tokens(usage) cost = estimate_cost_usd(model, in_tok, out_tok) raw = json.loads(resp.model_dump_json()) @@ -71,8 +107,15 @@ def ask_text(self, *, model: str, question: str, context_text: str) -> LLMResult def ask_images( self, *, model: str, question: str, image_paths: list[Path] ) -> LLMResult: - """ - Responses API: image + text + """Call Responses API with image + text input. + + Args: + model: OpenAI model name (e.g., "gpt-4o"). + question: User question to answer. + image_paths: PNG image paths to include as vision input. + + Returns: + LLMResult containing the model output and usage metadata. """ content: list[dict[str, Any]] = [ { @@ -90,9 +133,8 @@ def ask_images( ) text = resp.output_text - usage = getattr(resp, "usage", None) or {} - in_tok = int(usage.get("input_tokens", 0)) - out_tok = int(usage.get("output_tokens", 0)) + usage = getattr(resp, "usage", None) + in_tok, out_tok = _extract_usage_tokens(usage) cost = estimate_cost_usd(model, in_tok, out_tok) raw = json.loads(resp.model_dump_json()) diff --git a/benchmark/src/bench/paths.py b/benchmark/src/bench/paths.py index 082db16..c8ba71a 100644 --- a/benchmark/src/bench/paths.py +++ b/benchmark/src/bench/paths.py @@ -12,3 +12,18 @@ PROMPTS_DIR = OUT_DIR / "prompts" RESPONSES_DIR = OUT_DIR / "responses" RESULTS_DIR = OUT_DIR / "results" + + +def resolve_path(path: str | Path) -> Path: + """Resolve a path relative to the benchmark root when needed. + + Args: + path: Path string or Path instance from the manifest. + + Returns: + Resolved Path anchored to the benchmark root when relative. 
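+
+    Example (illustrative doctest, POSIX repr; a relative string such as
+    "data/manifest.json" would instead resolve beneath ROOT):
+        >>> resolve_path("/tmp/book.xlsx")
+        PosixPath('/tmp/book.xlsx')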
+ """ + candidate = Path(path) + if candidate.is_absolute(): + return candidate + return ROOT / candidate diff --git a/benchmark/src/bench/pipeline/exstruct_adapter.py b/benchmark/src/bench/pipeline/exstruct_adapter.py index e69de29..008f02f 100644 --- a/benchmark/src/bench/pipeline/exstruct_adapter.py +++ b/benchmark/src/bench/pipeline/exstruct_adapter.py @@ -0,0 +1,80 @@ +from __future__ import annotations + +import logging +from pathlib import Path + +from pydantic import BaseModel + +from exstruct import ExtractionMode, extract as exstruct_extract +from exstruct.models import SheetData, WorkbookData + +from .common import write_text + +logger = logging.getLogger(__name__) + + +class ExstructTextConfig(BaseModel): + """Configuration for ExStruct text extraction output.""" + + mode: ExtractionMode = "standard" + pretty: bool = False + indent: int | None = None + + +def _filter_workbook_sheets( + workbook: WorkbookData, sheet_scope: list[str] | None +) -> WorkbookData: + """Return a workbook filtered to the requested sheet scope. + + Args: + workbook: Extracted workbook payload from ExStruct. + sheet_scope: Optional list of sheet names to keep. + + Returns: + WorkbookData filtered to the requested sheets, or the original workbook if none match. + """ + if not sheet_scope: + return workbook + sheets: dict[str, SheetData] = { + name: sheet + for name, sheet in workbook.sheets.items() + if name in set(sheet_scope) + } + if not sheets: + logger.warning("No matching sheets found for scope: %s", sheet_scope) + return workbook + return WorkbookData(book_name=workbook.book_name, sheets=sheets) + + +def extract_exstruct( + xlsx_path: Path, + out_txt: Path, + sheet_scope: list[str] | None = None, + *, + config: ExstructTextConfig | None = None, +) -> None: + """Extract workbook with ExStruct and write JSON text for LLM context. + + Args: + xlsx_path: Excel workbook path. + out_txt: Destination text file path. + sheet_scope: Optional list of sheet names to keep. + config: Optional ExStruct text extraction configuration. + """ + resolved_config = config or ExstructTextConfig() + workbook = exstruct_extract(xlsx_path, mode=resolved_config.mode) + workbook = _filter_workbook_sheets(workbook, sheet_scope) + payload = workbook.to_json( + pretty=resolved_config.pretty, indent=resolved_config.indent + ) + + lines = [ + "[DOC_META]", + f"source={xlsx_path.name}", + "method=exstruct", + f"mode={resolved_config.mode}", + "", + "[CONTENT]", + payload, + ] + write_text(out_txt, "\n".join(lines).strip() + "\n") diff --git a/uv.lock b/uv.lock index 843b44e..f5a2867 100644 --- a/uv.lock +++ b/uv.lock @@ -104,6 +104,7 @@ version = "0.1.0" source = { virtual = "benchmark" } dependencies = [ { name = "beautifulsoup4" }, + { name = "exstruct" }, { name = "lxml" }, { name = "openai" }, { name = "openpyxl" }, @@ -112,6 +113,7 @@ dependencies = [ { name = "pymupdf" }, { name = "python-dotenv" }, { name = "rich" }, + { name = "tabulate" }, { name = "typer" }, ] @@ -123,6 +125,7 @@ dev = [ [package.metadata] requires-dist = [ { name = "beautifulsoup4", specifier = ">=4.14.3" }, + { name = "exstruct", editable = "." 
}, { name = "lxml", specifier = ">=6.0.2" }, { name = "openai", specifier = ">=2.15.0" }, { name = "openpyxl", specifier = ">=3.1.5" }, @@ -131,6 +134,7 @@ requires-dist = [ { name = "pymupdf", specifier = ">=1.26.7" }, { name = "python-dotenv", specifier = ">=1.2.1" }, { name = "rich", specifier = ">=14.2.0" }, + { name = "tabulate", specifier = ">=0.9.0" }, { name = "typer", specifier = ">=0.21.1" }, ] @@ -2444,6 +2448,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/81/0d/13d1d239a25cbfb19e740db83143e95c772a1fe10202dda4b76792b114dd/starlette-0.52.1-py3-none-any.whl", hash = "sha256:0029d43eb3d273bc4f83a08720b4912ea4b071087a3b48db01b7c839f7954d74", size = 74272, upload-time = "2026-01-18T13:34:09.188Z" }, ] +[[package]] +name = "tabulate" +version = "0.9.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ec/fe/802052aecb21e3797b8f7902564ab6ea0d60ff8ca23952079064155d1ae1/tabulate-0.9.0.tar.gz", hash = "sha256:0095b12bf5966de529c0feb1fa08671671b3368eec77d7ef7ab114be2c068b3c", size = 81090, upload-time = "2022-10-06T17:21:48.54Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/40/44/4a5f08c96eb108af5cb50b41f76142f0afa346dfa99d5296fe7202a11854/tabulate-0.9.0-py3-none-any.whl", hash = "sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f", size = 35252, upload-time = "2022-10-06T17:21:44.262Z" }, +] + [[package]] name = "taskipy" version = "1.14.1" From ed3d7ead108ec1ae5a416667ad43587302710fca Mon Sep 17 00:00:00 2001 From: harumiWeb Date: Sat, 24 Jan 2026 16:03:55 +0900 Subject: [PATCH 05/38] fix: Correct flowchart ID and file paths in manifest.json --- benchmark/data/manifest.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmark/data/manifest.json b/benchmark/data/manifest.json index 2e34660..58f0d75 100644 --- a/benchmark/data/manifest.json +++ b/benchmark/data/manifest.json @@ -13,11 +13,11 @@ } }, { - "id": "flowchart_image_01", + "id": "flowchart_01", "type": "flowchart", - "xlsx": "data/raw/flowchart_image_01.xlsx", + "xlsx": "data/raw/flowchart_01.xlsx", "question": "このフローチャートの開始から終了までの主要な処理ステップを順番に抽出し、次のJSON形式のみで返してください。\n\n出力形式(厳守):\n{\n \"steps\": [\"step1\", \"step2\", \"step3\", ...]\n}\n\n注意事項:\n- 開始ノードと終了ノードも含めてください\n- 分岐やループがある場合は、代表的な主経路として線形化してください\n- ステップ名は図中のラベル文字列をそのまま使用してください", - "truth": "data/truth/flowchart_image_01.json", + "truth": "data/truth/flowchart_01.json", "sheet_scope": null, "render": { "dpi": 220, From 32bf7713b464c82f83bf338a93724554712b98c3 Mon Sep 17 00:00:00 2001 From: harumiWeb Date: Sat, 24 Jan 2026 16:06:41 +0900 Subject: [PATCH 06/38] feat: Add taskipy as a development dependency and update task definitions in pyproject.toml --- benchmark/pyproject.toml | 9 +++++++++ uv.lock | 6 +++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/benchmark/pyproject.toml b/benchmark/pyproject.toml index ee2dd34..a8043ca 100644 --- a/benchmark/pyproject.toml +++ b/benchmark/pyproject.toml @@ -25,7 +25,16 @@ exbench = "bench.cli:app" [dependency-groups] dev = [ "ruff>=0.14.8", + "taskipy>=1.14.1", ] [tool.uv.sources] exstruct = { workspace = true } + +[tool.taskipy.tasks] +setup = "python -m pip install -U pip && pip install -e .. && pip install -e ." 
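+# Note: `pip install -e ..` assumes the exstruct workspace root is the parent
+# directory of benchmark/, matching the [tool.uv.sources] workspace entry.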
+extract = "exbench extract --case all --method all" +ask = "exbench ask --case all --method all --model gpt-4o" +eval = "exbench eval --case all --method all" +report = "exbench report" +all = "task extract && task ask && task eval && task report" diff --git a/uv.lock b/uv.lock index f5a2867..59ca7df 100644 --- a/uv.lock +++ b/uv.lock @@ -120,6 +120,7 @@ dependencies = [ [package.dev-dependencies] dev = [ { name = "ruff" }, + { name = "taskipy" }, ] [package.metadata] @@ -139,7 +140,10 @@ requires-dist = [ ] [package.metadata.requires-dev] -dev = [{ name = "ruff", specifier = ">=0.14.8" }] +dev = [ + { name = "ruff", specifier = ">=0.14.8" }, + { name = "taskipy", specifier = ">=1.14.1" }, +] [[package]] name = "certifi" From 3d01848b69358a361557547e8557aac08d77fa6d Mon Sep 17 00:00:00 2001 From: harumiWeb Date: Sat, 24 Jan 2026 16:31:31 +0900 Subject: [PATCH 07/38] fix --- benchmark/data/truth/flowchart_01.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmark/data/truth/flowchart_01.json b/benchmark/data/truth/flowchart_01.json index deff5bb..196bc95 100644 --- a/benchmark/data/truth/flowchart_01.json +++ b/benchmark/data/truth/flowchart_01.json @@ -1,6 +1,6 @@ { "steps": [ - "開始", + "S", "要件抽出", "ヒアリング", "非機能要件", @@ -9,6 +9,6 @@ "まとめ", "文書作成", "締結", - "終了" + "E" ] } From c9a14644bdd519adc6d2b17a097ca8cf615558e9 Mon Sep 17 00:00:00 2001 From: harumiWeb Date: Sat, 24 Jan 2026 17:00:35 +0900 Subject: [PATCH 08/38] feat: Update LLM client and CLI to support temperature parameter for model queries --- benchmark/data/truth/ffr_425_01.json | 3 +- benchmark/src/bench/cli.py | 59 ++++++++++++++++++++---- benchmark/src/bench/llm/openai_client.py | 10 +++- 3 files changed, 59 insertions(+), 13 deletions(-) diff --git a/benchmark/data/truth/ffr_425_01.json b/benchmark/data/truth/ffr_425_01.json index e356d9d..81d6ca1 100644 --- a/benchmark/data/truth/ffr_425_01.json +++ b/benchmark/data/truth/ffr_425_01.json @@ -1,10 +1,9 @@ { "item_1_agency": "United States Environmental Protection Agency", - "item_6_report_type": "Quarterly", + "item_6_report_type": null, "item_7_accounting_basis": "Cash", "item_9_reporting_period_end": null, "not_required_by_epa_items": [ - "Federal Cash", "Cash Receipts", "Cash Disbursements", "Cash on Hand" diff --git a/benchmark/src/bench/cli.py b/benchmark/src/bench/cli.py index d0398e1..9618b68 100644 --- a/benchmark/src/bench/cli.py +++ b/benchmark/src/bench/cli.py @@ -42,6 +42,7 @@ class PromptRecord(BaseModel): case_id: str method: str model: str + temperature: float question: str prompt_hash: str images: list[str] | None = None @@ -53,6 +54,7 @@ class ResponseRecord(BaseModel): case_id: str method: str model: str + temperature: float prompt_hash: str text: str input_tokens: int @@ -144,6 +146,29 @@ def _resolve_case_path(path_str: str, *, case_id: str, label: str) -> Path | Non return None +def _reset_case_outputs(case_id: str) -> None: + """Delete existing prompt/response logs for a case.""" + for directory in (PROMPTS_DIR, RESPONSES_DIR): + path = directory / f"{case_id}.jsonl" + if path.exists(): + path.unlink() + + +def _dump_jsonl(obj: BaseModel) -> str: + """Serialize a record for JSONL output. + + Args: + obj: Pydantic model to serialize. + + Returns: + Single-line JSON string with stable key ordering. 
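+
+    Example (illustrative doctest; `_R` is a stand-in model, not part of the
+    CLI, showing that `None` fields are dropped and keys are sorted):
+        >>> class _R(BaseModel):
+        ...     b: int
+        ...     a: str | None = None
+        >>> _dump_jsonl(_R(b=1))
+        '{"b": 1}'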
+ """ + payload = obj.model_dump(exclude_none=True) + return json.dumps( + payload, ensure_ascii=False, sort_keys=True, separators=(", ", ": ") + ) + + @app.command() def extract(case: str = "all", method: str = "all") -> None: """Extract contexts for selected cases and methods. @@ -198,13 +223,19 @@ def extract(case: str = "all", method: str = "all") -> None: @app.command() -def ask(case: str = "all", method: str = "all", model: str = "gpt-4o") -> None: +def ask( + case: str = "all", + method: str = "all", + model: str = "gpt-4o", + temperature: float = 0.0, +) -> None: """Run LLM extraction against prepared contexts. Args: case: Comma-separated case ids or "all". method: Comma-separated method names or "all". model: OpenAI model name. + temperature: Sampling temperature for the model. """ mf = load_manifest(_manifest_path()) cases = _select_cases(mf.cases, case) @@ -219,6 +250,7 @@ def ask(case: str = "all", method: str = "all", model: str = "gpt-4o") -> None: for c in cases: console.rule(f"ASK {c.id}") q = c.question + _reset_case_outputs(c.id) for m in methods: if m == "image_vlm": @@ -239,11 +271,17 @@ def ask(case: str = "all", method: str = "all", model: str = "gpt-4o") -> None: case_id=c.id, method=m, model=model, + temperature=temperature, question=q, prompt_hash=prompt_hash, images=[p.name for p in img_paths], ) - res = client.ask_images(model=model, question=q, image_paths=img_paths) + res = client.ask_images( + model=model, + question=q, + image_paths=img_paths, + temperature=temperature, + ) else: txt_path = EXTRACTED_DIR / m / f"{c.id}.txt" if not txt_path.exists(): @@ -255,10 +293,16 @@ def ask(case: str = "all", method: str = "all", model: str = "gpt-4o") -> None: case_id=c.id, method=m, model=model, + temperature=temperature, question=q, prompt_hash=prompt_hash, ) - res = client.ask_text(model=model, question=q, context_text=context) + res = client.ask_text( + model=model, + question=q, + context_text=context, + temperature=temperature, + ) prompt_file = PROMPTS_DIR / f"{c.id}.jsonl" resp_file = RESPONSES_DIR / f"{c.id}.jsonl" @@ -266,6 +310,7 @@ def ask(case: str = "all", method: str = "all", model: str = "gpt-4o") -> None: case_id=c.id, method=m, model=model, + temperature=temperature, prompt_hash=prompt_hash, text=res.text, input_tokens=res.input_tokens, @@ -274,12 +319,8 @@ def ask(case: str = "all", method: str = "all", model: str = "gpt-4o") -> None: raw=res.raw, ) - prompt_line = json.dumps( - prompt_rec.model_dump(exclude_none=True), ensure_ascii=False - ) - resp_line = json.dumps( - resp_rec.model_dump(exclude_none=True), ensure_ascii=False - ) + prompt_line = _dump_jsonl(prompt_rec) + resp_line = _dump_jsonl(resp_rec) with prompt_file.open("a", encoding="utf-8") as f: f.write(prompt_line + "\n") with resp_file.open("a", encoding="utf-8") as f: diff --git a/benchmark/src/bench/llm/openai_client.py b/benchmark/src/bench/llm/openai_client.py index e8e8a32..a44ce25 100644 --- a/benchmark/src/bench/llm/openai_client.py +++ b/benchmark/src/bench/llm/openai_client.py @@ -62,19 +62,23 @@ def __init__(self) -> None: load_dotenv(dotenv_path=ROOT / ".env") self.client = OpenAI() - def ask_text(self, *, model: str, question: str, context_text: str) -> LLMResult: + def ask_text( + self, *, model: str, question: str, context_text: str, temperature: float + ) -> LLMResult: """Call Responses API with text-only input. Args: model: OpenAI model name (e.g., "gpt-4o"). question: User question to answer. context_text: Extracted context text from the workbook. 
+ temperature: Sampling temperature for the response. Returns: LLMResult containing the model output and usage metadata. """ resp = self.client.responses.create( model=model, + temperature=temperature, input=[ { "role": "user", @@ -105,7 +109,7 @@ def ask_text(self, *, model: str, question: str, context_text: str) -> LLMResult ) def ask_images( - self, *, model: str, question: str, image_paths: list[Path] + self, *, model: str, question: str, image_paths: list[Path], temperature: float ) -> LLMResult: """Call Responses API with image + text input. @@ -113,6 +117,7 @@ def ask_images( model: OpenAI model name (e.g., "gpt-4o"). question: User question to answer. image_paths: PNG image paths to include as vision input. + temperature: Sampling temperature for the response. Returns: LLMResult containing the model output and usage metadata. @@ -129,6 +134,7 @@ def ask_images( resp = self.client.responses.create( model=model, + temperature=temperature, input=[{"role": "user", "content": content}], ) From ab8a2a35896157bbb71c297649a113a67df54b59 Mon Sep 17 00:00:00 2001 From: harumiWeb Date: Sat, 24 Jan 2026 17:22:24 +0900 Subject: [PATCH 09/38] feat: Update manifest and truth files for improved data extraction; add certificate of employment JSON structure --- benchmark/data/manifest.json | 19 ++- .../truth/certificate_of_employment_01.json | 58 ++++++++ benchmark/data/truth/ffr_425_01.json | 23 ++-- benchmark/src/bench/cli.py | 32 +++-- benchmark/src/bench/eval/score.py | 127 ++++++++++++++++++ 5 files changed, 234 insertions(+), 25 deletions(-) create mode 100644 benchmark/data/truth/certificate_of_employment_01.json create mode 100644 benchmark/src/bench/eval/score.py diff --git a/benchmark/data/manifest.json b/benchmark/data/manifest.json index 58f0d75..351c999 100644 --- a/benchmark/data/manifest.json +++ b/benchmark/data/manifest.json @@ -4,13 +4,10 @@ "id": "ffr_425_01", "type": "application_form", "xlsx": "data/raw/ffr_425_01.xlsx", - "question": "このExcel申請書(Federal Financial Report / SF-425)から、次の情報を抽出し、JSONのみで返してください。\n\n抽出項目:\n1. item_1_agency: 「1. Federal Agency and Organizational Element」に記載されている提出先機関名\n2. item_6_report_type: チェックされている Report Type(Quarterly / Semi-Annual / Annual / Final のいずれか)\n3. item_7_accounting_basis: Basis of Accounting(Cash または Accrual)\n4. item_9_reporting_period_end: Reporting Period End Date(YYYY-MM-DD形式、未記入の場合は null)\n5. 
not_required_by_epa_items: \"Not Required by EPA\" と明示されている取引項目の一覧(項目名の配列)\n\n出力形式(厳守):\n{\n \"item_1_agency\": \"...\",\n \"item_6_report_type\": \"...\",\n \"item_7_accounting_basis\": \"...\",\n \"item_9_reporting_period_end\": \"YYYY-MM-DD\" | null,\n \"not_required_by_epa_items\": [\"...\", \"...\"]\n}", + "question": "このExcel帳票(Federal Financial Report / SF-425)について、次の情報を抽出し、JSONのみで返してください。\n\n(1) checkbox_groups: チェックボックスのグループ名と、その選択肢ラベル一覧を抽出してください(\"Report Type\" と \"Basis of Accounting\" の2グループのみ)。\n(2) not_required_by_epa_scope: 赤字の注記 \"Not Required by EPA\" がかかっているセクション名を返してください(例: \"Federal Cash\")。\n(3) section_headers: 帳票上部の番号付きセクション見出し(1〜9)のうち、見出しテキストのみを配列で返してください(例: \"Federal Agency and Organizational Element to Which Report is Submitted\" など)。\n\n出力形式(厳守):\n{\n \"checkbox_groups\": {\n \"Report Type\": [\"Quarterly\", \"Semi-Annual\", \"Annual\", \"Final\"],\n \"Basis of Accounting\": [\"Cash\", \"Accrual\"]\n },\n \"not_required_by_epa_scope\": \"...\",\n \"section_headers\": [\"...\", \"...\", \"...\"]\n}\n\n注意:\n- チェックボックスの記号(□など)は含めないでください。ラベル文字列のみを返してください。\n- section_headers は表示順(上から左→右)で返してください。\n- 余分な説明文やコードフェンスは付けず、JSONのみを返してください。", "truth": "data/truth/ffr_425_01.json", "sheet_scope": null, - "render": { - "dpi": 220, - "max_pages": 2 - } + "render": { "dpi": 220, "max_pages": 2 } }, { "id": "flowchart_01", @@ -32,6 +29,18 @@ "truth": "data/truth/gantt_01.json", "sheet_scope": null, "render": { "dpi": 200, "max_pages": 4 } + }, + { + "id": "certificate_of_employment_01", + "type": "application_form", + "xlsx": "data/raw/certificate_of_employment_01.xlsx", + "question": "このExcel帳票(就労証明書)について、次の3点を抽出し、JSONのみで返してください。\n\n(1) checkbox_groups: 以下の3つのチェックボックス項目について、それぞれの選択肢ラベルを抽出してください。\n - 業種\n - 雇用の形態\n - 雇用(予定)期間等(無期 / 有期)\n\n(2) numbered_sections: 帳票の「No.」列に対応する番号付き項目の見出し(1〜14)を、番号をキーとして抽出してください。\n\n(3) warning_text: 赤字で記載されている注意文を、そのまま1つの文字列として抽出してください。\n\n出力形式(厳守):\n{\n \"checkbox_groups\": {\n \"業種\": [\"...\", \"...\"],\n \"雇用の形態\": [\"...\", \"...\"],\n \"雇用(予定)期間等\": [\"...\", \"...\"]\n },\n \"numbered_sections\": {\n \"1\": \"...\",\n \"2\": \"...\",\n \"3\": \"...\"\n },\n \"warning_text\": \"...\"\n}\n\n注意:\n- チェックボックス記号(□など)は含めず、ラベル文字列のみを返してください。\n- numbered_sections は 1〜14 すべてを含めてください。\n- 余分な説明文やコードフェンスは付けず、JSONのみを返してください。", + "truth": "data/truth/certificate_of_employment_01.json", + "sheet_scope": null, + "render": { + "dpi": 220, + "max_pages": 1 + } } ] } diff --git a/benchmark/data/truth/certificate_of_employment_01.json b/benchmark/data/truth/certificate_of_employment_01.json new file mode 100644 index 0000000..9dc4cd3 --- /dev/null +++ b/benchmark/data/truth/certificate_of_employment_01.json @@ -0,0 +1,58 @@ +{ + "checkbox_groups": { + "業種": [ + "農業・林業", + "漁業", + "鉱業・採石業・砂利採取業", + "建設業", + "製造業", + "電気・ガス・熱供給・水道業", + "情報通信業", + "運輸業・郵便業", + "卸売業・小売業", + "金融業・保険業", + "不動産業・物品賃貸業", + "学術研究・専門・技術サービス", + "宿泊業・飲食サービス業", + "生活関連サービス業・娯楽業", + "医療・福祉", + "教育・学習支援業", + "複合サービス事業", + "公務", + "その他" + ], + "雇用の形態": [ + "正社員", + "パート・アルバイト", + "派遣社員", + "契約社員", + "会計年度任用職員", + "非常勤・臨時職員", + "役員", + "自営業主", + "自営業専従者", + "家族従業者", + "内職", + "業務委託", + "その他" + ], + "雇用(予定)期間等": ["無期", "有期"] + }, + "numbered_sections": { + "1": "業種", + "2": "本人氏名", + "3": "雇用(予定)期間等", + "4": "本人就労先事業所", + "5": "雇用の形態", + "6": "就労時間(固定就労の場合)", + "7": "就労時間(変則就労の場合)", + "8": "就労実績", + "9": "産前・産後休業の取得", + "10": "育児休業の取得", + "11": "産休・育休以外の休業の取得", + "12": "復職(予定)年月日", + "13": "育児のための短時間勤務制度利用有無", + "14": "保育士等としての勤務実態の有無" + }, + "warning_text": 
"※本証明書の内容について、就労先事業者等に無断で作成又は改変を行ったときは、刑法上の罪に問われる場合があります。" +} diff --git a/benchmark/data/truth/ffr_425_01.json b/benchmark/data/truth/ffr_425_01.json index 81d6ca1..a53b43d 100644 --- a/benchmark/data/truth/ffr_425_01.json +++ b/benchmark/data/truth/ffr_425_01.json @@ -1,11 +1,18 @@ { - "item_1_agency": "United States Environmental Protection Agency", - "item_6_report_type": null, - "item_7_accounting_basis": "Cash", - "item_9_reporting_period_end": null, - "not_required_by_epa_items": [ - "Cash Receipts", - "Cash Disbursements", - "Cash on Hand" + "checkbox_groups": { + "Report Type": ["Quarterly", "Semi-Annual", "Annual", "Final"], + "Basis of Accounting": ["Cash", "Accrual"] + }, + "not_required_by_epa_scope": "Federal Cash", + "section_headers": [ + "Federal Agency and Organizational Element to Which Report is Submitted", + "Federal Grant or Other Identifying Number Assigned by Federal Agency", + "Recipient Organization (Name and complete address including Zip code)", + "DUNS Number", + "Recipient Account Number or Identifying Number", + "Report Type", + "Basis of Accounting", + "Project/Grant Period", + "Reporting Period End Date" ] } diff --git a/benchmark/src/bench/cli.py b/benchmark/src/bench/cli.py index 9618b68..c69441d 100644 --- a/benchmark/src/bench/cli.py +++ b/benchmark/src/bench/cli.py @@ -9,9 +9,9 @@ from rich.console import Console import typer -from .eval.exact_match import exact_match from .eval.normalize import normalize_json_text from .eval.report import write_results_csv +from .eval.score import key_score, key_score_ordered from .llm.openai_client import OpenAIResponsesClient from .manifest import Case, load_manifest from .paths import ( @@ -70,6 +70,8 @@ class ResultRow(BaseModel): type: str method: str model: str | None + score: float + score_ordered: float ok: bool input_tokens: int output_tokens: int @@ -365,10 +367,14 @@ def eval(case: str = "all", method: str = "all") -> None: for m, rec in latest.items(): ok = False + score = 0.0 + score_ordered = 0.0 err: str | None = None try: pred_obj = normalize_json_text(rec["text"]) - ok = exact_match(pred_obj, truth) + score = key_score(truth, pred_obj) + score_ordered = key_score_ordered(truth, pred_obj) + ok = score == 1.0 except Exception as exc: err = str(exc) @@ -378,6 +384,8 @@ def eval(case: str = "all", method: str = "all") -> None: type=c.type, method=m, model=rec.get("model"), + score=score, + score_ordered=score_ordered, ok=ok, input_tokens=int(rec.get("input_tokens", 0)), output_tokens=int(rec.get("output_tokens", 0)), @@ -401,16 +409,16 @@ def report() -> None: import pandas as pd df = pd.read_csv(csv_path) - g = ( - df.groupby("method") - .agg( - acc=("ok", "mean"), - avg_in=("input_tokens", "mean"), - avg_cost=("cost_usd", "mean"), - n=("ok", "count"), - ) - .reset_index() - ) + score_col = "score" if "score" in df.columns else "ok" + agg: dict[str, tuple[str, str]] = { + "acc": (score_col, "mean"), + "avg_in": ("input_tokens", "mean"), + "avg_cost": ("cost_usd", "mean"), + "n": (score_col, "count"), + } + if "score_ordered" in df.columns: + agg["acc_ordered"] = ("score_ordered", "mean") + g = df.groupby("method").agg(**agg).reset_index() md_lines = [] md_lines.append("# Benchmark Report") diff --git a/benchmark/src/bench/eval/score.py b/benchmark/src/bench/eval/score.py new file mode 100644 index 0000000..5525670 --- /dev/null +++ b/benchmark/src/bench/eval/score.py @@ -0,0 +1,127 @@ +from __future__ import annotations + +from typing import Any + +from .exact_match import exact_match + + +def 
_list_score(truth_list: list[Any], pred_list: Any) -> float: + """Compute a partial match score for lists. + + Args: + truth_list: Ground-truth list. + pred_list: Predicted list. + + Returns: + Fraction of truth elements present in prediction (order-insensitive). + """ + if not isinstance(pred_list, list): + return 0.0 + if not truth_list: + return 0.0 + # Use exact match on elements; ignore order and duplicates. + truth_set = {_normalize_scalar(v) for v in truth_list} + pred_set = {_normalize_scalar(v) for v in pred_list} + if not truth_set: + return 0.0 + return len(truth_set & pred_set) / len(truth_set) + + +def _normalize_scalar(value: Any) -> str: + """Normalize scalar values for set comparison.""" + if value is None: + return "null" + return str(value).strip() + + +def _list_score_ordered(truth_list: list[Any], pred_list: Any) -> float: + """Compute an order-aware partial match score for lists. + + Args: + truth_list: Ground-truth list. + pred_list: Predicted list. + + Returns: + LCS-based fraction of truth elements matched in order. + """ + if not isinstance(pred_list, list): + return 0.0 + if not truth_list: + return 0.0 + truth_norm = [_normalize_scalar(v) for v in truth_list] + pred_norm = [_normalize_scalar(v) for v in pred_list] + lcs_len = _lcs_length(truth_norm, pred_norm) + return lcs_len / len(truth_norm) + + +def _lcs_length(a: list[str], b: list[str]) -> int: + """Compute the length of the longest common subsequence.""" + if not a or not b: + return 0 + dp = [0] * (len(b) + 1) + for i in range(1, len(a) + 1): + prev = 0 + for j in range(1, len(b) + 1): + temp = dp[j] + if a[i - 1] == b[j - 1]: + dp[j] = prev + 1 + else: + dp[j] = max(dp[j], dp[j - 1]) + prev = temp + return dp[-1] + + +def key_score(truth: Any, pred: Any) -> float: + """Compute a key-level score against the truth payload. + + Args: + truth: Ground-truth JSON payload. + pred: Predicted JSON payload. + + Returns: + Score in [0, 1]. For dict payloads, this is the fraction of truth keys + that exactly match in the prediction. For non-dict payloads, this is + 1.0 if exactly equal, else 0.0. 
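+
+    Example (illustrative doctest; assumes exact_match treats equal scalars
+    as a match, so one matched key plus a half-matched list yields 0.75):
+        >>> key_score({"a": 1, "b": [1, 2]}, {"a": 1, "b": [2]})
+        0.75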
+ """ + if isinstance(truth, dict): + total = len(truth) + if total == 0: + return 0.0 + if not isinstance(pred, dict): + return 0.0 + score_sum = 0.0 + for key, truth_val in truth.items(): + if key not in pred: + continue + pred_val = pred[key] + if isinstance(truth_val, list): + score_sum += _list_score(truth_val, pred_val) + continue + score_sum += 1.0 if exact_match(truth_val, pred_val) else 0.0 + return score_sum / total + if isinstance(truth, list): + return _list_score(truth, pred) + return 1.0 if exact_match(truth, pred) else 0.0 + + +def key_score_ordered(truth: Any, pred: Any) -> float: + """Compute a key-level score that respects list order.""" + if isinstance(truth, dict): + total = len(truth) + if total == 0: + return 0.0 + if not isinstance(pred, dict): + return 0.0 + score_sum = 0.0 + for key, truth_val in truth.items(): + if key not in pred: + continue + pred_val = pred[key] + if isinstance(truth_val, list): + score_sum += _list_score_ordered(truth_val, pred_val) + continue + score_sum += 1.0 if exact_match(truth_val, pred_val) else 0.0 + return score_sum / total + if isinstance(truth, list): + return _list_score_ordered(truth, pred) + return 1.0 if exact_match(truth, pred) else 0.0 From a40b4b18438197afbbdca5fe95844b75018925c4 Mon Sep 17 00:00:00 2001 From: harumiWeb Date: Sat, 24 Jan 2026 17:25:45 +0900 Subject: [PATCH 10/38] feat: Add tax report case to manifest and corresponding truth data --- benchmark/data/manifest.json | 9 +++++++ benchmark/data/truth/tax_report_01.json | 33 +++++++++++++++++++++++++ benchmark/src/bench/cli.py | 2 ++ 3 files changed, 44 insertions(+) create mode 100644 benchmark/data/truth/tax_report_01.json diff --git a/benchmark/data/manifest.json b/benchmark/data/manifest.json index 351c999..3ae38db 100644 --- a/benchmark/data/manifest.json +++ b/benchmark/data/manifest.json @@ -41,6 +41,15 @@ "dpi": 220, "max_pages": 1 } + }, + { + "id": "tax_report_01", + "type": "application_form", + "xlsx": "data/raw/tax_report_01.xlsx", + "question": "このExcel帳票(令和○年度分 市民税・県民税申告書)について、次の3点を抽出し、JSONのみで返してください。\n\n(1) income_section_labels: 右側の「1 収入金額等」セクションに並んでいる収入項目のラベルを、上から順に配列で返してください。\n\n(2) deduction_section_labels: 右下の「4 所得から差し引かれる金額」セクションに並んでいる控除項目のラベルを、上から順に配列で返してください。\n\n(3) required_documents_note: 右端に縦書きで記載されている注意文のうち、「源泉徴収票、控除証明書などの必要書類(コピー可)は…」で始まる文を、そのまま1つの文字列として返してください。\n\n出力形式(厳守):\n{\n \"income_section_labels\": [\"...\", \"...\"],\n \"deduction_section_labels\": [\"...\", \"...\"],\n \"required_documents_note\": \"...\"\n}\n\n注意:\n- チェックボックス記号(□など)は含めないでください。\n- ラベルは帳票に印字されている見たままの文字列(空白や中黒を含む場合はそのまま)にしてください。\n- 余分な説明文やコードフェンスは付けず、JSONのみを返してください。", + "truth": "data/truth/tax_report_01.json", + "sheet_scope": null, + "render": { "dpi": 220, "max_pages": 1 } } ] } diff --git a/benchmark/data/truth/tax_report_01.json b/benchmark/data/truth/tax_report_01.json new file mode 100644 index 0000000..3ff18ba --- /dev/null +++ b/benchmark/data/truth/tax_report_01.json @@ -0,0 +1,33 @@ +{ + "income_section_labels": [ + "事 業", + "農 業", + "不動産", + "利 子", + "配 当", + "給 与", + "公的年金等", + "業 務", + "そ の 他", + "合計(①+⑧+⑨)", + "総 合 譲 渡・一 時", + "合 計" + ], + "deduction_section_labels": [ + "社会保険料控除", + "小規模企業共済等掛金控除", + "生命保険料控除", + "地震保険料控除", + "寡婦、ひとり親控除", + "勤労学生、障害者控除", + "配偶者(特別)控除", + "扶養控除", + "特定親族特別控除", + "基礎控除", + "⑬から㉕までの計", + "雑損控除", + "医療費控除", + "合計(⑳+㉗+㉘)" + ], + "required_documents_note": "源泉徴収票、控除証明書などの必要書類(コピー可)は、この申告書には貼り付けずに共に提出してください。" +} diff --git a/benchmark/src/bench/cli.py b/benchmark/src/bench/cli.py index c69441d..b8eb9cf 100644 --- 
a/benchmark/src/bench/cli.py +++ b/benchmark/src/bench/cli.py @@ -430,6 +430,8 @@ def report() -> None: out_md = RESULTS_DIR / "report.md" out_md.write_text("\n".join(md_lines), encoding="utf-8") print(f"[green]Wrote {out_md}[/green]") + print("[cyan]Summary (from report.md)[/cyan]") + print(g.to_markdown(index=False)) if __name__ == "__main__": From a16086ae4d7f16300dbfc71ce2ee8208d46c8bdb Mon Sep 17 00:00:00 2001 From: harumiWeb Date: Sat, 24 Jan 2026 17:40:55 +0900 Subject: [PATCH 11/38] feat: Enhance scoring functions with normalization and support for nested structures --- benchmark/src/bench/eval/score.py | 98 +++++++++++++++++++++---------- 1 file changed, 66 insertions(+), 32 deletions(-) diff --git a/benchmark/src/bench/eval/score.py b/benchmark/src/bench/eval/score.py index 5525670..3a6b407 100644 --- a/benchmark/src/bench/eval/score.py +++ b/benchmark/src/bench/eval/score.py @@ -1,8 +1,10 @@ from __future__ import annotations +import re +import unicodedata from typing import Any -from .exact_match import exact_match +from .exact_match import canonical, exact_match def _list_score(truth_list: list[Any], pred_list: Any) -> float: @@ -20,8 +22,8 @@ def _list_score(truth_list: list[Any], pred_list: Any) -> float: if not truth_list: return 0.0 # Use exact match on elements; ignore order and duplicates. - truth_set = {_normalize_scalar(v) for v in truth_list} - pred_set = {_normalize_scalar(v) for v in pred_list} + truth_set = {_normalize_element(v) for v in truth_list} + pred_set = {_normalize_element(v) for v in pred_list} if not truth_set: return 0.0 return len(truth_set & pred_set) / len(truth_set) @@ -31,7 +33,24 @@ def _normalize_scalar(value: Any) -> str: """Normalize scalar values for set comparison.""" if value is None: return "null" - return str(value).strip() + text = str(value) + text = _strip_circled_numbers(text) + text = unicodedata.normalize("NFKC", text) + text = text.replace("※", "") + text = re.sub(r"\s+", " ", text).strip() + return text + + +def _strip_circled_numbers(text: str) -> str: + """Remove circled-number characters (e.g., ①②) for robust matching.""" + return "".join(ch for ch in text if unicodedata.category(ch) != "No") + + +def _normalize_element(value: Any) -> str: + """Normalize list elements for comparison.""" + if isinstance(value, (dict, list)): + return canonical(value) + return _normalize_scalar(value) def _list_score_ordered(truth_list: list[Any], pred_list: Any) -> float: @@ -48,8 +67,8 @@ def _list_score_ordered(truth_list: list[Any], pred_list: Any) -> float: return 0.0 if not truth_list: return 0.0 - truth_norm = [_normalize_scalar(v) for v in truth_list] - pred_norm = [_normalize_scalar(v) for v in pred_list] + truth_norm = [_normalize_element(v) for v in truth_list] + pred_norm = [_normalize_element(v) for v in pred_list] lcs_len = _lcs_length(truth_norm, pred_norm) return lcs_len / len(truth_norm) @@ -71,6 +90,45 @@ def _lcs_length(a: list[str], b: list[str]) -> int: return dp[-1] +def _dict_score(truth_dict: dict[str, Any], pred_dict: dict[str, Any]) -> float: + """Compute a key-level score for nested dicts (order-insensitive lists).""" + total = len(truth_dict) + if total == 0: + return 0.0 + score_sum = 0.0 + for key, truth_val in truth_dict.items(): + if key not in pred_dict: + continue + pred_val = pred_dict[key] + score_sum += _value_score(truth_val, pred_val, ordered=False) + return score_sum / total + + +def _dict_score_ordered(truth_dict: dict[str, Any], pred_dict: dict[str, Any]) -> float: + """Compute a key-level score for 
nested dicts (order-aware lists).""" + total = len(truth_dict) + if total == 0: + return 0.0 + score_sum = 0.0 + for key, truth_val in truth_dict.items(): + if key not in pred_dict: + continue + pred_val = pred_dict[key] + score_sum += _value_score(truth_val, pred_val, ordered=True) + return score_sum / total + + +def _value_score(truth: Any, pred: Any, *, ordered: bool) -> float: + """Score a value with optional list ordering.""" + if isinstance(truth, dict): + if not isinstance(pred, dict): + return 0.0 + return _dict_score_ordered(truth, pred) if ordered else _dict_score(truth, pred) + if isinstance(truth, list): + return _list_score_ordered(truth, pred) if ordered else _list_score(truth, pred) + return 1.0 if exact_match(truth, pred) else 0.0 + + def key_score(truth: Any, pred: Any) -> float: """Compute a key-level score against the truth payload. @@ -84,21 +142,9 @@ def key_score(truth: Any, pred: Any) -> float: 1.0 if exactly equal, else 0.0. """ if isinstance(truth, dict): - total = len(truth) - if total == 0: - return 0.0 if not isinstance(pred, dict): return 0.0 - score_sum = 0.0 - for key, truth_val in truth.items(): - if key not in pred: - continue - pred_val = pred[key] - if isinstance(truth_val, list): - score_sum += _list_score(truth_val, pred_val) - continue - score_sum += 1.0 if exact_match(truth_val, pred_val) else 0.0 - return score_sum / total + return _dict_score(truth, pred) if isinstance(truth, list): return _list_score(truth, pred) return 1.0 if exact_match(truth, pred) else 0.0 @@ -107,21 +153,9 @@ def key_score(truth: Any, pred: Any) -> float: def key_score_ordered(truth: Any, pred: Any) -> float: """Compute a key-level score that respects list order.""" if isinstance(truth, dict): - total = len(truth) - if total == 0: - return 0.0 if not isinstance(pred, dict): return 0.0 - score_sum = 0.0 - for key, truth_val in truth.items(): - if key not in pred: - continue - pred_val = pred[key] - if isinstance(truth_val, list): - score_sum += _list_score_ordered(truth_val, pred_val) - continue - score_sum += 1.0 if exact_match(truth_val, pred_val) else 0.0 - return score_sum / total + return _dict_score_ordered(truth, pred) if isinstance(truth, list): return _list_score_ordered(truth, pred) return 1.0 if exact_match(truth, pred) else 0.0 From a061e57fbb94b1feffbb2fa657a2996c0ac0cd18 Mon Sep 17 00:00:00 2001 From: harumiWeb Date: Sat, 24 Jan 2026 17:41:52 +0900 Subject: [PATCH 12/38] feat: Add SmartArt organization chart case to manifest with corresponding truth data --- benchmark/data/manifest.json | 12 ++++++++++++ benchmark/data/truth/smartart_01.json | 9 +++++++++ 2 files changed, 21 insertions(+) create mode 100644 benchmark/data/truth/smartart_01.json diff --git a/benchmark/data/manifest.json b/benchmark/data/manifest.json index 3ae38db..2427336 100644 --- a/benchmark/data/manifest.json +++ b/benchmark/data/manifest.json @@ -50,6 +50,18 @@ "truth": "data/truth/tax_report_01.json", "sheet_scope": null, "render": { "dpi": 220, "max_pages": 1 } + }, + { + "id": "smartart_01", + "type": "organization_chart", + "xlsx": "data/raw/smartart_01.xlsx", + "question": "このExcel帳票(SmartArtで作成された組織図)について、次の3点を抽出し、JSONのみで返してください。\n\n(1) top_structure: 最上位から第2階層までの組織構造を、親子関係が分かる形で抽出してください。\n\n(2) sales_departments: 「営業部」の直下にある課の名称を、上から順に配列で返してください。\n\n(3) production_sites: 「生産部」の直下にある工場名を、上から順に配列で返してください。\n\n出力形式(厳守):\n{\n \"top_structure\": {\n \"取締役会\": {\n \"社長\": [\"...\"]\n }\n },\n \"sales_departments\": [\"...\", \"...\"],\n \"production_sites\": [\"...\", \"...\"]\n}\n\n注意:\n- 
図形の色や配置座標は含めないでください。テキスト内容と階層関係のみを対象とします。\n- 余分な説明文やコードフェンスは付けず、JSONのみを返してください。", + "truth": "data/truth/smartart_01.json", + "sheet_scope": null, + "render": { + "dpi": 220, + "max_pages": 1 + } } ] } diff --git a/benchmark/data/truth/smartart_01.json b/benchmark/data/truth/smartart_01.json new file mode 100644 index 0000000..2a22af1 --- /dev/null +++ b/benchmark/data/truth/smartart_01.json @@ -0,0 +1,9 @@ +{ + "top_structure": { + "取締役会": { + "社長": ["企画管理部", "営業部", "開発部", "技術部", "生産部", "総務部"] + } + }, + "sales_departments": ["第1営業課", "第2営業課", "第3営業課", "海外営業課"], + "production_sites": ["愛知工場", "山形工場", "高知工場"] +} From 522bb904e451544d44e6cec214af0becd2f6bf94 Mon Sep 17 00:00:00 2001 From: harumiWeb Date: Sat, 24 Jan 2026 17:53:39 +0900 Subject: [PATCH 13/38] feat: Refactor extraction process to use ExStructEngine for improved functionality --- benchmark/src/bench/pipeline/exstruct_adapter.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/benchmark/src/bench/pipeline/exstruct_adapter.py b/benchmark/src/bench/pipeline/exstruct_adapter.py index 008f02f..3edccf3 100644 --- a/benchmark/src/bench/pipeline/exstruct_adapter.py +++ b/benchmark/src/bench/pipeline/exstruct_adapter.py @@ -5,7 +5,11 @@ from pydantic import BaseModel -from exstruct import ExtractionMode, extract as exstruct_extract +from exstruct import ( + ExtractionMode, + ExStructEngine, + StructOptions, +) from exstruct.models import SheetData, WorkbookData from .common import write_text @@ -62,7 +66,8 @@ def extract_exstruct( config: Optional ExStruct text extraction configuration. """ resolved_config = config or ExstructTextConfig() - workbook = exstruct_extract(xlsx_path, mode=resolved_config.mode) + engine = ExStructEngine(options=StructOptions(include_merged_values_in_rows=False)) + workbook = engine.extract(xlsx_path, mode=resolved_config.mode) workbook = _filter_workbook_sheets(workbook, sheet_scope) payload = workbook.to_json( pretty=resolved_config.pretty, indent=resolved_config.indent From 1d466a7ac30d24a38b018f41de236078388d2bd8 Mon Sep 17 00:00:00 2001 From: harumiWeb Date: Sat, 24 Jan 2026 18:04:32 +0900 Subject: [PATCH 14/38] feat: Add basic document case to manifest with corresponding truth data --- benchmark/data/manifest.json | 12 ++++++++++++ benchmark/data/truth/basic_01.json | 31 ++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+) create mode 100644 benchmark/data/truth/basic_01.json diff --git a/benchmark/data/manifest.json b/benchmark/data/manifest.json index 2427336..60f40df 100644 --- a/benchmark/data/manifest.json +++ b/benchmark/data/manifest.json @@ -62,6 +62,18 @@ "dpi": 220, "max_pages": 1 } + }, + { + "id": "basic_01", + "type": "mixed_document", + "xlsx": "data/raw/basic_01.xlsx", + "question": "このExcel帳票について、次の3点を抽出し、JSONのみで返してください。\n\n(1) sales_table: 左上の売上表について、月をキーとして各製品の数値を抽出してください。\n\n(2) chart_series: 右上の折れ線グラフに含まれる系列名を、凡例の表示順で配列として返してください。\n\n(3) flowchart_paths: 下部のフローチャートについて、開始から終了までの処理パスを条件付きで2通り抽出してください。\n - format_valid = true の場合の処理パス\n - format_valid = false の場合の処理パス\n\n出力形式(厳守):\n{\n \"sales_table\": {\n \"Jan-25\": {\"製品A\": 0, \"製品B\": 0, \"製品C\": 0},\n \"Feb-25\": {\"製品A\": 0, \"製品B\": 0, \"製品C\": 0}\n },\n \"chart_series\": [\"...\", \"...\"],\n \"flowchart_paths\": {\n \"format_valid_true\": [\"...\", \"...\"],\n \"format_valid_false\": [\"...\", \"...\"]\n }\n}\n\n注意:\n- 数値は整数で返してください。\n- フローチャートのパスは、図形内の文言をそのまま順番に並べてください。\n- 余分な説明文やコードフェンスは付けず、JSONのみを返してください。", + "truth": "data/truth/basic_01.json", + "sheet_scope": null, + "render": { + 
"dpi": 220, + "max_pages": 1 + } } ] } diff --git a/benchmark/data/truth/basic_01.json b/benchmark/data/truth/basic_01.json new file mode 100644 index 0000000..53bca60 --- /dev/null +++ b/benchmark/data/truth/basic_01.json @@ -0,0 +1,31 @@ +{ + "sales_table": { + "Jan-25": { "製品A": 120, "製品B": 80, "製品C": 60 }, + "Feb-25": { "製品A": 135, "製品B": 90, "製品C": 64 }, + "Mar-25": { "製品A": 150, "製品B": 100, "製品C": 70 }, + "Apr-25": { "製品A": 170, "製品B": 110, "製品C": 72 }, + "May-25": { "製品A": 160, "製品B": 120, "製品C": 75 }, + "Jun-25": { "製品A": 180, "製品B": 130, "製品C": 80 } + }, + "chart_series": ["製品A", "製品B", "製品C"], + "flowchart_paths": { + "format_valid_true": [ + "開始", + "入力データ読み込み", + "形式は正しい?", + "1件処理", + "残件あり?", + "出力を生成", + "メール送信?", + "メール送信", + "終了" + ], + "format_valid_false": [ + "開始", + "入力データ読み込み", + "形式は正しい?", + "エラー表示", + "終了" + ] + } +} From 183b81e5835e75d5cbc0eb35610da74c2e183448 Mon Sep 17 00:00:00 2001 From: harumiWeb Date: Sat, 24 Jan 2026 18:06:37 +0900 Subject: [PATCH 15/38] feat: Add total cost and call count tracking to ask function --- benchmark/src/bench/cli.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/benchmark/src/bench/cli.py b/benchmark/src/bench/cli.py index b8eb9cf..c32187d 100644 --- a/benchmark/src/bench/cli.py +++ b/benchmark/src/bench/cli.py @@ -248,6 +248,8 @@ def ask( client = OpenAIResponsesClient() ensure_dir(PROMPTS_DIR) ensure_dir(RESPONSES_DIR) + total_cost = 0.0 + total_calls = 0 for c in cases: console.rule(f"ASK {c.id}") @@ -328,9 +330,12 @@ def ask( with resp_file.open("a", encoding="utf-8") as f: f.write(resp_line + "\n") + total_cost += res.cost_usd + total_calls += 1 print( f"[cyan]{c.id} {m}[/cyan] tokens(in/out)={res.input_tokens}/{res.output_tokens} cost=${res.cost_usd:.6f}" ) + print(f"[green]Total cost: ${total_cost:.6f} ({total_calls} call(s))[/green]") @app.command() From 3cad08ad8e0fe14225357a4d93feb72abec6d4a0 Mon Sep 17 00:00:00 2001 From: harumiWeb Date: Sat, 24 Jan 2026 18:18:54 +0900 Subject: [PATCH 16/38] feat: Update tax report question and truth data structure for improved clarity and consistency --- benchmark/data/manifest.json | 7 ++++-- benchmark/data/truth/tax_report_01.json | 32 +++++++++++-------------- 2 files changed, 19 insertions(+), 20 deletions(-) diff --git a/benchmark/data/manifest.json b/benchmark/data/manifest.json index 60f40df..f0a4cd4 100644 --- a/benchmark/data/manifest.json +++ b/benchmark/data/manifest.json @@ -46,10 +46,13 @@ "id": "tax_report_01", "type": "application_form", "xlsx": "data/raw/tax_report_01.xlsx", - "question": "このExcel帳票(令和○年度分 市民税・県民税申告書)について、次の3点を抽出し、JSONのみで返してください。\n\n(1) income_section_labels: 右側の「1 収入金額等」セクションに並んでいる収入項目のラベルを、上から順に配列で返してください。\n\n(2) deduction_section_labels: 右下の「4 所得から差し引かれる金額」セクションに並んでいる控除項目のラベルを、上から順に配列で返してください。\n\n(3) required_documents_note: 右端に縦書きで記載されている注意文のうち、「源泉徴収票、控除証明書などの必要書類(コピー可)は…」で始まる文を、そのまま1つの文字列として返してください。\n\n出力形式(厳守):\n{\n \"income_section_labels\": [\"...\", \"...\"],\n \"deduction_section_labels\": [\"...\", \"...\"],\n \"required_documents_note\": \"...\"\n}\n\n注意:\n- チェックボックス記号(□など)は含めないでください。\n- ラベルは帳票に印字されている見たままの文字列(空白や中黒を含む場合はそのまま)にしてください。\n- 余分な説明文やコードフェンスは付けず、JSONのみを返してください。", + "question": "この市民税・県民税申告書の右側に配置されている縦方向の帳票構造を解析してください。\n\n次の条件をすべて満たすJSONを返してください。\n\n1. 「収入金額」ブロックに含まれる項目名を、上から順に配列で列挙してください。\n2. 上記項目群を視覚的にまとめている「合計」項目名を1つ指定してください。\n3. 「所得から差し引かれる金額」ブロックに含まれる項目名を、上から順に配列で列挙してください。\n4. 
上記控除項目群を視覚的にまとめている「合計」項目名を1つ指定してください。\n\n制約:\n- 項目名は帳票に記載されている日本語表記をそのまま使用してください。\n- 数値、記号、注釈文は含めないでください。\n- 同一列・同一枠内にある項目同士の位置関係に基づいて判断してください。\n- JSONのみを返してください。\n\n出力形式:\n{\n \"income_items\": [\"...\", \"...\"],\n \"income_total\": \"...\",\n \"deduction_items\": [\"...\", \"...\"],\n \"deduction_total\": \"...\"\n}", "truth": "data/truth/tax_report_01.json", "sheet_scope": null, - "render": { "dpi": 220, "max_pages": 1 } + "render": { + "dpi": 240, + "max_pages": 1 + } }, { "id": "smartart_01", diff --git a/benchmark/data/truth/tax_report_01.json b/benchmark/data/truth/tax_report_01.json index 3ff18ba..174476d 100644 --- a/benchmark/data/truth/tax_report_01.json +++ b/benchmark/data/truth/tax_report_01.json @@ -1,33 +1,29 @@ { - "income_section_labels": [ - "事 業", - "農 業", + "income_items": [ + "事業(営業等)", + "事業(農業)", "不動産", - "利 子", - "配 当", - "給 与", + "利子", + "配当", + "給与", "公的年金等", - "業 務", - "そ の 他", - "合計(①+⑧+⑨)", - "総 合 譲 渡・一 時", - "合 計" + "業務", + "その他" ], - "deduction_section_labels": [ + "income_total": "合計", + "deduction_items": [ "社会保険料控除", "小規模企業共済等掛金控除", "生命保険料控除", "地震保険料控除", "寡婦、ひとり親控除", - "勤労学生、障害者控除", + "勤労学生控除", "配偶者(特別)控除", "扶養控除", - "特定親族特別控除", + "障害者控除", "基礎控除", - "⑬から㉕までの計", "雑損控除", - "医療費控除", - "合計(⑳+㉗+㉘)" + "医療費控除" ], - "required_documents_note": "源泉徴収票、控除証明書などの必要書類(コピー可)は、この申告書には貼り付けずに共に提出してください。" + "deduction_total": "合計" } From f00b408250fb879d535fbd25a68776274ea7d7ad Mon Sep 17 00:00:00 2001 From: harumiWeb Date: Sat, 24 Jan 2026 19:57:21 +0900 Subject: [PATCH 17/38] feat: Add normalization rules and scoring enhancements for improved evaluation accuracy --- benchmark/README.md | 11 ++ benchmark/data/normalization_rules.json | 32 ++++ benchmark/src/bench/cli.py | 65 +++++++- .../src/bench/eval/normalization_rules.py | 103 ++++++++++++ benchmark/src/bench/eval/score.py | 149 ++++++++++++++++++ .../src/bench/pipeline/exstruct_adapter.py | 2 +- 6 files changed, 360 insertions(+), 2 deletions(-) create mode 100644 benchmark/data/normalization_rules.json create mode 100644 benchmark/src/bench/eval/normalization_rules.py diff --git a/benchmark/README.md b/benchmark/README.md index a2d1bdc..5d88df2 100644 --- a/benchmark/README.md +++ b/benchmark/README.md @@ -37,6 +37,17 @@ Outputs: - outputs/results/results.csv - outputs/results/report.md +## Evaluation + +The evaluator now writes two tracks: + +- Exact: `score`, `score_ordered` (strict string match, current behavior) +- Normalized: `score_norm`, `score_norm_ordered` (applies case-specific rules) + +Normalization rules live in `data/normalization_rules.json` and are applied in +`bench.cli eval`. Publish these rules alongside the benchmark to keep the +normalized track transparent and reproducible. + ## Notes: - GPT-4o Responses API supports text and image inputs. 
See docs: diff --git a/benchmark/data/normalization_rules.json b/benchmark/data/normalization_rules.json new file mode 100644 index 0000000..64bdb49 --- /dev/null +++ b/benchmark/data/normalization_rules.json @@ -0,0 +1,32 @@ +{ + "cases": { + "tax_report_01": { + "alias_rules": [ + { + "canonical": "合計", + "aliases": ["⑬から㉕までの計", "13から25までの計"] + } + ], + "split_rules": [ + { + "trigger": "勤労学生、障害者控除", + "parts": ["勤労学生控除", "障害者控除"] + }, + { + "trigger": "勤労学生・障害者控除", + "parts": ["勤労学生控除", "障害者控除"] + } + ], + "composite_rules": [ + { + "canonical": "事業(営業等)", + "parts": ["事業", "営業等"] + }, + { + "canonical": "事業(農業)", + "parts": ["事業", "農業"] + } + ] + } + } +} diff --git a/benchmark/src/bench/cli.py b/benchmark/src/bench/cli.py index c32187d..ca5bcca 100644 --- a/benchmark/src/bench/cli.py +++ b/benchmark/src/bench/cli.py @@ -10,8 +10,14 @@ import typer from .eval.normalize import normalize_json_text +from .eval.normalization_rules import load_ruleset from .eval.report import write_results_csv -from .eval.score import key_score, key_score_ordered +from .eval.score import ( + key_score, + key_score_normalized, + key_score_ordered, + key_score_ordered_normalized, +) from .llm.openai_client import OpenAIResponsesClient from .manifest import Case, load_manifest from .paths import ( @@ -72,6 +78,8 @@ class ResultRow(BaseModel): model: str | None score: float score_ordered: float + score_norm: float | None = None + score_norm_ordered: float | None = None ok: bool input_tokens: int output_tokens: int @@ -353,6 +361,7 @@ def eval(case: str = "all", method: str = "all") -> None: methods = _select_methods(method) rows: list[ResultRow] = [] + ruleset = load_ruleset(DATA_DIR / "normalization_rules.json") for c in cases: truth_path = _resolve_case_path(c.truth, case_id=c.id, label="truth") @@ -370,15 +379,22 @@ def eval(case: str = "all", method: str = "all") -> None: if rec.get("method") in methods: latest[rec["method"]] = rec + rules = ruleset.for_case(c.id) for m, rec in latest.items(): ok = False score = 0.0 score_ordered = 0.0 + score_norm: float | None = None + score_norm_ordered: float | None = None err: str | None = None try: pred_obj = normalize_json_text(rec["text"]) score = key_score(truth, pred_obj) score_ordered = key_score_ordered(truth, pred_obj) + score_norm = key_score_normalized(truth, pred_obj, rules) + score_norm_ordered = key_score_ordered_normalized( + truth, pred_obj, rules + ) ok = score == 1.0 except Exception as exc: err = str(exc) @@ -391,6 +407,8 @@ def eval(case: str = "all", method: str = "all") -> None: model=rec.get("model"), score=score, score_ordered=score_ordered, + score_norm=score_norm, + score_norm_ordered=score_norm_ordered, ok=ok, input_tokens=int(rec.get("input_tokens", 0)), output_tokens=int(rec.get("output_tokens", 0)), @@ -423,8 +441,15 @@ def report() -> None: } if "score_ordered" in df.columns: agg["acc_ordered"] = ("score_ordered", "mean") + if "score_norm" in df.columns: + agg["acc_norm"] = ("score_norm", "mean") + if "score_norm_ordered" in df.columns: + agg["acc_norm_ordered"] = ("score_norm_ordered", "mean") g = df.groupby("method").agg(**agg).reset_index() + detail_dir = RESULTS_DIR / "detailed_reports" + detail_dir.mkdir(parents=True, exist_ok=True) + md_lines = [] md_lines.append("# Benchmark Report") md_lines.append("") @@ -432,12 +457,50 @@ def report() -> None: md_lines.append("") md_lines.append(g.to_markdown(index=False)) md_lines.append("") + md_lines.append("## Detailed reports") + md_lines.append("") + for case_id in 
sorted(df["case_id"].unique()): + md_lines.append(f"- detailed_reports/report_{case_id}.md") + md_lines.append("") out_md = RESULTS_DIR / "report.md" out_md.write_text("\n".join(md_lines), encoding="utf-8") print(f"[green]Wrote {out_md}[/green]") print("[cyan]Summary (from report.md)[/cyan]") print(g.to_markdown(index=False)) + # Per-case detail reports + detail_cols = [ + "method", + "case_id", + "type", + "model", + "score", + "score_ordered", + "score_norm", + "score_norm_ordered", + "input_tokens", + "output_tokens", + "cost_usd", + "error", + ] + available_cols = [c for c in detail_cols if c in df.columns] + + for case_id in sorted(df["case_id"].unique()): + case_df = df[df["case_id"] == case_id][available_cols] + case_lines = [ + "# Benchmark Report", + "", + f"## Details: {case_id}", + "", + case_df.to_markdown(index=False), + "", + ] + case_md = detail_dir / f"report_{case_id}.md" + case_md.write_text("\n".join(case_lines), encoding="utf-8") + print(f"[green]Wrote {case_md}[/green]") + print(f"[cyan]Details ({case_id})[/cyan]") + print(case_df.to_markdown(index=False)) + if __name__ == "__main__": app() diff --git a/benchmark/src/bench/eval/normalization_rules.py b/benchmark/src/bench/eval/normalization_rules.py new file mode 100644 index 0000000..9b08cfd --- /dev/null +++ b/benchmark/src/bench/eval/normalization_rules.py @@ -0,0 +1,103 @@ +from __future__ import annotations + +import json +import re +import unicodedata +from pathlib import Path + +from pydantic import BaseModel, Field + + +class AliasRule(BaseModel): + """Canonical label with its acceptable aliases.""" + + canonical: str + aliases: list[str] = Field(default_factory=list) + + +class SplitRule(BaseModel): + """Split a combined label into multiple canonical labels.""" + + trigger: str + parts: list[str] + + +class CompositeRule(BaseModel): + """Match a canonical label when all parts appear in prediction.""" + + canonical: str + parts: list[str] + + +class NormalizationRules(BaseModel): + """Normalization rules for a single case.""" + + alias_rules: list[AliasRule] = Field(default_factory=list) + split_rules: list[SplitRule] = Field(default_factory=list) + composite_rules: list[CompositeRule] = Field(default_factory=list) + + +class NormalizationRuleset(BaseModel): + """Normalization rules keyed by case id.""" + + cases: dict[str, NormalizationRules] = Field(default_factory=dict) + + def for_case(self, case_id: str) -> NormalizationRules: + """Return rules for the given case id (or empty rules if missing).""" + return self.cases.get(case_id, NormalizationRules()) + + +class RuleIndex(BaseModel): + """Prebuilt normalized lookup tables for scoring.""" + + alias_map: dict[str, str] = Field(default_factory=dict) + split_map: dict[str, list[str]] = Field(default_factory=dict) + composite_map: dict[str, list[list[str]]] = Field(default_factory=dict) + + +def _strip_circled_numbers(text: str) -> str: + """Remove circled-number characters for robust matching.""" + return "".join(ch for ch in text if unicodedata.category(ch) != "No") + + +def normalize_label(text: str) -> str: + """Normalize labels for comparison.""" + text = _strip_circled_numbers(text) + text = unicodedata.normalize("NFKC", text) + text = re.sub(r"\s+", " ", text).strip() + return text + + +def build_rule_index(rules: NormalizationRules) -> RuleIndex: + """Build normalized lookup tables from rules.""" + alias_map: dict[str, str] = {} + for rule in rules.alias_rules: + canonical = normalize_label(rule.canonical) + alias_map[canonical] = canonical + for alias 
in rule.aliases: + alias_map[normalize_label(alias)] = canonical + + split_map: dict[str, list[str]] = { + normalize_label(rule.trigger): [normalize_label(p) for p in rule.parts] + for rule in rules.split_rules + } + + composite_map: dict[str, list[list[str]]] = {} + for rule in rules.composite_rules: + canonical = normalize_label(rule.canonical) + parts = [normalize_label(p) for p in rule.parts] + composite_map.setdefault(canonical, []).append(parts) + + return RuleIndex( + alias_map=alias_map, + split_map=split_map, + composite_map=composite_map, + ) + + +def load_ruleset(path: Path) -> NormalizationRuleset: + """Load normalization ruleset from JSON file.""" + if not path.exists(): + return NormalizationRuleset() + payload = json.loads(path.read_text(encoding="utf-8")) + return NormalizationRuleset(**payload) diff --git a/benchmark/src/bench/eval/score.py b/benchmark/src/bench/eval/score.py index 3a6b407..4c81119 100644 --- a/benchmark/src/bench/eval/score.py +++ b/benchmark/src/bench/eval/score.py @@ -5,6 +5,12 @@ from typing import Any from .exact_match import canonical, exact_match +from .normalization_rules import ( + NormalizationRules, + RuleIndex, + build_rule_index, + normalize_label, +) def _list_score(truth_list: list[Any], pred_list: Any) -> float: @@ -53,6 +59,22 @@ def _normalize_element(value: Any) -> str: return _normalize_scalar(value) +def _normalize_scalar_with_rules(value: Any, index: RuleIndex | None) -> str: + """Normalize scalar values with optional normalization rules.""" + text = normalize_label(str(value)) + if index is None: + return text + return index.alias_map.get(text, text) + + +def _expand_pred_item(value: Any, index: RuleIndex) -> list[str]: + """Expand a predicted list item using split rules and aliases.""" + text = _normalize_scalar_with_rules(value, index) + if text in index.split_map: + return index.split_map[text] + return [text] + + def _list_score_ordered(truth_list: list[Any], pred_list: Any) -> float: """Compute an order-aware partial match score for lists. 
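
As a quick sanity check of the alias machinery above, the sketch below resolves one `tax_report_01` alias through `build_rule_index` (assumes the benchmark package is installed via `pip install -e .`; payloads are toy data):

```python
# Sketch: circled numerals carry Unicode category "No", so normalize_label
# strips them before the alias lookup resolves to the canonical label.
from bench.eval.normalization_rules import (
    AliasRule,
    NormalizationRules,
    build_rule_index,
    normalize_label,
)

rules = NormalizationRules(
    alias_rules=[AliasRule(canonical="合計", aliases=["⑬から㉕までの計"])]
)
index = build_rule_index(rules)

token = normalize_label("⑬から㉕までの計")
print(token)                       # "からまでの計"
print(index.alias_map.get(token))  # "合計"
```
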
@@ -73,6 +95,48 @@ def _list_score_ordered(truth_list: list[Any], pred_list: Any) -> float: return lcs_len / len(truth_norm) +def _list_score_normalized( + truth_list: list[Any], pred_list: Any, index: RuleIndex +) -> float: + """Compute a partial match score for lists with normalization rules.""" + if not isinstance(pred_list, list): + return 0.0 + if not truth_list: + return 0.0 + truth_norm = [_normalize_scalar_with_rules(v, index) for v in truth_list] + pred_expanded: list[str] = [] + for v in pred_list: + pred_expanded.extend(_expand_pred_item(v, index)) + pred_set = set(pred_expanded) + matched = 0 + for t in truth_norm: + if t in pred_set: + matched += 1 + continue + if t in index.composite_map: + for parts in index.composite_map[t]: + if all(p in pred_set for p in parts): + matched += 1 + break + return matched / len(truth_norm) + + +def _list_score_ordered_normalized( + truth_list: list[Any], pred_list: Any, index: RuleIndex +) -> float: + """Compute order-aware list score with normalization rules.""" + if not isinstance(pred_list, list): + return 0.0 + if not truth_list: + return 0.0 + truth_norm = [_normalize_scalar_with_rules(v, index) for v in truth_list] + pred_expanded: list[str] = [] + for v in pred_list: + pred_expanded.extend(_expand_pred_item(v, index)) + lcs_len = _lcs_length(truth_norm, pred_expanded) + return lcs_len / len(truth_norm) + + def _lcs_length(a: list[str], b: list[str]) -> int: """Compute the length of the longest common subsequence.""" if not a or not b: @@ -118,6 +182,38 @@ def _dict_score_ordered(truth_dict: dict[str, Any], pred_dict: dict[str, Any]) - return score_sum / total +def _dict_score_normalized( + truth_dict: dict[str, Any], pred_dict: dict[str, Any], index: RuleIndex +) -> float: + """Compute a key-level score for nested dicts with normalization rules.""" + total = len(truth_dict) + if total == 0: + return 0.0 + score_sum = 0.0 + for key, truth_val in truth_dict.items(): + if key not in pred_dict: + continue + pred_val = pred_dict[key] + score_sum += _value_score_normalized(truth_val, pred_val, index, ordered=False) + return score_sum / total + + +def _dict_score_ordered_normalized( + truth_dict: dict[str, Any], pred_dict: dict[str, Any], index: RuleIndex +) -> float: + """Compute a key-level score with normalized, order-aware list scoring.""" + total = len(truth_dict) + if total == 0: + return 0.0 + score_sum = 0.0 + for key, truth_val in truth_dict.items(): + if key not in pred_dict: + continue + pred_val = pred_dict[key] + score_sum += _value_score_normalized(truth_val, pred_val, index, ordered=True) + return score_sum / total + + def _value_score(truth: Any, pred: Any, *, ordered: bool) -> float: """Score a value with optional list ordering.""" if isinstance(truth, dict): @@ -129,6 +225,29 @@ def _value_score(truth: Any, pred: Any, *, ordered: bool) -> float: return 1.0 if exact_match(truth, pred) else 0.0 +def _value_score_normalized( + truth: Any, pred: Any, index: RuleIndex, *, ordered: bool +) -> float: + """Score a value using normalization rules.""" + if isinstance(truth, dict): + if not isinstance(pred, dict): + return 0.0 + return ( + _dict_score_ordered_normalized(truth, pred, index) + if ordered + else _dict_score_normalized(truth, pred, index) + ) + if isinstance(truth, list): + return ( + _list_score_ordered_normalized(truth, pred, index) + if ordered + else _list_score_normalized(truth, pred, index) + ) + truth_norm = _normalize_scalar_with_rules(truth, index) + pred_norm = _normalize_scalar_with_rules(pred, index) + return 
1.0 if truth_norm == pred_norm else 0.0 + + def key_score(truth: Any, pred: Any) -> float: """Compute a key-level score against the truth payload. @@ -159,3 +278,33 @@ def key_score_ordered(truth: Any, pred: Any) -> float: if isinstance(truth, list): return _list_score_ordered(truth, pred) return 1.0 if exact_match(truth, pred) else 0.0 + + +def key_score_normalized(truth: Any, pred: Any, rules: NormalizationRules) -> float: + """Compute a normalized score using optional rules.""" + index = build_rule_index(rules) + if isinstance(truth, dict): + if not isinstance(pred, dict): + return 0.0 + return _dict_score_normalized(truth, pred, index) + if isinstance(truth, list): + return _list_score_normalized(truth, pred, index) + truth_norm = _normalize_scalar_with_rules(truth, index) + pred_norm = _normalize_scalar_with_rules(pred, index) + return 1.0 if truth_norm == pred_norm else 0.0 + + +def key_score_ordered_normalized( + truth: Any, pred: Any, rules: NormalizationRules +) -> float: + """Compute an order-aware normalized score using optional rules.""" + index = build_rule_index(rules) + if isinstance(truth, dict): + if not isinstance(pred, dict): + return 0.0 + return _dict_score_ordered_normalized(truth, pred, index) + if isinstance(truth, list): + return _list_score_ordered_normalized(truth, pred, index) + truth_norm = _normalize_scalar_with_rules(truth, index) + pred_norm = _normalize_scalar_with_rules(pred, index) + return 1.0 if truth_norm == pred_norm else 0.0 diff --git a/benchmark/src/bench/pipeline/exstruct_adapter.py b/benchmark/src/bench/pipeline/exstruct_adapter.py index 3edccf3..720b7a9 100644 --- a/benchmark/src/bench/pipeline/exstruct_adapter.py +++ b/benchmark/src/bench/pipeline/exstruct_adapter.py @@ -66,7 +66,7 @@ def extract_exstruct( config: Optional ExStruct text extraction configuration. 
""" resolved_config = config or ExstructTextConfig() - engine = ExStructEngine(options=StructOptions(include_merged_values_in_rows=False)) + engine = ExStructEngine(options=StructOptions(include_merged_values_in_rows=True)) workbook = engine.extract(xlsx_path, mode=resolved_config.mode) workbook = _filter_workbook_sheets(workbook, sheet_scope) payload = workbook.to_json( From 349f622782c8b1012676416f7ac497b290c7264a Mon Sep 17 00:00:00 2001 From: harumiWeb Date: Sat, 24 Jan 2026 20:07:59 +0900 Subject: [PATCH 18/38] feat: Add alias rules for certificate of employment to normalization rules --- benchmark/data/normalization_rules.json | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/benchmark/data/normalization_rules.json b/benchmark/data/normalization_rules.json index 64bdb49..dbf1f07 100644 --- a/benchmark/data/normalization_rules.json +++ b/benchmark/data/normalization_rules.json @@ -1,5 +1,17 @@ { "cases": { + "certificate_of_employment_01": { + "alias_rules": [ + { + "canonical": "※本証明書の内容について、就労先事業者等に無断で作成又は改変を行ったときは、刑法上の罪に問われる場合があります。", + "aliases": [ + "※本証明書の内容について、就労先事業者等に無断で作成し又は改変を行ったときには、刑法上の罪に問われる場合があります。" + ] + } + ], + "split_rules": [], + "composite_rules": [] + }, "tax_report_01": { "alias_rules": [ { From 10bb9da5d5a6b12f507870622acc75539554ed32 Mon Sep 17 00:00:00 2001 From: harumiWeb Date: Sat, 24 Jan 2026 20:16:07 +0900 Subject: [PATCH 19/38] feat: Enhance benchmark report with interpretation guidelines for accuracy evaluation --- benchmark/README.md | 16 +++++++++++++ benchmark/src/bench/cli.py | 46 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+) diff --git a/benchmark/README.md b/benchmark/README.md index 5d88df2..a2ace85 100644 --- a/benchmark/README.md +++ b/benchmark/README.md @@ -37,6 +37,22 @@ Outputs: - outputs/results/results.csv - outputs/results/report.md +## How to interpret results (public guide) + +This benchmark reports two evaluation tracks to keep comparisons fair: + +- Exact: strict string match with no normalization. +- Normalized: applies case-specific rules in `data/normalization_rules.json` to + absorb formatting differences (aliases, split/composite labels). + +Recommended interpretation: + +- Use **Exact** to compare end-to-end string fidelity (best for literal extraction). +- Use **Normalized** to compare **document understanding** across methods. +- When methods disagree between tracks, favor Normalized for Excel-heavy layouts + where labels are split/merged or phrased differently. +- Always cite both accuracy and cost metrics when presenting results publicly. + ## Evaluation The evaluator now writes two tracks: diff --git a/benchmark/src/bench/cli.py b/benchmark/src/bench/cli.py index ca5bcca..97ea1c8 100644 --- a/benchmark/src/bench/cli.py +++ b/benchmark/src/bench/cli.py @@ -453,6 +453,52 @@ def report() -> None: md_lines = [] md_lines.append("# Benchmark Report") md_lines.append("") + md_lines.append( + "This report summarizes extraction accuracy for each method on the benchmark cases." + ) + md_lines.append( + "Scores are computed per case and aggregated by method. Both exact and normalized" + ) + md_lines.append( + "tracks are reported to ensure fair comparison across formatting variations." 
+ ) + md_lines.append("") + md_lines.append("## How to interpret results (public guide)") + md_lines.append("") + md_lines.append("- Exact: strict string match with no normalization.") + md_lines.append( + "- Normalized: applies case-specific rules in data/normalization_rules.json to" + ) + md_lines.append( + " absorb formatting differences (aliases, split/composite labels)." + ) + md_lines.append("") + md_lines.append("Recommended interpretation:") + md_lines.append("") + md_lines.append( + "- Use Exact to compare end-to-end string fidelity (best for literal extraction)." + ) + md_lines.append( + "- Use Normalized to compare document understanding across methods." + ) + md_lines.append( + "- When tracks disagree, favor Normalized for Excel-heavy layouts where labels" + ) + md_lines.append(" are split/merged or phrased differently.") + md_lines.append( + "- Always cite both accuracy and cost metrics in public comparisons." + ) + md_lines.append("") + md_lines.append("## Evaluation tracks") + md_lines.append("") + md_lines.append("- Exact: strict string match without any normalization.") + md_lines.append( + "- Normalized: applies case-specific normalization rules (aliases, split/composite)" + ) + md_lines.append( + " defined in data/normalization_rules.json to absorb format and wording variations." + ) + md_lines.append("") md_lines.append("## Summary by method") md_lines.append("") md_lines.append(g.to_markdown(index=False)) From 5ce469623bfeca0aa1beae74a035d5017c2e8460 Mon Sep 17 00:00:00 2001 From: harumiWeb Date: Sat, 24 Jan 2026 20:17:45 +0900 Subject: [PATCH 20/38] feat: Move summary output to the end of the report function for better clarity --- benchmark/src/bench/cli.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/benchmark/src/bench/cli.py b/benchmark/src/bench/cli.py index 97ea1c8..5990c19 100644 --- a/benchmark/src/bench/cli.py +++ b/benchmark/src/bench/cli.py @@ -511,8 +511,6 @@ def report() -> None: out_md = RESULTS_DIR / "report.md" out_md.write_text("\n".join(md_lines), encoding="utf-8") print(f"[green]Wrote {out_md}[/green]") - print("[cyan]Summary (from report.md)[/cyan]") - print(g.to_markdown(index=False)) # Per-case detail reports detail_cols = [ @@ -547,6 +545,9 @@ def report() -> None: print(f"[cyan]Details ({case_id})[/cyan]") print(case_df.to_markdown(index=False)) + print("[magenta]Summary (from report.md)[/magenta]") + print(g.to_markdown(index=False)) + if __name__ == "__main__": app() From de3acfd410240f6db4af63a0f5b205d0358581f2 Mon Sep 17 00:00:00 2001 From: harumiWeb Date: Sat, 24 Jan 2026 22:18:40 +0900 Subject: [PATCH 21/38] feat: Add evaluation protocol to README and report function for reproducibility --- benchmark/README.md | 19 +++++++++++++++++++ benchmark/src/bench/cli.py | 19 +++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/benchmark/README.md b/benchmark/README.md index a2ace85..cfe09d1 100644 --- a/benchmark/README.md +++ b/benchmark/README.md @@ -37,6 +37,25 @@ Outputs: - outputs/results/results.csv - outputs/results/report.md +## Evaluation protocol (public) + +To ensure reproducibility and fair comparison, follow these fixed settings: + +- Model: gpt-4o (Responses API) +- Temperature: 0.0 +- Prompt: fixed in `bench/llm/openai_client.py` +- Input contexts: generated by `bench.cli extract` using the same sources for all methods +- Normalization: optional normalized track uses `data/normalization_rules.json` +- Evaluation: `bench.cli eval` produces both Exact and Normalized scores +- Report: `bench.cli 
report` generates `report.md` and per-case detailed reports + +Recommended disclosure when publishing results: + +- Model name + version, temperature, and date of run +- Full `normalization_rules.json` used for normalized scores +- Cost/token estimation method +- Any skipped cases and the reason (missing files, extraction failures) + ## How to interpret results (public guide) This benchmark reports two evaluation tracks to keep comparisons fair: diff --git a/benchmark/src/bench/cli.py b/benchmark/src/bench/cli.py index 5990c19..ae1facd 100644 --- a/benchmark/src/bench/cli.py +++ b/benchmark/src/bench/cli.py @@ -463,6 +463,25 @@ def report() -> None: "tracks are reported to ensure fair comparison across formatting variations." ) md_lines.append("") + md_lines.append("## Evaluation protocol (public)") + md_lines.append("") + md_lines.append("Fixed settings for reproducibility:") + md_lines.append("") + md_lines.append("- Model: gpt-4o (Responses API)") + md_lines.append("- Temperature: 0.0") + md_lines.append("- Prompt: fixed in bench/llm/openai_client.py") + md_lines.append("- Input contexts: generated by bench.cli extract") + md_lines.append("- Normalization: data/normalization_rules.json (optional track)") + md_lines.append("- Evaluation: bench.cli eval (Exact + Normalized)") + md_lines.append("- Report: bench.cli report (summary + per-case)") + md_lines.append("") + md_lines.append("Recommended disclosure when publishing results:") + md_lines.append("") + md_lines.append("- Model name + version, temperature, and date of run") + md_lines.append("- Full normalization_rules.json used for normalized scores") + md_lines.append("- Cost/token estimation method") + md_lines.append("- Any skipped cases and the reason (missing files, failures)") + md_lines.append("") md_lines.append("## How to interpret results (public guide)") md_lines.append("") md_lines.append("- Exact: strict string match with no normalization.") From 0a666c6da9ca2da46f8dbc6845ec2d6b99ce517a Mon Sep 17 00:00:00 2001 From: harumiWeb Date: Sat, 24 Jan 2026 22:27:03 +0900 Subject: [PATCH 22/38] feat: Add reproducibility scripts for Windows PowerShell and macOS/Linux --- benchmark/README.md | 34 ++++++++++++++++++ benchmark/scripts/reproduce.ps1 | 60 ++++++++++++++++++++++++++++++++ benchmark/scripts/reproduce.sh | 61 +++++++++++++++++++++++++++++++++ 3 files changed, 155 insertions(+) create mode 100644 benchmark/scripts/reproduce.ps1 create mode 100644 benchmark/scripts/reproduce.sh diff --git a/benchmark/README.md b/benchmark/README.md index cfe09d1..8b077a2 100644 --- a/benchmark/README.md +++ b/benchmark/README.md @@ -29,6 +29,40 @@ pip install -e . 
make all
```

+## Reproducibility script (Windows PowerShell)
+
+```powershell
+.\scripts\reproduce.ps1
+```
+
+Options:
+
+- `-Case` (default: `all`)
+- `-Method` (default: `all`)
+- `-Model` (default: `gpt-4o`)
+- `-Temperature` (default: `0.0`)
+- `-SkipAsk` (skip LLM calls; uses existing responses)
+
+## Reproducibility script (macOS/Linux)
+
+```bash
+./scripts/reproduce.sh
+```
+
+If you see a permission error, run:
+
+```bash
+chmod +x ./scripts/reproduce.sh
+```
+
+Options:
+
+- `--case` (default: `all`)
+- `--method` (default: `all`)
+- `--model` (default: `gpt-4o`)
+- `--temperature` (default: `0.0`)
+- `--skip-ask` (skip LLM calls; uses existing responses)
+
Outputs:

- outputs/extracted/\* : extracted context (text or images)
diff --git a/benchmark/scripts/reproduce.ps1 b/benchmark/scripts/reproduce.ps1
new file mode 100644
index 0000000..6f3c2d7
--- /dev/null
+++ b/benchmark/scripts/reproduce.ps1
@@ -0,0 +1,62 @@
+param(
+    [string]$Case = "all",
+    [string]$Method = "all",
+    [string]$Model = "gpt-4o",
+    [double]$Temperature = 0.0,
+    [switch]$SkipAsk
+)
+
+# param() must be the first statement in a PowerShell script, so strict mode
+# and the error preference are configured after the parameter block.
+Set-StrictMode -Version Latest
+$ErrorActionPreference = "Stop"
+
+function Write-Info {
+    param([string]$Message)
+    Write-Host "[reproduce] $Message"
+}
+
+$scriptDir = Split-Path -Parent $MyInvocation.MyCommand.Path
+$benchDir = Split-Path -Parent $scriptDir
+$repoDir = Split-Path -Parent $benchDir
+
+Push-Location $benchDir
+
+try {
+    if (-not (Test-Path ".env")) {
+        Write-Info "Copying .env.example -> .env (remember to set OPENAI_API_KEY)."
+        Copy-Item ".env.example" ".env"
+    }
+
+    if (-not (Test-Path ".venv")) {
+        Write-Info "Creating virtual environment."
+        python -m venv .venv
+    }
+
+    $python = ".venv\Scripts\python"
+    if (-not (Test-Path $python)) {
+        throw "Python venv not found at $python"
+    }
+
+    Write-Info "Installing dependencies."
+    & $python -m pip install -e $repoDir
+    & $python -m pip install -e .
+
+    Write-Info "Extracting contexts."
+    & $python -m bench.cli extract --case $Case --method $Method
+
+    if (-not $SkipAsk) {
+        Write-Info "Running LLM inference."
+        & $python -m bench.cli ask --case $Case --method $Method --model $Model --temperature $Temperature
+    } else {
+        Write-Info "Skipping LLM inference."
+    }
+
+    Write-Info "Evaluating results."
+    & $python -m bench.cli eval --case $Case --method $Method
+
+    Write-Info "Generating reports."
+    & $python -m bench.cli report
+} finally {
+    Pop-Location
+}
diff --git a/benchmark/scripts/reproduce.sh b/benchmark/scripts/reproduce.sh
new file mode 100644
index 0000000..7c8affd
--- /dev/null
+++ b/benchmark/scripts/reproduce.sh
@@ -0,0 +1,61 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+CASE="all"
+METHOD="all"
+MODEL="gpt-4o"
+TEMPERATURE="0.0"
+SKIP_ASK="false"
+
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --case) CASE="$2"; shift 2 ;;
+        --method) METHOD="$2"; shift 2 ;;
+        --model) MODEL="$2"; shift 2 ;;
+        --temperature) TEMPERATURE="$2"; shift 2 ;;
+        --skip-ask) SKIP_ASK="true"; shift ;;
+        *) echo "Unknown arg: $1" >&2; exit 1 ;;
+    esac
+done
+
+script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+bench_dir="$(dirname "$script_dir")"
+repo_dir="$(dirname "$bench_dir")"
+
+cd "$bench_dir"
+
+if [[ ! -f ".env" ]]; then
+    echo "[reproduce] Copying .env.example -> .env (remember to set OPENAI_API_KEY)."
+    cp .env.example .env
+fi
+
+if [[ ! -d ".venv" ]]; then
+    echo "[reproduce] Creating virtual environment."
+    python -m venv .venv
+fi
+
+python_bin=".venv/bin/python"
+if [[ !
-f "$python_bin" ]]; then + echo "Python venv not found at $python_bin" >&2 + exit 1 +fi + +echo "[reproduce] Installing dependencies." +"$python_bin" -m pip install -e "$repo_dir" +"$python_bin" -m pip install -e . + +echo "[reproduce] Extracting contexts." +"$python_bin" -m bench.cli extract --case "$CASE" --method "$METHOD" + +if [[ "$SKIP_ASK" == "true" ]]; then + echo "[reproduce] Skipping LLM inference." +else + echo "[reproduce] Running LLM inference." + "$python_bin" -m bench.cli ask --case "$CASE" --method "$METHOD" --model "$MODEL" --temperature "$TEMPERATURE" +fi + +echo "[reproduce] Evaluating results." +"$python_bin" -m bench.cli eval --case "$CASE" --method "$METHOD" + +echo "[reproduce] Generating reports." +"$python_bin" -m bench.cli report From 9cb9571b261762cec7c0e243ee7a94d7c73b1306 Mon Sep 17 00:00:00 2001 From: harumiWeb Date: Sat, 24 Jan 2026 23:07:42 +0900 Subject: [PATCH 23/38] feat: Add normalization rules and truth data for heatstroke and workflow cases --- benchmark/data/manifest.json | 24 +++ benchmark/data/normalization_rules.json | 30 +++ benchmark/data/truth/heatstroke_flow_01.json | 60 ++++++ benchmark/data/truth/workflow_01.json | 84 ++++++++ benchmark/src/bench/cli.py | 29 +++ .../src/bench/eval/normalization_rules.py | 16 ++ benchmark/src/bench/eval/score.py | 192 +++++++++++++++++- 7 files changed, 426 insertions(+), 9 deletions(-) create mode 100644 benchmark/data/truth/heatstroke_flow_01.json create mode 100644 benchmark/data/truth/workflow_01.json diff --git a/benchmark/data/manifest.json b/benchmark/data/manifest.json index f0a4cd4..e37cb26 100644 --- a/benchmark/data/manifest.json +++ b/benchmark/data/manifest.json @@ -77,6 +77,30 @@ "dpi": 220, "max_pages": 1 } + }, + { + "id": "heatstroke_flow_01", + "type": "flowchart", + "xlsx": "data/raw/heatstroke_flow_01.xlsx", + "question": "このExcelに記載されている熱中症対応フローについて、上から順に各対応ステップを抽出してください。各ステップについて、step_name(工程名)、description(内容要約)、special_conditions(条件や注意事項がある場合のみ配列で記載)を含むJSONを、次の形式のみで返してください。\n\n{\n \"steps\": [\n {\n \"step_name\": \"...\",\n \"description\": \"...\",\n \"special_conditions\": [\"...\"]\n }\n ]\n}", + "truth": "data/truth/heatstroke_flow_01.json", + "sheet_scope": null, + "render": { + "dpi": 200, + "max_pages": 1 + } + }, + { + "id": "workflow_01", + "type": "workflow", + "xlsx": "data/raw/workflow_01.xlsx", + "question": "このExcelに記載されている業務フロー図(ネット注文フローチャート)について、工程を上から順に整理してください。各工程について、actor(実行主体)、step_name(工程名)、next_steps(次に進む工程名の配列)を含むJSONを、次の形式のみで返してください。\n\n{\n \"steps\": [\n {\n \"actor\": \"お客様|当社\",\n \"step_name\": \"...\",\n \"next_steps\": [\"...\"]\n }\n ]\n}", + "truth": "data/truth/workflow_01.json", + "sheet_scope": null, + "render": { + "dpi": 200, + "max_pages": 1 + } } ] } diff --git a/benchmark/data/normalization_rules.json b/benchmark/data/normalization_rules.json index dbf1f07..b20c455 100644 --- a/benchmark/data/normalization_rules.json +++ b/benchmark/data/normalization_rules.json @@ -1,5 +1,35 @@ { "cases": { + "heatstroke_flow_01": { + "alias_rules": [], + "split_rules": [], + "composite_rules": [], + "list_object_rules": [ + { + "list_key": "steps", + "string_fields": ["step_name"], + "string_fields_contains": [], + "list_fields_contains": [], + "strip_prefix": { + "step_name": "^[0-9]+[\\..]\\s*" + } + } + ] + }, + "workflow_01": { + "alias_rules": [], + "split_rules": [], + "composite_rules": [], + "list_object_rules": [ + { + "list_key": "steps", + "string_fields": ["step_name"], + "string_fields_contains": [], + "list_fields_contains": [], + "strip_prefix": 
{} + } + ] + }, "certificate_of_employment_01": { "alias_rules": [ { diff --git a/benchmark/data/truth/heatstroke_flow_01.json b/benchmark/data/truth/heatstroke_flow_01.json new file mode 100644 index 0000000..54ce1a0 --- /dev/null +++ b/benchmark/data/truth/heatstroke_flow_01.json @@ -0,0 +1,60 @@ +{ + "steps": [ + { + "step_name": "発見", + "description": "熱中症が疑われる症状があるかを確認する。", + "special_conditions": [ + "めまい", + "失神", + "筋肉痛", + "筋肉の硬直", + "大量の発汗", + "頭痛", + "嘔吐", + "意識障害", + "けいれん", + "高体温" + ] + }, + { + "step_name": "報告", + "description": "作業管理者および緊急連絡先へ状況を報告する。", + "special_conditions": [] + }, + { + "step_name": "初期対応", + "description": "涼しい場所への移動、水分補給、体を冷やすなどの応急処置を行う。", + "special_conditions": [ + "WBGT値が28度以上の場合は作業を中断する", + "気温が31度以上の場合は作業を中断する" + ] + }, + { + "step_name": "医療機関搬送・救急要請", + "description": "症状に応じて医療機関へ搬送するか救急要請を行う。", + "special_conditions": [ + "意識がない場合は119番通報する", + "応答が曖昧な場合は119番通報する", + "高熱が続く場合は119番通報する", + "けいれんなど重症の兆候がある場合は119番通報する" + ] + }, + { + "step_name": "事後対応・記録", + "description": "発生状況や対応内容を記録し、保存および定期的な見直しを行う。", + "special_conditions": [ + "発生日時", + "場所", + "WBGT値", + "気温", + "作業内容", + "作業時間", + "症状", + "初期対応内容", + "報告先", + "搬送有無", + "最終対応" + ] + } + ] +} diff --git a/benchmark/data/truth/workflow_01.json b/benchmark/data/truth/workflow_01.json new file mode 100644 index 0000000..ea7d3c3 --- /dev/null +++ b/benchmark/data/truth/workflow_01.json @@ -0,0 +1,84 @@ +{ + "steps": [ + { + "actor": "お客様", + "step_name": "商品検索", + "next_steps": ["検討"] + }, + { + "actor": "当社", + "step_name": "商品情報を表示", + "next_steps": ["検討"] + }, + { + "actor": "お客様", + "step_name": "検討", + "next_steps": ["キャンセル", "カートに追加"] + }, + { + "actor": "お客様", + "step_name": "キャンセル", + "next_steps": [] + }, + { + "actor": "お客様", + "step_name": "カートに追加", + "next_steps": ["在庫確認"] + }, + { + "actor": "当社", + "step_name": "在庫確認", + "next_steps": ["レジに進む"] + }, + { + "actor": "お客様", + "step_name": "レジに進む", + "next_steps": ["支払い方法の選択"] + }, + { + "actor": "お客様", + "step_name": "支払い方法の選択", + "next_steps": ["支払いの案内"] + }, + { + "actor": "当社", + "step_name": "支払いの案内", + "next_steps": ["支払い処理"] + }, + { + "actor": "当社", + "step_name": "支払い処理", + "next_steps": ["注文の確定"] + }, + { + "actor": "お客様", + "step_name": "注文の確定", + "next_steps": ["配送先入力"] + }, + { + "actor": "お客様", + "step_name": "配送先入力", + "next_steps": ["配送先確認"] + }, + { + "actor": "当社", + "step_name": "配送先確認", + "next_steps": ["注文確認メールを送信"] + }, + { + "actor": "当社", + "step_name": "注文確認メールを送信", + "next_steps": ["商品を準備・発送"] + }, + { + "actor": "当社", + "step_name": "商品を準備・発送", + "next_steps": ["商品受取"] + }, + { + "actor": "お客様", + "step_name": "商品受取", + "next_steps": [] + } + ] +} diff --git a/benchmark/src/bench/cli.py b/benchmark/src/bench/cli.py index ae1facd..f2d43bf 100644 --- a/benchmark/src/bench/cli.py +++ b/benchmark/src/bench/cli.py @@ -522,6 +522,35 @@ def report() -> None: md_lines.append("") md_lines.append(g.to_markdown(index=False)) md_lines.append("") + md_lines.append("## Normalization leniency summary") + md_lines.append("") + ruleset = load_ruleset(DATA_DIR / "normalization_rules.json") + if ruleset.cases: + summary_rows: list[dict[str, str | int]] = [] + for case_id, rules in sorted(ruleset.cases.items()): + details = [] + for rule in rules.list_object_rules: + parts = [ + f"strings={','.join(rule.string_fields) or '-'}", + f"strings_contains={','.join(rule.string_fields_contains) or '-'}", + f"lists_contains={','.join(rule.list_fields_contains) or '-'}", + f"strip_prefix={','.join(rule.strip_prefix.keys()) or '-'}", + 
] + details.append(f"{rule.list_key}({'; '.join(parts)})") + summary_rows.append( + { + "case_id": case_id, + "alias_rules": len(rules.alias_rules), + "split_rules": len(rules.split_rules), + "composite_rules": len(rules.composite_rules), + "list_object_rules": len(rules.list_object_rules), + "details": " | ".join(details) if details else "-", + } + ) + md_lines.append(pd.DataFrame(summary_rows).to_markdown(index=False)) + else: + md_lines.append("_No normalization rules defined._") + md_lines.append("") md_lines.append("## Detailed reports") md_lines.append("") for case_id in sorted(df["case_id"].unique()): diff --git a/benchmark/src/bench/eval/normalization_rules.py b/benchmark/src/bench/eval/normalization_rules.py index 9b08cfd..2958597 100644 --- a/benchmark/src/bench/eval/normalization_rules.py +++ b/benchmark/src/bench/eval/normalization_rules.py @@ -29,12 +29,28 @@ class CompositeRule(BaseModel): parts: list[str] +class ListObjectRule(BaseModel): + """Normalization rules for lists of objects.""" + + list_key: str + string_fields: list[str] = Field(default_factory=list) + string_fields_contains: list[str] = Field(default_factory=list) + list_fields: list[str] = Field(default_factory=list) + list_fields_contains: list[str] = Field(default_factory=list) + strip_prefix: dict[str, str] = Field(default_factory=dict) + + class NormalizationRules(BaseModel): """Normalization rules for a single case.""" alias_rules: list[AliasRule] = Field(default_factory=list) split_rules: list[SplitRule] = Field(default_factory=list) composite_rules: list[CompositeRule] = Field(default_factory=list) + list_object_rules: list[ListObjectRule] = Field(default_factory=list) + + def list_object_rule_map(self) -> dict[str, ListObjectRule]: + """Return list-object rules keyed by list key.""" + return {rule.list_key: rule for rule in self.list_object_rules} class NormalizationRuleset(BaseModel): diff --git a/benchmark/src/bench/eval/score.py b/benchmark/src/bench/eval/score.py index 4c81119..277bda6 100644 --- a/benchmark/src/bench/eval/score.py +++ b/benchmark/src/bench/eval/score.py @@ -6,6 +6,7 @@ from .exact_match import canonical, exact_match from .normalization_rules import ( + ListObjectRule, NormalizationRules, RuleIndex, build_rule_index, @@ -75,6 +76,142 @@ def _expand_pred_item(value: Any, index: RuleIndex) -> list[str]: return [text] +_SPLIT_PATTERN = re.compile(r"[、,,・//]+") + + +def _coerce_list(value: Any) -> list[str]: + if isinstance(value, list): + return [str(v) for v in value if v is not None] + if isinstance(value, str): + return [value] + return [] + + +def _normalize_text_field( + value: Any, index: RuleIndex, *, prefix_pattern: str | None +) -> str: + text = str(value) + if prefix_pattern: + text = re.sub(prefix_pattern, "", text).strip() + return _normalize_scalar_with_rules(text, index) + + +def _normalize_list_field( + value: Any, index: RuleIndex, rule: ListObjectRule, field_name: str +) -> list[str]: + items = _coerce_list(value) + if len(items) == 1 and field_name in rule.list_fields_contains: + items = [t for t in _SPLIT_PATTERN.split(items[0]) if t.strip()] + return [_normalize_scalar_with_rules(v, index) for v in items if str(v).strip()] + + +def _object_matches( + truth_obj: dict[str, Any], + pred_obj: dict[str, Any], + rule: ListObjectRule, + index: RuleIndex, +) -> bool: + for field in rule.string_fields: + if field not in truth_obj or field not in pred_obj: + return False + t_val = _normalize_text_field( + truth_obj[field], index, prefix_pattern=rule.strip_prefix.get(field) + ) 
+ p_val = _normalize_text_field( + pred_obj[field], index, prefix_pattern=rule.strip_prefix.get(field) + ) + if t_val != p_val: + return False + + for field in rule.string_fields_contains: + if field not in truth_obj or field not in pred_obj: + return False + t_val = _normalize_text_field( + truth_obj[field], index, prefix_pattern=rule.strip_prefix.get(field) + ) + p_val = _normalize_text_field( + pred_obj[field], index, prefix_pattern=rule.strip_prefix.get(field) + ) + if t_val not in p_val and p_val not in t_val: + return False + + for field in rule.list_fields: + if field not in truth_obj or field not in pred_obj: + return False + t_list = _normalize_list_field(truth_obj[field], index, rule, field) + p_list = _normalize_list_field(pred_obj[field], index, rule, field) + if set(t_list) != set(p_list): + return False + + for field in rule.list_fields_contains: + if field not in truth_obj or field not in pred_obj: + return False + t_list = _normalize_list_field(truth_obj[field], index, rule, field) + p_list = _normalize_list_field(pred_obj[field], index, rule, field) + if t_list and not p_list: + return False + combined = " ".join(p_list) + for t_val in t_list: + if t_val not in combined: + return False + + return True + + +def _lcs_length_objects( + a: list[dict[str, Any]], + b: list[dict[str, Any]], + *, + rule: ListObjectRule, + index: RuleIndex, +) -> int: + if not a or not b: + return 0 + dp = [0] * (len(b) + 1) + for i in range(1, len(a) + 1): + prev = 0 + for j in range(1, len(b) + 1): + temp = dp[j] + if _object_matches(a[i - 1], b[j - 1], rule, index): + dp[j] = prev + 1 + else: + dp[j] = max(dp[j], dp[j - 1]) + prev = temp + return dp[-1] + + +def _list_score_objects_normalized( + truth_list: list[Any], + pred_list: Any, + *, + rule: ListObjectRule, + index: RuleIndex, + ordered: bool, +) -> float: + if not isinstance(pred_list, list): + return 0.0 + if not truth_list: + return 0.0 + truth_objs = [t for t in truth_list if isinstance(t, dict)] + pred_objs = [p for p in pred_list if isinstance(p, dict)] + if not truth_objs: + return 0.0 + if ordered: + lcs_len = _lcs_length_objects(truth_objs, pred_objs, rule=rule, index=index) + return lcs_len / len(truth_objs) + matched = 0 + used: set[int] = set() + for t in truth_objs: + for i, p in enumerate(pred_objs): + if i in used: + continue + if _object_matches(t, p, rule, index): + matched += 1 + used.add(i) + break + return matched / len(truth_objs) + + def _list_score_ordered(truth_list: list[Any], pred_list: Any) -> float: """Compute an order-aware partial match score for lists. 
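
A small sketch of what the list-object rules above buy during scoring (toy payloads; assumes the benchmark package is importable):

```python
# Sketch: strip_prefix lets numbered step names ("1. 発見") match their
# unnumbered ground-truth counterparts. Toy payloads only.
from bench.eval.normalization_rules import ListObjectRule, NormalizationRules
from bench.eval.score import key_score, key_score_normalized

rules = NormalizationRules(
    list_object_rules=[
        ListObjectRule(
            list_key="steps",
            string_fields=["step_name"],
            strip_prefix={"step_name": r"^[0-9]+\.\s*"},
        )
    ]
)

truth = {"steps": [{"step_name": "発見"}, {"step_name": "報告"}]}
pred = {"steps": [{"step_name": "1. 発見"}, {"step_name": "2. 報告"}]}

print(key_score(truth, pred))                    # 0.0: numbering breaks exact match
print(key_score_normalized(truth, pred, rules))  # 1.0: prefixes are stripped first
```
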
@@ -183,7 +320,10 @@ def _dict_score_ordered(truth_dict: dict[str, Any], pred_dict: dict[str, Any]) - def _dict_score_normalized( - truth_dict: dict[str, Any], pred_dict: dict[str, Any], index: RuleIndex + truth_dict: dict[str, Any], + pred_dict: dict[str, Any], + index: RuleIndex, + list_object_rules: dict[str, ListObjectRule], ) -> float: """Compute a key-level score for nested dicts with normalization rules.""" total = len(truth_dict) @@ -194,12 +334,27 @@ def _dict_score_normalized( if key not in pred_dict: continue pred_val = pred_dict[key] - score_sum += _value_score_normalized(truth_val, pred_val, index, ordered=False) + rule = list_object_rules.get(key) + if rule and isinstance(truth_val, list) and isinstance(pred_val, list): + score_sum += _list_score_objects_normalized( + truth_val, pred_val, rule=rule, index=index, ordered=False + ) + continue + score_sum += _value_score_normalized( + truth_val, + pred_val, + index, + ordered=False, + list_object_rules=list_object_rules, + ) return score_sum / total def _dict_score_ordered_normalized( - truth_dict: dict[str, Any], pred_dict: dict[str, Any], index: RuleIndex + truth_dict: dict[str, Any], + pred_dict: dict[str, Any], + index: RuleIndex, + list_object_rules: dict[str, ListObjectRule], ) -> float: """Compute a key-level score with normalized, order-aware list scoring.""" total = len(truth_dict) @@ -210,7 +365,19 @@ def _dict_score_ordered_normalized( if key not in pred_dict: continue pred_val = pred_dict[key] - score_sum += _value_score_normalized(truth_val, pred_val, index, ordered=True) + rule = list_object_rules.get(key) + if rule and isinstance(truth_val, list) and isinstance(pred_val, list): + score_sum += _list_score_objects_normalized( + truth_val, pred_val, rule=rule, index=index, ordered=True + ) + continue + score_sum += _value_score_normalized( + truth_val, + pred_val, + index, + ordered=True, + list_object_rules=list_object_rules, + ) return score_sum / total @@ -226,16 +393,21 @@ def _value_score(truth: Any, pred: Any, *, ordered: bool) -> float: def _value_score_normalized( - truth: Any, pred: Any, index: RuleIndex, *, ordered: bool + truth: Any, + pred: Any, + index: RuleIndex, + *, + ordered: bool, + list_object_rules: dict[str, ListObjectRule], ) -> float: """Score a value using normalization rules.""" if isinstance(truth, dict): if not isinstance(pred, dict): return 0.0 return ( - _dict_score_ordered_normalized(truth, pred, index) + _dict_score_ordered_normalized(truth, pred, index, list_object_rules) if ordered - else _dict_score_normalized(truth, pred, index) + else _dict_score_normalized(truth, pred, index, list_object_rules) ) if isinstance(truth, list): return ( @@ -283,10 +455,11 @@ def key_score_ordered(truth: Any, pred: Any) -> float: def key_score_normalized(truth: Any, pred: Any, rules: NormalizationRules) -> float: """Compute a normalized score using optional rules.""" index = build_rule_index(rules) + list_object_rules = rules.list_object_rule_map() if isinstance(truth, dict): if not isinstance(pred, dict): return 0.0 - return _dict_score_normalized(truth, pred, index) + return _dict_score_normalized(truth, pred, index, list_object_rules) if isinstance(truth, list): return _list_score_normalized(truth, pred, index) truth_norm = _normalize_scalar_with_rules(truth, index) @@ -299,10 +472,11 @@ def key_score_ordered_normalized( ) -> float: """Compute an order-aware normalized score using optional rules.""" index = build_rule_index(rules) + list_object_rules = rules.list_object_rule_map() if 
isinstance(truth, dict): if not isinstance(pred, dict): return 0.0 - return _dict_score_ordered_normalized(truth, pred, index) + return _dict_score_ordered_normalized(truth, pred, index, list_object_rules) if isinstance(truth, list): return _list_score_ordered_normalized(truth, pred, index) truth_norm = _normalize_scalar_with_rules(truth, index) From 6681d8494eaefd6ab57f11410ab308e3e35b892c Mon Sep 17 00:00:00 2001 From: harumiWeb Date: Sat, 24 Jan 2026 23:24:30 +0900 Subject: [PATCH 24/38] feat: Add raw evaluation metrics and update README for new evaluation tracks --- benchmark/README.md | 10 +- benchmark/data/manifest.json | 24 ++++ benchmark/data/truth/basic_form_01.json | 34 ++++++ benchmark/data/truth/flowchart_02.json | 87 ++++++++++++++ benchmark/src/bench/cli.py | 29 ++++- benchmark/src/bench/eval/raw_match.py | 148 ++++++++++++++++++++++++ 6 files changed, 328 insertions(+), 4 deletions(-) create mode 100644 benchmark/data/truth/basic_form_01.json create mode 100644 benchmark/data/truth/flowchart_02.json create mode 100644 benchmark/src/bench/eval/raw_match.py diff --git a/benchmark/README.md b/benchmark/README.md index 8b077a2..272991a 100644 --- a/benchmark/README.md +++ b/benchmark/README.md @@ -80,7 +80,7 @@ To ensure reproducibility and fair comparison, follow these fixed settings: - Prompt: fixed in `bench/llm/openai_client.py` - Input contexts: generated by `bench.cli extract` using the same sources for all methods - Normalization: optional normalized track uses `data/normalization_rules.json` -- Evaluation: `bench.cli eval` produces both Exact and Normalized scores +- Evaluation: `bench.cli eval` produces Exact, Normalized, and Raw scores - Report: `bench.cli report` generates `report.md` and per-case detailed reports Recommended disclosure when publishing results: @@ -92,26 +92,30 @@ Recommended disclosure when publishing results: ## How to interpret results (public guide) -This benchmark reports two evaluation tracks to keep comparisons fair: +This benchmark reports three evaluation tracks to keep comparisons fair: - Exact: strict string match with no normalization. - Normalized: applies case-specific rules in `data/normalization_rules.json` to absorb formatting differences (aliases, split/composite labels). +- Raw: loose coverage/precision over flattened text tokens (schema-agnostic), + intended to reflect raw data capture without penalizing minor label variations. Recommended interpretation: - Use **Exact** to compare end-to-end string fidelity (best for literal extraction). - Use **Normalized** to compare **document understanding** across methods. +- Use **Raw** to compare how much ground-truth text is captured regardless of schema. - When methods disagree between tracks, favor Normalized for Excel-heavy layouts where labels are split/merged or phrased differently. - Always cite both accuracy and cost metrics when presenting results publicly. ## Evaluation -The evaluator now writes two tracks: +The evaluator now writes three tracks: - Exact: `score`, `score_ordered` (strict string match, current behavior) - Normalized: `score_norm`, `score_norm_ordered` (applies case-specific rules) +- Raw: `score_raw`, `score_raw_precision` (loose coverage/precision) Normalization rules live in `data/normalization_rules.json` and are applied in `bench.cli eval`. 
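
To see the Exact and Normalized tracks diverge, here is a minimal sketch scoring a prediction that merges two deduction labels (toy payloads; run from the `benchmark/` directory so the rules file resolves):

```python
# Sketch: the split rule for "勤労学生、障害者控除" credits both truth labels
# on the Normalized track, while the Exact track scores the merge as a miss.
from pathlib import Path

from bench.eval.normalization_rules import load_ruleset
from bench.eval.score import key_score, key_score_normalized

rules = load_ruleset(Path("data/normalization_rules.json")).for_case("tax_report_01")

truth = {"deduction_items": ["勤労学生控除", "障害者控除"]}
pred = {"deduction_items": ["勤労学生、障害者控除"]}  # merged label

print(key_score(truth, pred))                    # 0.0 on the Exact track
print(key_score_normalized(truth, pred, rules))  # 1.0 once the split rule applies
```
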
Publish these rules alongside the benchmark to keep the diff --git a/benchmark/data/manifest.json b/benchmark/data/manifest.json index e37cb26..92a6724 100644 --- a/benchmark/data/manifest.json +++ b/benchmark/data/manifest.json @@ -101,6 +101,30 @@ "dpi": 200, "max_pages": 1 } + }, + { + "id": "basic_form_01", + "type": "application_form", + "xlsx": "data/raw/basic_form_01.xlsx", + "question": "このExcel申請書に記載されている入力項目を、意味的なブロック単位で整理してください。申請者本人に関する項目、配偶者に関する項目、収入等に関する申告、預貯金等に関する申告の4分類に分け、それぞれに含まれる項目名を配列でまとめたJSONを、次の形式のみで返してください。\n\n{\n \"applicant\": [],\n \"spouse\": [],\n \"income_declaration\": [],\n \"asset_declaration\": []\n}", + "truth": "data/truth/basic_form_01.json", + "sheet_scope": null, + "render": { + "dpi": 200, + "max_pages": 1 + } + }, + { + "id": "flowchart_02", + "type": "flowchart", + "xlsx": "data/raw/flowchart_02.xlsx", + "question": "このExcelに記載されているログイン処理フローについて、工程を上から順に整理してください。各工程について、step_name(工程名)、step_type(start|process|decision|end)、next_steps(条件付き遷移を含む次工程)を含むJSONを、次の形式のみで返してください。\n\n{\n \"steps\": [\n {\n \"step_name\": \"...\",\n \"step_type\": \"start|process|decision|end\",\n \"next_steps\": [\n {\n \"condition\": \"...\",\n \"next\": \"...\"\n }\n ]\n }\n ]\n}", + "truth": "data/truth/flowchart_02.json", + "sheet_scope": null, + "render": { + "dpi": 200, + "max_pages": 1 + } } ] } diff --git a/benchmark/data/truth/basic_form_01.json b/benchmark/data/truth/basic_form_01.json new file mode 100644 index 0000000..cd8ca3b --- /dev/null +++ b/benchmark/data/truth/basic_form_01.json @@ -0,0 +1,34 @@ +{ + "applicant": [ + "フリガナ", + "被保険者氏名", + "生年月日", + "住所", + "連絡先", + "入所(院)した介護保険施設の所在地及び名称", + "入所(院)年月日" + ], + "spouse": [ + "配偶者の有無", + "配偶者氏名", + "配偶者生年月日", + "配偶者個人番号", + "配偶者住所", + "配偶者連絡先", + "本年1月1日現在の住所", + "課税状況" + ], + "income_declaration": [ + "生活保護受給者に該当するか", + "市町村民税非課税世帯であるか", + "課税年金収入額", + "その他の合計所得金額", + "年金の種類に関する申告" + ], + "asset_declaration": [ + "預貯金額", + "有価証券の金額", + "その他の資産額", + "配偶者の預貯金等を含むかどうか" + ] +} diff --git a/benchmark/data/truth/flowchart_02.json b/benchmark/data/truth/flowchart_02.json new file mode 100644 index 0000000..868d040 --- /dev/null +++ b/benchmark/data/truth/flowchart_02.json @@ -0,0 +1,87 @@ +{ + "steps": [ + { + "step_name": "ログイン画面", + "step_type": "start", + "next_steps": [ + { + "condition": "always", + "next": "登録情報を入力" + } + ] + }, + { + "step_name": "登録情報を入力", + "step_type": "process", + "next_steps": [ + { + "condition": "always", + "next": "入力内容は正しいか" + } + ] + }, + { + "step_name": "入力内容は正しいか", + "step_type": "decision", + "next_steps": [ + { + "condition": "はい", + "next": "サーバーに認証リクエストを送信" + }, + { + "condition": "いいえ", + "next": "再入力を提示" + } + ] + }, + { + "step_name": "再入力を提示", + "step_type": "process", + "next_steps": [ + { + "condition": "always", + "next": "登録情報を入力" + } + ] + }, + { + "step_name": "サーバーに認証リクエストを送信", + "step_type": "process", + "next_steps": [ + { + "condition": "always", + "next": "認証に成功か" + } + ] + }, + { + "step_name": "認証に成功か", + "step_type": "decision", + "next_steps": [ + { + "condition": "はい", + "next": "操作画面に遷移" + }, + { + "condition": "いいえ", + "next": "認証エラーを提示" + } + ] + }, + { + "step_name": "認証エラーを提示", + "step_type": "process", + "next_steps": [ + { + "condition": "always", + "next": "再入力を提示" + } + ] + }, + { + "step_name": "操作画面に遷移", + "step_type": "end", + "next_steps": [] + } + ] +} diff --git a/benchmark/src/bench/cli.py b/benchmark/src/bench/cli.py index f2d43bf..41dd944 100644 --- a/benchmark/src/bench/cli.py +++ b/benchmark/src/bench/cli.py @@ -12,6 +12,7 @@ from 
.eval.normalize import normalize_json_text from .eval.normalization_rules import load_ruleset from .eval.report import write_results_csv +from .eval.raw_match import raw_coverage_score, raw_precision_score from .eval.score import ( key_score, key_score_normalized, @@ -80,6 +81,8 @@ class ResultRow(BaseModel): score_ordered: float score_norm: float | None = None score_norm_ordered: float | None = None + score_raw: float | None = None + score_raw_precision: float | None = None ok: bool input_tokens: int output_tokens: int @@ -386,6 +389,8 @@ def eval(case: str = "all", method: str = "all") -> None: score_ordered = 0.0 score_norm: float | None = None score_norm_ordered: float | None = None + score_raw: float | None = None + score_raw_precision: float | None = None err: str | None = None try: pred_obj = normalize_json_text(rec["text"]) @@ -395,6 +400,8 @@ def eval(case: str = "all", method: str = "all") -> None: score_norm_ordered = key_score_ordered_normalized( truth, pred_obj, rules ) + score_raw = raw_coverage_score(truth, pred_obj) + score_raw_precision = raw_precision_score(truth, pred_obj) ok = score == 1.0 except Exception as exc: err = str(exc) @@ -409,6 +416,8 @@ def eval(case: str = "all", method: str = "all") -> None: score_ordered=score_ordered, score_norm=score_norm, score_norm_ordered=score_norm_ordered, + score_raw=score_raw, + score_raw_precision=score_raw_precision, ok=ok, input_tokens=int(rec.get("input_tokens", 0)), output_tokens=int(rec.get("output_tokens", 0)), @@ -445,6 +454,10 @@ def report() -> None: agg["acc_norm"] = ("score_norm", "mean") if "score_norm_ordered" in df.columns: agg["acc_norm_ordered"] = ("score_norm_ordered", "mean") + if "score_raw" in df.columns: + agg["acc_raw"] = ("score_raw", "mean") + if "score_raw_precision" in df.columns: + agg["raw_precision"] = ("score_raw_precision", "mean") g = df.groupby("method").agg(**agg).reset_index() detail_dir = RESULTS_DIR / "detailed_reports" @@ -472,7 +485,7 @@ def report() -> None: md_lines.append("- Prompt: fixed in bench/llm/openai_client.py") md_lines.append("- Input contexts: generated by bench.cli extract") md_lines.append("- Normalization: data/normalization_rules.json (optional track)") - md_lines.append("- Evaluation: bench.cli eval (Exact + Normalized)") + md_lines.append("- Evaluation: bench.cli eval (Exact + Normalized + Raw)") md_lines.append("- Report: bench.cli report (summary + per-case)") md_lines.append("") md_lines.append("Recommended disclosure when publishing results:") @@ -491,6 +504,9 @@ def report() -> None: md_lines.append( " absorb formatting differences (aliases, split/composite labels)." ) + md_lines.append( + "- Raw: loose coverage/precision over flattened text tokens (schema-agnostic)." + ) md_lines.append("") md_lines.append("Recommended interpretation:") md_lines.append("") @@ -500,6 +516,9 @@ def report() -> None: md_lines.append( "- Use Normalized to compare document understanding across methods." ) + md_lines.append( + "- Use Raw to compare how much ground-truth text is captured regardless of schema." + ) md_lines.append( "- When tracks disagree, favor Normalized for Excel-heavy layouts where labels" ) @@ -517,6 +536,12 @@ def report() -> None: md_lines.append( " defined in data/normalization_rules.json to absorb format and wording variations." ) + md_lines.append( + "- Raw: loose coverage/precision over flattened text tokens (schema-agnostic)," + ) + md_lines.append( + " intended to reflect raw data capture without penalizing minor label variations." 
+    )
    md_lines.append("")
    md_lines.append("## Summary by method")
    md_lines.append("")
    md_lines.append(g.to_markdown(index=False))
@@ -570,6 +595,8 @@ def report() -> None:
        "score_ordered",
        "score_norm",
        "score_norm_ordered",
+        "score_raw",
+        "score_raw_precision",
        "input_tokens",
        "output_tokens",
        "cost_usd",
diff --git a/benchmark/src/bench/eval/raw_match.py b/benchmark/src/bench/eval/raw_match.py
new file mode 100644
index 0000000..b8d612b
--- /dev/null
+++ b/benchmark/src/bench/eval/raw_match.py
@@ -0,0 +1,151 @@
+from __future__ import annotations
+
+import re
+import unicodedata
+from typing import Any
+
+_WS_PATTERN = re.compile(r"\s+")
+_NUMERIC_PATTERN = re.compile(r"[+-]?\d+(?:[.,]\d+)?")
+
+
+def _normalize_raw_text(text: str) -> str:
+    """Normalize text for raw coverage/precision matching.
+
+    Args:
+        text: Input string.
+
+    Returns:
+        Normalized string with width normalized, whitespace removed, and
+        reference marks dropped.
+    """
+    normalized = unicodedata.normalize("NFKC", text)
+    # Drop "※" reference marks; "窶サ" is the CP932 mojibake form of "※"
+    # that occasionally survives in text extracted from PDFs.
+    normalized = normalized.replace("※", "").replace("窶サ", "")
+    normalized = _WS_PATTERN.sub("", normalized)
+    return normalized.strip()
+
+
+def _is_numeric_token(text: str) -> bool:
+    """Return True if the text looks like a numeric token.
+
+    Args:
+        text: Token to check.
+
+    Returns:
+        True if the token matches a numeric pattern.
+    """
+    return _NUMERIC_PATTERN.fullmatch(text) is not None
+
+
+def _flatten_scalars(
+    value: Any, *, depth: int = 0, parent_is_list: bool = False
+) -> list[str]:
+    """Flatten nested payloads into a list of scalar strings.
+
+    Keys are included for nested dicts that are not record-like (dicts inside lists)
+    to capture table headers or row labels without pulling schema field names.
+
+    Args:
+        value: Arbitrary JSON-like value.
+        depth: Current nesting depth.
+        parent_is_list: Whether the parent container is a list.
+
+    Returns:
+        List of stringified scalar values (and selected keys).
+    """
+    if value is None:
+        return []
+    if isinstance(value, dict):
+        items: list[str] = []
+        if depth > 0 and not parent_is_list:
+            items.extend([str(k) for k in value.keys()])
+        for v in value.values():
+            items.extend(_flatten_scalars(v, depth=depth + 1, parent_is_list=False))
+        return items
+    if isinstance(value, list):
+        items: list[str] = []
+        for v in value:
+            items.extend(_flatten_scalars(v, depth=depth + 1, parent_is_list=True))
+        return items
+    return [str(value)]
+
+
+def _dedupe_normalized(values: list[str]) -> list[str]:
+    """Normalize and de-duplicate text values, dropping empty tokens.
+
+    Args:
+        values: List of raw string values.
+
+    Returns:
+        De-duplicated list of normalized tokens.
+    """
+    seen: set[str] = set()
+    normalized: list[str] = []
+    for value in values:
+        token = _normalize_raw_text(value)
+        if not token:
+            continue
+        if token not in seen:
+            seen.add(token)
+            normalized.append(token)
+    return normalized
+
+
+def _raw_match_token(truth_token: str, pred_token: str) -> bool:
+    """Return True if tokens match under loose raw-data matching rules.
+
+    Args:
+        truth_token: Normalized truth token.
+        pred_token: Normalized prediction token.
+
+    Returns:
+        True if tokens are considered a match.
+    """
+    if not truth_token or not pred_token:
+        return False
+    if _is_numeric_token(truth_token) or len(truth_token) == 1:
+        return truth_token == pred_token
+    return truth_token in pred_token or pred_token in truth_token
+
+
+def raw_coverage_score(truth: Any, pred: Any) -> float:
+    """Compute loose coverage of truth tokens in predictions.
+
+    Args:
+        truth: Ground-truth JSON payload.
+        pred: Predicted JSON payload.
+
+    Returns:
+        Coverage in [0, 1].
+
+ """ + truth_tokens = _dedupe_normalized(_flatten_scalars(truth)) + pred_tokens = _dedupe_normalized(_flatten_scalars(pred)) + if not truth_tokens: + return 0.0 + matched = 0 + for t in truth_tokens: + if any(_raw_match_token(t, p) for p in pred_tokens): + matched += 1 + return matched / len(truth_tokens) + + +def raw_precision_score(truth: Any, pred: Any) -> float: + """Compute loose precision of prediction tokens against truth. + + Args: + truth: Ground-truth JSON payload. + pred: Predicted JSON payload. + + Returns: + Precision in [0, 1]. + """ + truth_tokens = _dedupe_normalized(_flatten_scalars(truth)) + pred_tokens = _dedupe_normalized(_flatten_scalars(pred)) + if not pred_tokens: + return 0.0 + matched = 0 + for p in pred_tokens: + if any(_raw_match_token(t, p) for t in truth_tokens): + matched += 1 + return matched / len(pred_tokens) From 8fec6f5d32d4c880cbc4ce5ef6fbb6a9b65aeddd Mon Sep 17 00:00:00 2001 From: harumiWeb Date: Sat, 24 Jan 2026 23:42:59 +0900 Subject: [PATCH 25/38] fix: Format JSON structure for better readability and consistency --- benchmark/data/normalization_rules.json | 53 +++++++++++++++++++++---- 1 file changed, 45 insertions(+), 8 deletions(-) diff --git a/benchmark/data/normalization_rules.json b/benchmark/data/normalization_rules.json index b20c455..30b84d1 100644 --- a/benchmark/data/normalization_rules.json +++ b/benchmark/data/normalization_rules.json @@ -7,7 +7,9 @@ "list_object_rules": [ { "list_key": "steps", - "string_fields": ["step_name"], + "string_fields": [ + "step_name" + ], "string_fields_contains": [], "list_fields_contains": [], "strip_prefix": { @@ -23,7 +25,9 @@ "list_object_rules": [ { "list_key": "steps", - "string_fields": ["step_name"], + "string_fields": [ + "step_name" + ], "string_fields_contains": [], "list_fields_contains": [], "strip_prefix": {} @@ -46,29 +50,62 @@ "alias_rules": [ { "canonical": "合計", - "aliases": ["⑬から㉕までの計", "13から25までの計"] + "aliases": [ + "⑬から㉕までの計", + "13から25までの計" + ] } ], "split_rules": [ { "trigger": "勤労学生、障害者控除", - "parts": ["勤労学生控除", "障害者控除"] + "parts": [ + "勤労学生控除", + "障害者控除" + ] }, { "trigger": "勤労学生・障害者控除", - "parts": ["勤労学生控除", "障害者控除"] + "parts": [ + "勤労学生控除", + "障害者控除" + ] } ], "composite_rules": [ { "canonical": "事業(営業等)", - "parts": ["事業", "営業等"] + "parts": [ + "事業", + "営業等" + ] }, { "canonical": "事業(農業)", - "parts": ["事業", "農業"] + "parts": [ + "事業", + "農業" + ] + } + ] + }, + "flowchart_02": { + "alias_rules": [], + "split_rules": [], + "composite_rules": [], + "list_object_rules": [ + { + "list_key": "steps", + "string_fields": [ + "step_type" + ], + "string_fields_contains": [ + "step_name" + ], + "list_fields_contains": [], + "strip_prefix": {} } ] } } -} +} \ No newline at end of file From 417da5731ad4c347a3a089d3db6dff7d5b844392 Mon Sep 17 00:00:00 2001 From: harumiWeb Date: Sun, 25 Jan 2026 12:09:07 +0900 Subject: [PATCH 26/38] feat: Add Markdown conversion functionality and evaluation metrics for Markdown outputs --- benchmark/README.md | 28 ++- benchmark/data/normalization_rules.json | 9 +- benchmark/src/bench/cli.py | 201 +++++++++++++++++++- benchmark/src/bench/eval/markdown_render.py | 128 +++++++++++++ benchmark/src/bench/eval/markdown_score.py | 96 ++++++++++ benchmark/src/bench/llm/openai_client.py | 49 +++++ benchmark/src/bench/paths.py | 2 + 7 files changed, 506 insertions(+), 7 deletions(-) create mode 100644 benchmark/src/bench/eval/markdown_render.py create mode 100644 benchmark/src/bench/eval/markdown_score.py diff --git a/benchmark/README.md b/benchmark/README.md index 
272991a..3378a07 100644
--- a/benchmark/README.md
+++ b/benchmark/README.md
@@ -68,9 +68,28 @@ Outputs:
 - outputs/extracted/\* : extracted context (text or images)
 - outputs/prompts/\*.jsonl
 - outputs/responses/\*.jsonl
+- outputs/markdown/\*/\*.md
+- outputs/markdown/responses/\*.jsonl
 - outputs/results/results.csv
 - outputs/results/report.md
 
+## Markdown conversion (optional)
+
+Generate Markdown from the latest JSON responses:
+
+```bash
+python -m bench.cli markdown --case all --method all
+```
+
+Markdown scores (`score_md`, `score_md_precision`) are only computed when
+Markdown outputs exist under `outputs/markdown/responses/`.
+
+If you want a deterministic renderer without LLM calls:
+
+```bash
+python -m bench.cli markdown --case all --method all --no-use-llm
+```
+
 ## Evaluation protocol (public)
 
 To ensure reproducibility and fair comparison, follow these fixed settings:
@@ -80,7 +99,7 @@ To ensure reproducibility and fair comparison, follow these fixed settings:
 - Prompt: fixed in `bench/llm/openai_client.py`
 - Input contexts: generated by `bench.cli extract` using the same sources for all methods
 - Normalization: optional normalized track uses `data/normalization_rules.json`
-- Evaluation: `bench.cli eval` produces Exact, Normalized, and Raw scores
+- Evaluation: `bench.cli eval` produces Exact, Normalized, Raw, and Markdown scores
 - Report: `bench.cli report` generates `report.md` and per-case detailed reports
 
 Recommended disclosure when publishing results:
@@ -92,30 +111,33 @@ Recommended disclosure when publishing results:
 
 ## How to interpret results (public guide)
 
-This benchmark reports three evaluation tracks to keep comparisons fair:
+This benchmark reports four evaluation tracks to keep comparisons fair:
 
 - Exact: strict string match with no normalization.
 - Normalized: applies case-specific rules in `data/normalization_rules.json` to
   absorb formatting differences (aliases, split/composite labels).
 - Raw: loose coverage/precision over flattened text tokens (schema-agnostic),
   intended to reflect raw data capture without penalizing minor label variations.
+- Markdown: coverage/precision against canonical Markdown rendered from truth.
 
 Recommended interpretation:
 
 - Use **Exact** to compare end-to-end string fidelity (best for literal extraction).
 - Use **Normalized** to compare **document understanding** across methods.
 - Use **Raw** to compare how much ground-truth text is captured regardless of schema.
+- Use **Markdown** to evaluate JSON-to-Markdown conversion quality.
 - When methods disagree between tracks, favor Normalized for Excel-heavy layouts
   where labels are split/merged or phrased differently.
 - Always cite both accuracy and cost metrics when presenting results publicly.
 
 ## Evaluation
 
-The evaluator now writes three tracks:
+The evaluator now writes four tracks:
 
 - Exact: `score`, `score_ordered` (strict string match, current behavior)
 - Normalized: `score_norm`, `score_norm_ordered` (applies case-specific rules)
 - Raw: `score_raw`, `score_raw_precision` (loose coverage/precision)
+- Markdown: `score_md`, `score_md_precision` (Markdown coverage/precision)
 
 Normalization rules live in `data/normalization_rules.json` and are applied in
 `bench.cli eval`.
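+
+As a concrete illustration of how the Raw track differs from Exact, here is a
+minimal sketch of the loose matcher in `src/bench/eval/raw_match.py` scoring a
+schema-mismatched but mostly correct prediction (the payloads are hypothetical,
+not benchmark cases):
+
+```python
+from bench.eval.raw_match import raw_coverage_score, raw_precision_score
+
+# Truth and prediction use different schemas; only flattened scalar text is compared.
+truth = {"steps": [{"step_name": "要件抽出"}, {"step_name": "ヒアリング"}]}
+pred = {"items": ["要件抽出(初回)", "ヒアリング", "未記載の項目"]}
+
+# Non-numeric, multi-character tokens match by substring containment after NFKC
+# width folding, so "要件抽出" matches "要件抽出(初回)".
+print(raw_coverage_score(truth, pred))   # 1.0 (every truth token is covered)
+print(raw_precision_score(truth, pred))  # ~0.667 (one pred token matches nothing)
+```
+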
Publish these rules alongside the benchmark to keep the diff --git a/benchmark/data/normalization_rules.json b/benchmark/data/normalization_rules.json index 30b84d1..0590160 100644 --- a/benchmark/data/normalization_rules.json +++ b/benchmark/data/normalization_rules.json @@ -19,7 +19,14 @@ ] }, "workflow_01": { - "alias_rules": [], + "alias_rules": [ + { + "canonical": "キャンセル", + "aliases": [ + "買わない" + ] + } + ], "split_rules": [], "composite_rules": [], "list_object_rules": [ diff --git a/benchmark/src/bench/cli.py b/benchmark/src/bench/cli.py index 41dd944..dc0d014 100644 --- a/benchmark/src/bench/cli.py +++ b/benchmark/src/bench/cli.py @@ -9,6 +9,8 @@ from rich.console import Console import typer +from .eval.markdown_render import render_markdown +from .eval.markdown_score import markdown_coverage_score, markdown_precision_score from .eval.normalize import normalize_json_text from .eval.normalization_rules import load_ruleset from .eval.report import write_results_csv @@ -24,6 +26,8 @@ from .paths import ( DATA_DIR, EXTRACTED_DIR, + MARKDOWN_DIR, + MARKDOWN_RESPONSES_DIR, PROMPTS_DIR, RESPONSES_DIR, RESULTS_DIR, @@ -70,6 +74,21 @@ class ResponseRecord(BaseModel): raw: dict[str, Any] +class MarkdownRecord(BaseModel): + """Markdown conversion metadata saved for each request.""" + + case_id: str + method: str + model: str + temperature: float + prompt_hash: str + text: str + input_tokens: int + output_tokens: int + cost_usd: float + raw: dict[str, Any] + + class ResultRow(BaseModel): """Evaluation row for CSV output.""" @@ -83,6 +102,8 @@ class ResultRow(BaseModel): score_norm_ordered: float | None = None score_raw: float | None = None score_raw_precision: float | None = None + score_md: float | None = None + score_md_precision: float | None = None ok: bool input_tokens: int output_tokens: int @@ -167,6 +188,13 @@ def _reset_case_outputs(case_id: str) -> None: path.unlink() +def _reset_markdown_outputs(case_id: str) -> None: + """Delete existing markdown logs for a case.""" + path = MARKDOWN_RESPONSES_DIR / f"{case_id}.jsonl" + if path.exists(): + path.unlink() + + def _dump_jsonl(obj: BaseModel) -> str: """Serialize a record for JSONL output. @@ -256,7 +284,7 @@ def ask( raise typer.BadParameter(f"No cases matched: {case}") methods = _select_methods(method) - client = OpenAIResponsesClient() + client = OpenAIResponsesClient() if use_llm else None ensure_dir(PROMPTS_DIR) ensure_dir(RESPONSES_DIR) total_cost = 0.0 @@ -349,6 +377,98 @@ def ask( print(f"[green]Total cost: ${total_cost:.6f} ({total_calls} call(s))[/green]") +@app.command() +def markdown( + case: str = "all", + method: str = "all", + model: str = "gpt-4o", + temperature: float = 0.0, + use_llm: bool = True, +) -> None: + """Generate Markdown outputs from the latest JSON responses. + + Args: + case: Comma-separated case ids or "all". + method: Comma-separated method names or "all". + model: OpenAI model name for Markdown conversion. + temperature: Sampling temperature for the model. + use_llm: If True, call the LLM for conversion; otherwise use renderer. 
+ """ + mf = load_manifest(_manifest_path()) + cases = _select_cases(mf.cases, case) + if not cases: + raise typer.BadParameter(f"No cases matched: {case}") + methods = _select_methods(method) + + client = OpenAIResponsesClient() + ensure_dir(MARKDOWN_DIR) + ensure_dir(MARKDOWN_RESPONSES_DIR) + total_cost = 0.0 + total_calls = 0 + + for c in cases: + console.rule(f"MARKDOWN {c.id}") + resp_file = RESPONSES_DIR / f"{c.id}.jsonl" + if not resp_file.exists(): + print(f"[yellow]skip: no responses for {c.id}[/yellow]") + continue + _reset_markdown_outputs(c.id) + latest: dict[str, dict[str, Any]] = {} + for line in resp_file.read_text(encoding="utf-8").splitlines(): + rec = json.loads(line) + if rec.get("method") in methods: + latest[rec["method"]] = rec + + case_dir = MARKDOWN_DIR / c.id + ensure_dir(case_dir) + md_file = MARKDOWN_RESPONSES_DIR / f"{c.id}.jsonl" + + for m, rec in latest.items(): + try: + pred_obj = normalize_json_text(rec["text"]) + json_text = json.dumps(pred_obj, ensure_ascii=False) + prompt_hash = sha256_text(json_text) + if use_llm: + if client is None: + raise RuntimeError( + "LLM client unavailable for markdown conversion." + ) + res = client.ask_markdown( + model=model, json_text=json_text, temperature=temperature + ) + md_text = res.text + md_rec = MarkdownRecord( + case_id=c.id, + method=m, + model=model, + temperature=temperature, + prompt_hash=prompt_hash, + text=md_text, + input_tokens=res.input_tokens, + output_tokens=res.output_tokens, + cost_usd=res.cost_usd, + raw=res.raw, + ) + total_cost += res.cost_usd + total_calls += 1 + line = _dump_jsonl(md_rec) + with md_file.open("a", encoding="utf-8") as f: + f.write(line + "\n") + else: + md_text = render_markdown(pred_obj, title=c.id) + + out_md = case_dir / f"{m}.md" + out_md.write_text(md_text, encoding="utf-8") + print(f"[green]{c.id} {m} -> {out_md}[/green]") + except Exception as exc: + print(f"[yellow]skip: markdown {c.id} {m} ({exc})[/yellow]") + + if use_llm: + print( + f"[green]Markdown cost: ${total_cost:.6f} ({total_calls} call(s))[/green]" + ) + + @app.command() def eval(case: str = "all", method: str = "all") -> None: """Evaluate the latest responses and write results CSV. 
@@ -365,6 +485,7 @@ def eval(case: str = "all", method: str = "all") -> None: rows: list[ResultRow] = [] ruleset = load_ruleset(DATA_DIR / "normalization_rules.json") + md_outputs: dict[str, dict[str, dict[str, Any]]] = {} for c in cases: truth_path = _resolve_case_path(c.truth, case_id=c.id, label="truth") @@ -383,6 +504,14 @@ def eval(case: str = "all", method: str = "all") -> None: latest[rec["method"]] = rec rules = ruleset.for_case(c.id) + md_file = MARKDOWN_RESPONSES_DIR / f"{c.id}.jsonl" + if md_file.exists(): + latest_md: dict[str, dict[str, Any]] = {} + for line in md_file.read_text(encoding="utf-8").splitlines(): + rec = json.loads(line) + if rec.get("method") in methods: + latest_md[rec["method"]] = rec + md_outputs[c.id] = latest_md for m, rec in latest.items(): ok = False score = 0.0 @@ -391,6 +520,8 @@ def eval(case: str = "all", method: str = "all") -> None: score_norm_ordered: float | None = None score_raw: float | None = None score_raw_precision: float | None = None + score_md: float | None = None + score_md_precision: float | None = None err: str | None = None try: pred_obj = normalize_json_text(rec["text"]) @@ -402,6 +533,12 @@ def eval(case: str = "all", method: str = "all") -> None: ) score_raw = raw_coverage_score(truth, pred_obj) score_raw_precision = raw_precision_score(truth, pred_obj) + md_truth = render_markdown(truth, title=c.id) + md_rec = md_outputs.get(c.id, {}).get(m) + if md_rec is not None: + md_text = str(md_rec.get("text", "")) + score_md = markdown_coverage_score(md_truth, md_text) + score_md_precision = markdown_precision_score(md_truth, md_text) ok = score == 1.0 except Exception as exc: err = str(exc) @@ -418,6 +555,8 @@ def eval(case: str = "all", method: str = "all") -> None: score_norm_ordered=score_norm_ordered, score_raw=score_raw, score_raw_precision=score_raw_precision, + score_md=score_md, + score_md_precision=score_md_precision, ok=ok, input_tokens=int(rec.get("input_tokens", 0)), output_tokens=int(rec.get("output_tokens", 0)), @@ -458,6 +597,10 @@ def report() -> None: agg["acc_raw"] = ("score_raw", "mean") if "score_raw_precision" in df.columns: agg["raw_precision"] = ("score_raw_precision", "mean") + if "score_md" in df.columns and df["score_md"].notna().any(): + agg["acc_md"] = ("score_md", "mean") + if "score_md_precision" in df.columns and df["score_md_precision"].notna().any(): + agg["md_precision"] = ("score_md_precision", "mean") g = df.groupby("method").agg(**agg).reset_index() detail_dir = RESULTS_DIR / "detailed_reports" @@ -470,10 +613,10 @@ def report() -> None: "This report summarizes extraction accuracy for each method on the benchmark cases." ) md_lines.append( - "Scores are computed per case and aggregated by method. Both exact and normalized" + "Scores are computed per case and aggregated by method. Exact, normalized, raw," ) md_lines.append( - "tracks are reported to ensure fair comparison across formatting variations." + "and markdown tracks are reported to ensure fair comparison across variations." 
) md_lines.append("") md_lines.append("## Evaluation protocol (public)") @@ -486,6 +629,7 @@ def report() -> None: md_lines.append("- Input contexts: generated by bench.cli extract") md_lines.append("- Normalization: data/normalization_rules.json (optional track)") md_lines.append("- Evaluation: bench.cli eval (Exact + Normalized + Raw)") + md_lines.append("- Markdown conversion: bench.cli markdown (optional)") md_lines.append("- Report: bench.cli report (summary + per-case)") md_lines.append("") md_lines.append("Recommended disclosure when publishing results:") @@ -507,6 +651,9 @@ def report() -> None: md_lines.append( "- Raw: loose coverage/precision over flattened text tokens (schema-agnostic)." ) + md_lines.append( + "- Markdown: coverage/precision against canonical Markdown rendered from truth." + ) md_lines.append("") md_lines.append("Recommended interpretation:") md_lines.append("") @@ -519,6 +666,7 @@ def report() -> None: md_lines.append( "- Use Raw to compare how much ground-truth text is captured regardless of schema." ) + md_lines.append("- Use Markdown to evaluate JSON-to-Markdown conversion quality.") md_lines.append( "- When tracks disagree, favor Normalized for Excel-heavy layouts where labels" ) @@ -542,11 +690,56 @@ def report() -> None: md_lines.append( " intended to reflect raw data capture without penalizing minor label variations." ) + md_lines.append( + "- Markdown: coverage/precision comparing LLM Markdown to canonical truth Markdown." + ) md_lines.append("") md_lines.append("## Summary by method") md_lines.append("") md_lines.append(g.to_markdown(index=False)) md_lines.append("") + md_lines.append("## Exstruct positioning notes (public)") + md_lines.append("") + md_lines.append( + "Recommended primary indicators for exstruct positioning (RAG pre-processing):" + ) + md_lines.append("") + md_lines.append("- Normalized accuracy: acc_norm / acc_norm_ordered") + md_lines.append("- Raw coverage/precision: acc_raw / raw_precision") + md_lines.append("- Markdown coverage/precision: acc_md / md_precision") + md_lines.append("") + md_lines.append("Current deltas vs. 
best method (n=11, when available):") + md_lines.append("") + metric_labels = [ + ("acc_norm", "Normalized accuracy"), + ("acc_norm_ordered", "Normalized ordered accuracy"), + ("acc_raw", "Raw coverage"), + ("raw_precision", "Raw precision"), + ("acc_md", "Markdown coverage"), + ("md_precision", "Markdown precision"), + ] + if "method" in g.columns and not g.empty: + ex_row = g[g["method"] == "exstruct"] + for metric, label in metric_labels: + if metric not in g.columns: + continue + best_val = g[metric].max() + best_methods = g[g[metric] == best_val]["method"].tolist() + if ex_row.empty: + ex_val = None + else: + ex_val = float(ex_row[metric].iloc[0]) + if ex_val is None: + md_lines.append(f"- {label}: exstruct n/a; best {best_val:.6f}") + continue + delta = ex_val - best_val + md_lines.append( + f"- {label}: exstruct {ex_val:.6f} vs best {best_val:.6f}" + f" ({', '.join(best_methods)}), delta {delta:+.6f}" + ) + else: + md_lines.append("- (summary unavailable)") + md_lines.append("") md_lines.append("## Normalization leniency summary") md_lines.append("") ruleset = load_ruleset(DATA_DIR / "normalization_rules.json") @@ -597,6 +790,8 @@ def report() -> None: "score_norm_ordered", "score_raw", "score_raw_precision", + "score_md", + "score_md_precision", "input_tokens", "output_tokens", "cost_usd", diff --git a/benchmark/src/bench/eval/markdown_render.py b/benchmark/src/bench/eval/markdown_render.py new file mode 100644 index 0000000..2883d94 --- /dev/null +++ b/benchmark/src/bench/eval/markdown_render.py @@ -0,0 +1,128 @@ +from __future__ import annotations + +import json +from typing import Any + + +def render_markdown(value: Any, *, title: str | None = None) -> str: + """Render a canonical Markdown representation for JSON-like data. + + Args: + value: JSON-like payload to render. + title: Optional top-level title. + + Returns: + Markdown string representation. + """ + lines: list[str] = [] + if title: + lines.append(f"# {title}") + lines.append("") + _render_value(lines, value, level=2) + return "\n".join(lines).strip() + "\n" + + +def _render_value(lines: list[str], value: Any, *, level: int) -> None: + """Render a value into Markdown lines. + + Args: + lines: List to append output lines to. + value: JSON-like value to render. + level: Heading level to use for dict sections. + """ + if isinstance(value, dict): + _render_dict(lines, value, level=level) + return + if isinstance(value, list): + _render_list(lines, value, level=level) + return + lines.append(str(value)) + + +def _render_dict(lines: list[str], value: dict[str, Any], *, level: int) -> None: + """Render a dict as Markdown sections. + + Args: + lines: List to append output lines to. + value: Dict to render. + level: Heading level for keys. + """ + for key, item in value.items(): + heading = "#" * max(level, 1) + lines.append(f"{heading} {key}") + if isinstance(item, (dict, list)): + _render_value(lines, item, level=level + 1) + else: + lines.append(str(item)) + lines.append("") + + +def _render_list(lines: list[str], value: list[Any], *, level: int) -> None: + """Render a list in Markdown. + + Args: + lines: List to append output lines to. + value: List to render. + level: Heading level for nested dicts if needed. 
+ """ + if not value: + lines.append("- (empty)") + return + if all(isinstance(item, dict) for item in value): + _render_table(lines, value) + lines.append("") + return + for item in value: + if isinstance(item, (dict, list)): + text = _json_string(item) + else: + text = str(item) + lines.append(f"- {text}") + + +def _render_table(lines: list[str], rows: list[Any]) -> None: + """Render a list of dicts as a Markdown table. + + Args: + lines: List to append output lines to. + rows: List of row dicts. + """ + keys: list[str] = [] + for row in rows: + if not isinstance(row, dict): + continue + for key in row.keys(): + if key not in keys: + keys.append(key) + if not keys: + lines.append("- (empty)") + return + header = "| " + " | ".join(keys) + " |" + sep = "| " + " | ".join(["---"] * len(keys)) + " |" + lines.append(header) + lines.append(sep) + for row in rows: + if not isinstance(row, dict): + cells = [_escape_cell(_json_string(row))] + [""] * (len(keys) - 1) + else: + cells = [_escape_cell(_cell_value(row.get(k))) for k in keys] + lines.append("| " + " | ".join(cells) + " |") + + +def _cell_value(value: Any) -> str: + """Convert a table cell value to string.""" + if isinstance(value, (dict, list)): + return _json_string(value) + if value is None: + return "" + return str(value) + + +def _json_string(value: Any) -> str: + """Serialize a value as compact JSON for inline use.""" + return json.dumps(value, ensure_ascii=False, sort_keys=True) + + +def _escape_cell(text: str) -> str: + """Escape pipe characters for Markdown tables.""" + return text.replace("|", "\\|") diff --git a/benchmark/src/bench/eval/markdown_score.py b/benchmark/src/bench/eval/markdown_score.py new file mode 100644 index 0000000..4e165a6 --- /dev/null +++ b/benchmark/src/bench/eval/markdown_score.py @@ -0,0 +1,96 @@ +from __future__ import annotations + +import re +import unicodedata + +_TABLE_SEPARATOR = re.compile(r"^[\s|:-]+$") +_WS_PATTERN = re.compile(r"\s+") +_NUMERIC_PATTERN = re.compile(r"[+-]?\d+(?:[.,]\d+)?") + + +def markdown_coverage_score(truth_md: str, pred_md: str) -> float: + """Compute coverage of truth Markdown lines in prediction. + + Args: + truth_md: Canonical Markdown from truth JSON. + pred_md: Markdown output to evaluate. + + Returns: + Coverage score in [0, 1]. + """ + truth_lines = _normalized_lines(truth_md) + pred_lines = _normalized_lines(pred_md) + if not truth_lines: + return 0.0 + matched = 0 + for t in truth_lines: + if any(_match_line(t, p) for p in pred_lines): + matched += 1 + return matched / len(truth_lines) + + +def markdown_precision_score(truth_md: str, pred_md: str) -> float: + """Compute precision of prediction Markdown lines against truth. + + Args: + truth_md: Canonical Markdown from truth JSON. + pred_md: Markdown output to evaluate. + + Returns: + Precision score in [0, 1]. 
+ """ + truth_lines = _normalized_lines(truth_md) + pred_lines = _normalized_lines(pred_md) + if not pred_lines: + return 0.0 + matched = 0 + for p in pred_lines: + if any(_match_line(t, p) for t in truth_lines): + matched += 1 + return matched / len(pred_lines) + + +def _normalized_lines(markdown: str) -> list[str]: + """Normalize Markdown into comparable text lines.""" + lines: list[str] = [] + for raw in markdown.splitlines(): + if raw.strip().startswith("```"): + continue + norm = _normalize_line(raw) + if not norm: + continue + if _TABLE_SEPARATOR.fullmatch(norm): + continue + lines.append(norm) + return lines + + +def _normalize_line(line: str) -> str: + """Normalize a single Markdown line for matching.""" + text = line.strip() + if not text: + return "" + text = re.sub(r"^\s*#{1,6}\s*", "", text) + text = re.sub(r"^\s*[-*+]\s+", "", text) + text = text.replace("|", " ") + text = text.replace("`", "") + text = text.replace("*", "") + text = text.replace(">", "") + text = unicodedata.normalize("NFKC", text) + text = text.replace("窶サ", "") + text = _WS_PATTERN.sub("", text) + return text.strip() + + +def _match_line(truth_line: str, pred_line: str) -> bool: + """Return True if lines match under loose Markdown rules.""" + if not truth_line or not pred_line: + return False + if _is_numeric_line(truth_line) or len(truth_line) == 1: + return truth_line == pred_line + return truth_line in pred_line or pred_line in truth_line + + +def _is_numeric_line(text: str) -> bool: + """Return True if the text is numeric-only.""" + return _NUMERIC_PATTERN.fullmatch(text) is not None diff --git a/benchmark/src/bench/llm/openai_client.py b/benchmark/src/bench/llm/openai_client.py index a44ce25..da3106d 100644 --- a/benchmark/src/bench/llm/openai_client.py +++ b/benchmark/src/bench/llm/openai_client.py @@ -151,3 +151,52 @@ def ask_images( cost_usd=cost, raw=raw, ) + + def ask_markdown( + self, *, model: str, json_text: str, temperature: float + ) -> LLMResult: + """Call Responses API to convert JSON into Markdown. + + Args: + model: OpenAI model name (e.g., "gpt-4o"). + json_text: JSON payload to convert to Markdown. + temperature: Sampling temperature for the response. + + Returns: + LLMResult containing the model output and usage metadata. + """ + instructions = ( + "You are a strict Markdown formatter. 
Output Markdown only.\n" + "Rules:\n" + "- Use '## ' for top-level keys.\n" + "- For lists of scalars, use bullet lists.\n" + "- For lists of objects, use Markdown tables with columns in key order.\n" + "- For nested objects or lists inside table cells, use compact JSON.\n" + ) + resp = self.client.responses.create( + model=model, + temperature=temperature, + input=[ + { + "role": "user", + "content": [ + {"type": "input_text", "text": instructions}, + {"type": "input_text", "text": f"[JSON]\n{json_text}"}, + ], + } + ], + ) + + text = resp.output_text + usage = getattr(resp, "usage", None) + in_tok, out_tok = _extract_usage_tokens(usage) + cost = estimate_cost_usd(model, in_tok, out_tok) + + raw = json.loads(resp.model_dump_json()) + return LLMResult( + text=text, + input_tokens=in_tok, + output_tokens=out_tok, + cost_usd=cost, + raw=raw, + ) diff --git a/benchmark/src/bench/paths.py b/benchmark/src/bench/paths.py index c8ba71a..d4214be 100644 --- a/benchmark/src/bench/paths.py +++ b/benchmark/src/bench/paths.py @@ -11,6 +11,8 @@ EXTRACTED_DIR = OUT_DIR / "extracted" PROMPTS_DIR = OUT_DIR / "prompts" RESPONSES_DIR = OUT_DIR / "responses" +MARKDOWN_DIR = OUT_DIR / "markdown" +MARKDOWN_RESPONSES_DIR = MARKDOWN_DIR / "responses" RESULTS_DIR = OUT_DIR / "results" From 55feb057636b4a48471022db92c345ad949cdd80 Mon Sep 17 00:00:00 2001 From: harumiWeb Date: Sun, 25 Jan 2026 20:06:55 +0900 Subject: [PATCH 27/38] feat: Add food inspection record data and enhance Markdown evaluation notes --- benchmark/data/manifest.json | 24 +++++++-- .../data/truth/food_inspection_record_01.json | 51 +++++++++++++++++++ benchmark/src/bench/cli.py | 38 +++++++++++++- 3 files changed, 109 insertions(+), 4 deletions(-) create mode 100644 benchmark/data/truth/food_inspection_record_01.json diff --git a/benchmark/data/manifest.json b/benchmark/data/manifest.json index 92a6724..70a53f1 100644 --- a/benchmark/data/manifest.json +++ b/benchmark/data/manifest.json @@ -7,7 +7,10 @@ "question": "このExcel帳票(Federal Financial Report / SF-425)について、次の情報を抽出し、JSONのみで返してください。\n\n(1) checkbox_groups: チェックボックスのグループ名と、その選択肢ラベル一覧を抽出してください(\"Report Type\" と \"Basis of Accounting\" の2グループのみ)。\n(2) not_required_by_epa_scope: 赤字の注記 \"Not Required by EPA\" がかかっているセクション名を返してください(例: \"Federal Cash\")。\n(3) section_headers: 帳票上部の番号付きセクション見出し(1〜9)のうち、見出しテキストのみを配列で返してください(例: \"Federal Agency and Organizational Element to Which Report is Submitted\" など)。\n\n出力形式(厳守):\n{\n \"checkbox_groups\": {\n \"Report Type\": [\"Quarterly\", \"Semi-Annual\", \"Annual\", \"Final\"],\n \"Basis of Accounting\": [\"Cash\", \"Accrual\"]\n },\n \"not_required_by_epa_scope\": \"...\",\n \"section_headers\": [\"...\", \"...\", \"...\"]\n}\n\n注意:\n- チェックボックスの記号(□など)は含めないでください。ラベル文字列のみを返してください。\n- section_headers は表示順(上から左→右)で返してください。\n- 余分な説明文やコードフェンスは付けず、JSONのみを返してください。", "truth": "data/truth/ffr_425_01.json", "sheet_scope": null, - "render": { "dpi": 220, "max_pages": 2 } + "render": { + "dpi": 220, + "max_pages": 2 + } }, { "id": "flowchart_01", @@ -28,7 +31,10 @@ "question": "このガントチャートのPhase3のタスク名とその開始日、終了日を抽出し、次のJSON形式のみで返してください: {\"tasks\":[{\"name\":\"...\",\"start_date\":\"YYYY-MM-DD\",\"end_date\":\"YYYY-MM-DD\"}, ...]}", "truth": "data/truth/gantt_01.json", "sheet_scope": null, - "render": { "dpi": 200, "max_pages": 4 } + "render": { + "dpi": 200, + "max_pages": 4 + } }, { "id": "certificate_of_employment_01", @@ -125,6 +131,18 @@ "dpi": 200, "max_pages": 1 } + }, + { + "id": "food_inspection_record_01", + "type": "inspection_log", + "xlsx": 
"data/raw/food_inspection record_01.xlsx", + "question": "This workbook contains three sheets (\"検食簿(1)\", \"検食簿 (2)\", \"検食簿 (3)\"). For the first date on each sheet, extract the lunch menu items and snack items and return JSON in the following format:\n\n{\n \"sheets\": {\n \"検食簿(1)\": {\"date\": \"...\", \"lunch_menu\": [\"...\"], \"snacks\": [\"...\"]},\n \"検食簿 (2)\": {\"date\": \"...\", \"lunch_menu\": [\"...\"], \"snacks\": [\"...\"]},\n \"検食簿 (3)\": {\"date\": \"...\", \"lunch_menu\": [\"...\"], \"snacks\": [\"...\"]},\n }\n}\n\nJSON only.", + "truth": "data/truth/food_inspection_record_01.json", + "sheet_scope": null, + "render": { + "dpi": 220, + "max_pages": 3 + } } ] -} +} \ No newline at end of file diff --git a/benchmark/data/truth/food_inspection_record_01.json b/benchmark/data/truth/food_inspection_record_01.json new file mode 100644 index 0000000..3ca2f65 --- /dev/null +++ b/benchmark/data/truth/food_inspection_record_01.json @@ -0,0 +1,51 @@ +{ + "sheets": { + "検食簿(1)": { + "date": "12月1日(月)", + "lunch_menu": [ + "麦ごはん", + "鶏肉の照り焼き", + "白菜のごま和え", + "切干大根の煮物", + "味噌汁(キャベツ)" + ], + "snacks": [ + "スキムミルク", + "ウエハース", + "スキムミルク", + "お菓子・こんぶ" + ] + }, + "検食簿 (2)": { + "date": "12月8日(月)", + "lunch_menu": [ + "麦ごはん", + "鶏肉の唐揚げ", + "キャベツ", + "ひじきの中華和え", + "すまし汁(麩)" + ], + "snacks": [ + "スキムミルク", + "ボーロ", + "スキムミルク", + "おからケーキ" + ] + }, + "検食簿 (3)": { + "date": "12月15日(月)", + "lunch_menu": [ + "麦ごはん", + "豚肉と野菜の煮物", + "大豆のサラダ", + "味噌汁(なす)" + ], + "snacks": [ + "スキムミルク", + "ウエハース", + "スキムミルク", + "お菓子・こんぶ" + ] + } + } +} \ No newline at end of file diff --git a/benchmark/src/bench/cli.py b/benchmark/src/bench/cli.py index dc0d014..b7f7a84 100644 --- a/benchmark/src/bench/cli.py +++ b/benchmark/src/bench/cli.py @@ -284,7 +284,7 @@ def ask( raise typer.BadParameter(f"No cases matched: {case}") methods = _select_methods(method) - client = OpenAIResponsesClient() if use_llm else None + client = OpenAIResponsesClient() ensure_dir(PROMPTS_DIR) ensure_dir(RESPONSES_DIR) total_cost = 0.0 @@ -698,6 +698,42 @@ def report() -> None: md_lines.append("") md_lines.append(g.to_markdown(index=False)) md_lines.append("") + md_lines.append("## Markdown evaluation notes") + md_lines.append("") + md_lines.append( + "Markdown scores measure how well the generated Markdown lines match a canonical" + ) + md_lines.append( + "Markdown rendering of the ground truth JSON. This is a *conversion quality*" + ) + md_lines.append("signal, not a direct extraction-accuracy substitute.") + md_lines.append("") + md_lines.append("Key points:") + md_lines.append("") + md_lines.append( + "- Coverage (acc_md): how much of truth Markdown content is recovered." + ) + md_lines.append( + "- Precision (md_precision): how much of predicted Markdown is correct." + ) + md_lines.append( + "- Layout shifts or list formatting differences can lower scores even if" + ) + md_lines.append(" the underlying facts are correct.") + md_lines.append( + "- LLM-based conversion introduces variability; re-run with the same seed" + ) + md_lines.append( + " and model settings to assess stability, or use deterministic rendering" + ) + md_lines.append(" for baseline comparisons.") + md_lines.append( + "- Use Markdown scores when your downstream task consumes Markdown (e.g.," + ) + md_lines.append( + " RAG ingestion), and report alongside Exact/Normalized/Raw metrics." 
+    )
+    md_lines.append("")
     md_lines.append("## Exstruct positioning notes (public)")
     md_lines.append("")
     md_lines.append(

From 5813c2c084c7a17b4b8d0f9e4c44e8b8a0d94c4b8e4a0d9f Mon Sep 17 00:00:00 2001
From: harumiWeb
Date: Mon, 26 Jan 2026 21:45:26 +0900
Subject: [PATCH 28/38] feat: Add RUB specification document for Reconstruction
 Utility Benchmark

---
 benchmark/docs/spec.md | 110 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 110 insertions(+)
 create mode 100644 benchmark/docs/spec.md

diff --git a/benchmark/docs/spec.md b/benchmark/docs/spec.md
new file mode 100644
index 0000000..35d85db
--- /dev/null
+++ b/benchmark/docs/spec.md
@@ -0,0 +1,110 @@
+# Reconstruction Utility Benchmark (RUB) Specification
+
+## 1. Purpose
+
+RUB mechanically measures how well reconstructed Markdown holds up in
+downstream tasks. The object of evaluation is structural usability
+(Reconstruction Utility), not Markdown string equality.
+
+## 2. Scope of evaluation
+
+- Input: the same Excel document for every method
+- Methods: pdf / image_vlm / exstruct / html / openpyxl
+- Output: the Markdown generated by each method
+- Evaluation: answer accuracy on structural queries whose only input is that Markdown
+
+## 3. Evaluation flow (two stages)
+
+### Stage A: Reconstruction
+
+Generate Markdown with each method.
+
+- pdf: soffice → pdf → text extraction → Markdown
+- image_vlm: image rendering → VLM → Markdown
+- exstruct: exstruct JSON → LLM → Markdown
+- html / openpyxl: text extraction → Markdown
+
+### Stage B: Utilization (the scored stage)
+
+Solve structural queries using only the Stage A Markdown as input.
+
+- The output is JSON only
+- The JSON schema is fixed
+- Scoring is exact match after normalization (deterministic)
+
+## 4. Task design policy
+
+- Test structural understanding rather than string matching
+- Keep the input Markdown-only so that no method is unfairly favored
+- Classify every task into one of the following:
+  - Set tasks (item inventories)
+  - Graph tasks (nodes/edges)
+  - Hierarchy tasks (parent-child relations)
+  - Table tasks (row/column correspondence)
+- Avoid elements with high notational variance, such as circled numbers and decorative symbols
+
+## 5. Normalization (deterministic)
+
+Apply the following normalization before scoring.
+
+- Strings: strip surrounding whitespace, collapse consecutive whitespace to one, unify newlines to \n
+- Dicts: sort keys (canonicalization)
+- Arrays: sorted for tasks where order carries no meaning
+- Numbers: converted to numeric types where possible (e.g., "012" → 12)
+
+## 6. Scoring metrics
+
+### 6.1 Primary metric: RUS
+
+RUS = number of correct answers / number of questions
+
+### 6.2 Secondary metrics
+
+- Cost-normalized RUS = RUS / cost_usd
+- Token-normalized RUS = RUS / input_tokens
+- Stage A failure rate = rate of failed Markdown generation
+
+## 7. Data layout
+
+```
+benchmark/
+  rub/
+    README.md
+    BENCHMARK_SPEC.md
+    manifest.json
+    truth/
+      *.json
+    schemas/
+      *.schema.json
+    scoring/
+      normalize.py
+      score.py
+    diagrams/
+      rub_overview.mmd
+      scoring_flow.mmd
+```
+
+## 8. Manifest specification (draft)
+
+- id: case id
+- type: task category
+- xlsx: path to the source file
+- question: the Stage B query
+- truth: path to the ground-truth JSON
+- sheet_scope: target sheets (null means the whole workbook)
+- render: image rendering settings
+
+## 9. Reproducibility
+
+- Record the model name, temperature, and run timestamp
+- Publish the normalization rules and scoring code in full
+- Pin temperature to 0 in any step that involves randomness
+
+## 10. Notes for publication
+
+- Treat "Markdown equality" only as an auxiliary metric
+- Present RUS (utility) as the primary metric
+- Do not judge methods built for different purposes by a single shared score
+
+---
+
+This specification is v1; keep a change history whenever it is revised.

From a84535da8907db4d75160857fd2417833200bd87 Mon Sep 17 00:00:00 2001
From: harumiWeb
Date: Mon, 26 Jan 2026 22:02:42 +0900
Subject: [PATCH 29/38] Add RUB (Reconstruction Utility Benchmark) support with
 manifest and scoring

- Introduced new directory structure for RUB benchmarks under `benchmark/rub`.
- Added `manifest.json` to define tasks and their expected outputs.
- Implemented `RubTask` and `RubManifest` models for loading and validating the manifest.
- Created normalization functions to ensure consistent comparison of outputs.
- Developed scoring functions to evaluate predictions against ground truth (see
  the sketch below).
- Added truth data for various tasks including forms, flowcharts, and inspection records.
- Included helper modules for managing RUB tasks and their associated data.
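
A minimal, self-contained sketch of the normalization and scoring contract
these modules implement (illustrative only; the real code in
`src/bench/rub/normalize.py` and `src/bench/rub/score.py` also handles
`unordered_paths`):

```python
import json
import re
from typing import Any


def normalize(value: Any) -> Any:
    """Deterministic canonicalization: collapse whitespace, sort dict keys,
    and parse integer-like strings (e.g. "012" -> 12)."""
    if isinstance(value, str):
        text = re.sub(r"\s+", " ", value).strip()
        return int(text) if re.fullmatch(r"-?\d+", text) else text
    if isinstance(value, dict):
        return {key: normalize(value[key]) for key in sorted(value)}
    if isinstance(value, list):
        return [normalize(item) for item in value]
    return value


def rus(truths: list[Any], preds: list[Any]) -> float:
    """RUS = correct answers / number of questions (exact match after normalization)."""

    def canon(value: Any) -> str:
        return json.dumps(normalize(value), ensure_ascii=False, sort_keys=True)

    if not truths:
        return 0.0
    return sum(canon(t) == canon(p) for t, p in zip(truths, preds)) / len(truths)
```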
--- benchmark/rub/manifest.json | 104 +++++++ benchmark/rub/truth/basic_01.json | 31 ++ benchmark/rub/truth/basic_form_01.json | 34 +++ .../truth/certificate_of_employment_01.json | 58 ++++ benchmark/rub/truth/ffr_425_01.json | 18 ++ benchmark/rub/truth/flowchart_01.json | 14 + benchmark/rub/truth/flowchart_02.json | 87 ++++++ .../rub/truth/food_inspection_record_01.json | 51 ++++ benchmark/rub/truth/gantt_01.json | 24 ++ benchmark/rub/truth/heatstroke_flow_01.json | 60 ++++ benchmark/rub/truth/smartart_01.json | 9 + benchmark/rub/truth/tax_report_01.json | 29 ++ benchmark/rub/truth/workflow_01.json | 84 ++++++ benchmark/src/bench/cli.py | 283 ++++++++++++++++++ benchmark/src/bench/paths.py | 8 + benchmark/src/bench/rub/__init__.py | 1 + benchmark/src/bench/rub/manifest.py | 37 +++ benchmark/src/bench/rub/normalize.py | 104 +++++++ benchmark/src/bench/rub/score.py | 34 +++ 19 files changed, 1070 insertions(+) create mode 100644 benchmark/rub/manifest.json create mode 100644 benchmark/rub/truth/basic_01.json create mode 100644 benchmark/rub/truth/basic_form_01.json create mode 100644 benchmark/rub/truth/certificate_of_employment_01.json create mode 100644 benchmark/rub/truth/ffr_425_01.json create mode 100644 benchmark/rub/truth/flowchart_01.json create mode 100644 benchmark/rub/truth/flowchart_02.json create mode 100644 benchmark/rub/truth/food_inspection_record_01.json create mode 100644 benchmark/rub/truth/gantt_01.json create mode 100644 benchmark/rub/truth/heatstroke_flow_01.json create mode 100644 benchmark/rub/truth/smartart_01.json create mode 100644 benchmark/rub/truth/tax_report_01.json create mode 100644 benchmark/rub/truth/workflow_01.json create mode 100644 benchmark/src/bench/rub/__init__.py create mode 100644 benchmark/src/bench/rub/manifest.py create mode 100644 benchmark/src/bench/rub/normalize.py create mode 100644 benchmark/src/bench/rub/score.py diff --git a/benchmark/rub/manifest.json b/benchmark/rub/manifest.json new file mode 100644 index 0000000..d6078ca --- /dev/null +++ b/benchmark/rub/manifest.json @@ -0,0 +1,104 @@ +{ + "tasks": [ + { + "id": "ffr_425_01", + "source_case_id": "ffr_425_01", + "type": "application_form", + "question": "このExcel帳票(Federal Financial Report / SF-425)について、次の情報を抽出し、JSONのみで返してください。\n\n(1) checkbox_groups: チェックボックスのグループ名と、その選択肢ラベル一覧を抽出してください(\"Report Type\" と \"Basis of Accounting\" の2グループのみ)。\n(2) not_required_by_epa_scope: 赤字の注記 \"Not Required by EPA\" がかかっているセクション名を返してください(例: \"Federal Cash\")。\n(3) section_headers: 帳票上部の番号付きセクション見出し(1〜9)のうち、見出しテキストのみを配列で返してください(例: \"Federal Agency and Organizational Element to Which Report is Submitted\" など)。\n\n出力形式(厳守):\n{\n \"checkbox_groups\": {\n \"Report Type\": [\"Quarterly\", \"Semi-Annual\", \"Annual\", \"Final\"],\n \"Basis of Accounting\": [\"Cash\", \"Accrual\"]\n },\n \"not_required_by_epa_scope\": \"...\",\n \"section_headers\": [\"...\", \"...\", \"...\"]\n}\n\n注意:\n- チェックボックスの記号(□など)は含めないでください。ラベル文字列のみを返してください。\n- section_headers は表示順(上から左→右)で返してください。\n- 余分な説明文やコードフェンスは付けず、JSONのみを返してください。", + "truth": "rub\\truth\\ffr_425_01.json" + }, + { + "id": "flowchart_01", + "source_case_id": "flowchart_01", + "type": "flowchart", + "question": "このフローチャートの開始から終了までの主要な処理ステップを順番に抽出し、次のJSON形式のみで返してください。\n\n出力形式(厳守):\n{\n \"steps\": [\"step1\", \"step2\", \"step3\", ...]\n}\n\n注意事項:\n- 開始ノードと終了ノードも含めてください\n- 分岐やループがある場合は、代表的な主経路として線形化してください\n- ステップ名は図中のラベル文字列をそのまま使用してください", + "truth": "rub\\truth\\flowchart_01.json" + }, + { + "id": "gantt_01", + "source_case_id": "gantt_01", + "type": "gantt", + 
"question": "このガントチャートのPhase3のタスク名とその開始日、終了日を抽出し、次のJSON形式のみで返してください: {\"tasks\":[{\"name\":\"...\",\"start_date\":\"YYYY-MM-DD\",\"end_date\":\"YYYY-MM-DD\"}, ...]}", + "truth": "rub\\truth\\gantt_01.json", + "unordered_paths": [ + "tasks" + ] + }, + { + "id": "certificate_of_employment_01", + "source_case_id": "certificate_of_employment_01", + "type": "application_form", + "question": "このExcel帳票(就労証明書)について、次の3点を抽出し、JSONのみで返してください。\n\n(1) checkbox_groups: 以下の3つのチェックボックス項目について、それぞれの選択肢ラベルを抽出してください。\n - 業種\n - 雇用の形態\n - 雇用(予定)期間等(無期 / 有期)\n\n(2) numbered_sections: 帳票の「No.」列に対応する番号付き項目の見出し(1〜14)を、番号をキーとして抽出してください。\n\n(3) warning_text: 赤字で記載されている注意文を、そのまま1つの文字列として抽出してください。\n\n出力形式(厳守):\n{\n \"checkbox_groups\": {\n \"業種\": [\"...\", \"...\"],\n \"雇用の形態\": [\"...\", \"...\"],\n \"雇用(予定)期間等\": [\"...\", \"...\"]\n },\n \"numbered_sections\": {\n \"1\": \"...\",\n \"2\": \"...\",\n \"3\": \"...\"\n },\n \"warning_text\": \"...\"\n}\n\n注意:\n- チェックボックス記号(□など)は含めず、ラベル文字列のみを返してください。\n- numbered_sections は 1〜14 すべてを含めてください。\n- 余分な説明文やコードフェンスは付けず、JSONのみを返してください。", + "truth": "rub\\truth\\certificate_of_employment_01.json" + }, + { + "id": "tax_report_01", + "source_case_id": "tax_report_01", + "type": "application_form", + "question": "この市民税・県民税申告書の右側に配置されている縦方向の帳票構造を解析してください。\n\n次の条件をすべて満たすJSONを返してください。\n\n1. 「収入金額」ブロックに含まれる項目名を、上から順に配列で列挙してください。\n2. 上記項目群を視覚的にまとめている「合計」項目名を1つ指定してください。\n3. 「所得から差し引かれる金額」ブロックに含まれる項目名を、上から順に配列で列挙してください。\n4. 上記控除項目群を視覚的にまとめている「合計」項目名を1つ指定してください。\n\n制約:\n- 項目名は帳票に記載されている日本語表記をそのまま使用してください。\n- 数値、記号、注釈文は含めないでください。\n- 同一列・同一枠内にある項目同士の位置関係に基づいて判断してください。\n- JSONのみを返してください。\n\n出力形式:\n{\n \"income_items\": [\"...\", \"...\"],\n \"income_total\": \"...\",\n \"deduction_items\": [\"...\", \"...\"],\n \"deduction_total\": \"...\"\n}", + "truth": "rub\\truth\\tax_report_01.json" + }, + { + "id": "smartart_01", + "source_case_id": "smartart_01", + "type": "organization_chart", + "question": "このExcel帳票(SmartArtで作成された組織図)について、次の3点を抽出し、JSONのみで返してください。\n\n(1) top_structure: 最上位から第2階層までの組織構造を、親子関係が分かる形で抽出してください。\n\n(2) sales_departments: 「営業部」の直下にある課の名称を、上から順に配列で返してください。\n\n(3) production_sites: 「生産部」の直下にある工場名を、上から順に配列で返してください。\n\n出力形式(厳守):\n{\n \"top_structure\": {\n \"取締役会\": {\n \"社長\": [\"...\"]\n }\n },\n \"sales_departments\": [\"...\", \"...\"],\n \"production_sites\": [\"...\", \"...\"]\n}\n\n注意:\n- 図形の色や配置座標は含めないでください。テキスト内容と階層関係のみを対象とします。\n- 余分な説明文やコードフェンスは付けず、JSONのみを返してください。", + "truth": "rub\\truth\\smartart_01.json", + "unordered_paths": [ + "top_structure.取締役会.社長", + "sales_departments", + "production_sites" + ] + }, + { + "id": "basic_01", + "source_case_id": "basic_01", + "type": "mixed_document", + "question": "このExcel帳票について、次の3点を抽出し、JSONのみで返してください。\n\n(1) sales_table: 左上の売上表について、月をキーとして各製品の数値を抽出してください。\n\n(2) chart_series: 右上の折れ線グラフに含まれる系列名を、凡例の表示順で配列として返してください。\n\n(3) flowchart_paths: 下部のフローチャートについて、開始から終了までの処理パスを条件付きで2通り抽出してください。\n - format_valid = true の場合の処理パス\n - format_valid = false の場合の処理パス\n\n出力形式(厳守):\n{\n \"sales_table\": {\n \"Jan-25\": {\"製品A\": 0, \"製品B\": 0, \"製品C\": 0},\n \"Feb-25\": {\"製品A\": 0, \"製品B\": 0, \"製品C\": 0}\n },\n \"chart_series\": [\"...\", \"...\"],\n \"flowchart_paths\": {\n \"format_valid_true\": [\"...\", \"...\"],\n \"format_valid_false\": [\"...\", \"...\"]\n }\n}\n\n注意:\n- 数値は整数で返してください。\n- フローチャートのパスは、図形内の文言をそのまま順番に並べてください。\n- 余分な説明文やコードフェンスは付けず、JSONのみを返してください。", + "truth": "rub\\truth\\basic_01.json" + }, + { + "id": "heatstroke_flow_01", + "source_case_id": "heatstroke_flow_01", + "type": "flowchart", + "question": 
"このExcelに記載されている熱中症対応フローについて、上から順に各対応ステップを抽出してください。各ステップについて、step_name(工程名)、description(内容要約)、special_conditions(条件や注意事項がある場合のみ配列で記載)を含むJSONを、次の形式のみで返してください。\n\n{\n \"steps\": [\n {\n \"step_name\": \"...\",\n \"description\": \"...\",\n \"special_conditions\": [\"...\"]\n }\n ]\n}", + "truth": "rub\\truth\\heatstroke_flow_01.json" + }, + { + "id": "workflow_01", + "source_case_id": "workflow_01", + "type": "workflow", + "question": "このExcelに記載されている業務フロー図(ネット注文フローチャート)について、工程を上から順に整理してください。各工程について、actor(実行主体)、step_name(工程名)、next_steps(次に進む工程名の配列)を含むJSONを、次の形式のみで返してください。\n\n{\n \"steps\": [\n {\n \"actor\": \"お客様|当社\",\n \"step_name\": \"...\",\n \"next_steps\": [\"...\"]\n }\n ]\n}", + "truth": "rub\\truth\\workflow_01.json", + "unordered_paths": [ + "steps", + "steps.next_steps" + ] + }, + { + "id": "basic_form_01", + "source_case_id": "basic_form_01", + "type": "application_form", + "question": "このExcel申請書に記載されている入力項目を、意味的なブロック単位で整理してください。申請者本人に関する項目、配偶者に関する項目、収入等に関する申告、預貯金等に関する申告の4分類に分け、それぞれに含まれる項目名を配列でまとめたJSONを、次の形式のみで返してください。\n\n{\n \"applicant\": [],\n \"spouse\": [],\n \"income_declaration\": [],\n \"asset_declaration\": []\n}", + "truth": "rub\\truth\\basic_form_01.json" + }, + { + "id": "flowchart_02", + "source_case_id": "flowchart_02", + "type": "flowchart", + "question": "このExcelに記載されているログイン処理フローについて、工程を上から順に整理してください。各工程について、step_name(工程名)、step_type(start|process|decision|end)、next_steps(条件付き遷移を含む次工程)を含むJSONを、次の形式のみで返してください。\n\n{\n \"steps\": [\n {\n \"step_name\": \"...\",\n \"step_type\": \"start|process|decision|end\",\n \"next_steps\": [\n {\n \"condition\": \"...\",\n \"next\": \"...\"\n }\n ]\n }\n ]\n}", + "truth": "rub\\truth\\flowchart_02.json", + "unordered_paths": [ + "steps", + "steps.next_steps" + ] + }, + { + "id": "food_inspection_record_01", + "source_case_id": "food_inspection_record_01", + "type": "inspection_log", + "question": "This workbook contains three sheets (\"検食簿(1)\", \"検食簿 (2)\", \"検食簿 (3)\"). 
For the first date on each sheet, extract the lunch menu items and snack items and return JSON in the following format:\n\n{\n \"sheets\": {\n \"検食簿(1)\": {\"date\": \"...\", \"lunch_menu\": [\"...\"], \"snacks\": [\"...\"]},\n \"検食簿 (2)\": {\"date\": \"...\", \"lunch_menu\": [\"...\"], \"snacks\": [\"...\"]},\n \"検食簿 (3)\": {\"date\": \"...\", \"lunch_menu\": [\"...\"], \"snacks\": [\"...\"]},\n }\n}\n\nJSON only.", + "truth": "rub\\truth\\food_inspection_record_01.json" + } + ] +} \ No newline at end of file diff --git a/benchmark/rub/truth/basic_01.json b/benchmark/rub/truth/basic_01.json new file mode 100644 index 0000000..53bca60 --- /dev/null +++ b/benchmark/rub/truth/basic_01.json @@ -0,0 +1,31 @@ +{ + "sales_table": { + "Jan-25": { "製品A": 120, "製品B": 80, "製品C": 60 }, + "Feb-25": { "製品A": 135, "製品B": 90, "製品C": 64 }, + "Mar-25": { "製品A": 150, "製品B": 100, "製品C": 70 }, + "Apr-25": { "製品A": 170, "製品B": 110, "製品C": 72 }, + "May-25": { "製品A": 160, "製品B": 120, "製品C": 75 }, + "Jun-25": { "製品A": 180, "製品B": 130, "製品C": 80 } + }, + "chart_series": ["製品A", "製品B", "製品C"], + "flowchart_paths": { + "format_valid_true": [ + "開始", + "入力データ読み込み", + "形式は正しい?", + "1件処理", + "残件あり?", + "出力を生成", + "メール送信?", + "メール送信", + "終了" + ], + "format_valid_false": [ + "開始", + "入力データ読み込み", + "形式は正しい?", + "エラー表示", + "終了" + ] + } +} diff --git a/benchmark/rub/truth/basic_form_01.json b/benchmark/rub/truth/basic_form_01.json new file mode 100644 index 0000000..cd8ca3b --- /dev/null +++ b/benchmark/rub/truth/basic_form_01.json @@ -0,0 +1,34 @@ +{ + "applicant": [ + "フリガナ", + "被保険者氏名", + "生年月日", + "住所", + "連絡先", + "入所(院)した介護保険施設の所在地及び名称", + "入所(院)年月日" + ], + "spouse": [ + "配偶者の有無", + "配偶者氏名", + "配偶者生年月日", + "配偶者個人番号", + "配偶者住所", + "配偶者連絡先", + "本年1月1日現在の住所", + "課税状況" + ], + "income_declaration": [ + "生活保護受給者に該当するか", + "市町村民税非課税世帯であるか", + "課税年金収入額", + "その他の合計所得金額", + "年金の種類に関する申告" + ], + "asset_declaration": [ + "預貯金額", + "有価証券の金額", + "その他の資産額", + "配偶者の預貯金等を含むかどうか" + ] +} diff --git a/benchmark/rub/truth/certificate_of_employment_01.json b/benchmark/rub/truth/certificate_of_employment_01.json new file mode 100644 index 0000000..9dc4cd3 --- /dev/null +++ b/benchmark/rub/truth/certificate_of_employment_01.json @@ -0,0 +1,58 @@ +{ + "checkbox_groups": { + "業種": [ + "農業・林業", + "漁業", + "鉱業・採石業・砂利採取業", + "建設業", + "製造業", + "電気・ガス・熱供給・水道業", + "情報通信業", + "運輸業・郵便業", + "卸売業・小売業", + "金融業・保険業", + "不動産業・物品賃貸業", + "学術研究・専門・技術サービス", + "宿泊業・飲食サービス業", + "生活関連サービス業・娯楽業", + "医療・福祉", + "教育・学習支援業", + "複合サービス事業", + "公務", + "その他" + ], + "雇用の形態": [ + "正社員", + "パート・アルバイト", + "派遣社員", + "契約社員", + "会計年度任用職員", + "非常勤・臨時職員", + "役員", + "自営業主", + "自営業専従者", + "家族従業者", + "内職", + "業務委託", + "その他" + ], + "雇用(予定)期間等": ["無期", "有期"] + }, + "numbered_sections": { + "1": "業種", + "2": "本人氏名", + "3": "雇用(予定)期間等", + "4": "本人就労先事業所", + "5": "雇用の形態", + "6": "就労時間(固定就労の場合)", + "7": "就労時間(変則就労の場合)", + "8": "就労実績", + "9": "産前・産後休業の取得", + "10": "育児休業の取得", + "11": "産休・育休以外の休業の取得", + "12": "復職(予定)年月日", + "13": "育児のための短時間勤務制度利用有無", + "14": "保育士等としての勤務実態の有無" + }, + "warning_text": "※本証明書の内容について、就労先事業者等に無断で作成又は改変を行ったときは、刑法上の罪に問われる場合があります。" +} diff --git a/benchmark/rub/truth/ffr_425_01.json b/benchmark/rub/truth/ffr_425_01.json new file mode 100644 index 0000000..a53b43d --- /dev/null +++ b/benchmark/rub/truth/ffr_425_01.json @@ -0,0 +1,18 @@ +{ + "checkbox_groups": { + "Report Type": ["Quarterly", "Semi-Annual", "Annual", "Final"], + "Basis of Accounting": ["Cash", "Accrual"] + }, + "not_required_by_epa_scope": "Federal Cash", + "section_headers": [ + "Federal Agency and 
Organizational Element to Which Report is Submitted", + "Federal Grant or Other Identifying Number Assigned by Federal Agency", + "Recipient Organization (Name and complete address including Zip code)", + "DUNS Number", + "Recipient Account Number or Identifying Number", + "Report Type", + "Basis of Accounting", + "Project/Grant Period", + "Reporting Period End Date" + ] +} diff --git a/benchmark/rub/truth/flowchart_01.json b/benchmark/rub/truth/flowchart_01.json new file mode 100644 index 0000000..196bc95 --- /dev/null +++ b/benchmark/rub/truth/flowchart_01.json @@ -0,0 +1,14 @@ +{ + "steps": [ + "S", + "要件抽出", + "ヒアリング", + "非機能要件", + "思考実験", + "再検証", + "まとめ", + "文書作成", + "締結", + "E" + ] +} diff --git a/benchmark/rub/truth/flowchart_02.json b/benchmark/rub/truth/flowchart_02.json new file mode 100644 index 0000000..868d040 --- /dev/null +++ b/benchmark/rub/truth/flowchart_02.json @@ -0,0 +1,87 @@ +{ + "steps": [ + { + "step_name": "ログイン画面", + "step_type": "start", + "next_steps": [ + { + "condition": "always", + "next": "登録情報を入力" + } + ] + }, + { + "step_name": "登録情報を入力", + "step_type": "process", + "next_steps": [ + { + "condition": "always", + "next": "入力内容は正しいか" + } + ] + }, + { + "step_name": "入力内容は正しいか", + "step_type": "decision", + "next_steps": [ + { + "condition": "はい", + "next": "サーバーに認証リクエストを送信" + }, + { + "condition": "いいえ", + "next": "再入力を提示" + } + ] + }, + { + "step_name": "再入力を提示", + "step_type": "process", + "next_steps": [ + { + "condition": "always", + "next": "登録情報を入力" + } + ] + }, + { + "step_name": "サーバーに認証リクエストを送信", + "step_type": "process", + "next_steps": [ + { + "condition": "always", + "next": "認証に成功か" + } + ] + }, + { + "step_name": "認証に成功か", + "step_type": "decision", + "next_steps": [ + { + "condition": "はい", + "next": "操作画面に遷移" + }, + { + "condition": "いいえ", + "next": "認証エラーを提示" + } + ] + }, + { + "step_name": "認証エラーを提示", + "step_type": "process", + "next_steps": [ + { + "condition": "always", + "next": "再入力を提示" + } + ] + }, + { + "step_name": "操作画面に遷移", + "step_type": "end", + "next_steps": [] + } + ] +} diff --git a/benchmark/rub/truth/food_inspection_record_01.json b/benchmark/rub/truth/food_inspection_record_01.json new file mode 100644 index 0000000..3ca2f65 --- /dev/null +++ b/benchmark/rub/truth/food_inspection_record_01.json @@ -0,0 +1,51 @@ +{ + "sheets": { + "検食簿(1)": { + "date": "12月1日(月)", + "lunch_menu": [ + "麦ごはん", + "鶏肉の照り焼き", + "白菜のごま和え", + "切干大根の煮物", + "味噌汁(キャベツ)" + ], + "snacks": [ + "スキムミルク", + "ウエハース", + "スキムミルク", + "お菓子・こんぶ" + ] + }, + "検食簿 (2)": { + "date": "12月8日(月)", + "lunch_menu": [ + "麦ごはん", + "鶏肉の唐揚げ", + "キャベツ", + "ひじきの中華和え", + "すまし汁(麩)" + ], + "snacks": [ + "スキムミルク", + "ボーロ", + "スキムミルク", + "おからケーキ" + ] + }, + "検食簿 (3)": { + "date": "12月15日(月)", + "lunch_menu": [ + "麦ごはん", + "豚肉と野菜の煮物", + "大豆のサラダ", + "味噌汁(なす)" + ], + "snacks": [ + "スキムミルク", + "ウエハース", + "スキムミルク", + "お菓子・こんぶ" + ] + } + } +} \ No newline at end of file diff --git a/benchmark/rub/truth/gantt_01.json b/benchmark/rub/truth/gantt_01.json new file mode 100644 index 0000000..aebdd03 --- /dev/null +++ b/benchmark/rub/truth/gantt_01.json @@ -0,0 +1,24 @@ +{ + "tasks": [ + { + "name": "Core Feature Dev", + "start_date": "2026-01-26", + "end_date": "2026-02-03" + }, + { + "name": "Edge Case Handling", + "start_date": "2026-01-27", + "end_date": "2026-02-03" + }, + { + "name": "Integration Work", + "start_date": "2026-01-29", + "end_date": "2026-02-04" + }, + { + "name": "Internal Review", + "start_date": "2026-02-01", + "end_date": "2026-02-04" + } + ] +} diff --git 
a/benchmark/rub/truth/heatstroke_flow_01.json b/benchmark/rub/truth/heatstroke_flow_01.json new file mode 100644 index 0000000..54ce1a0 --- /dev/null +++ b/benchmark/rub/truth/heatstroke_flow_01.json @@ -0,0 +1,60 @@ +{ + "steps": [ + { + "step_name": "発見", + "description": "熱中症が疑われる症状があるかを確認する。", + "special_conditions": [ + "めまい", + "失神", + "筋肉痛", + "筋肉の硬直", + "大量の発汗", + "頭痛", + "嘔吐", + "意識障害", + "けいれん", + "高体温" + ] + }, + { + "step_name": "報告", + "description": "作業管理者および緊急連絡先へ状況を報告する。", + "special_conditions": [] + }, + { + "step_name": "初期対応", + "description": "涼しい場所への移動、水分補給、体を冷やすなどの応急処置を行う。", + "special_conditions": [ + "WBGT値が28度以上の場合は作業を中断する", + "気温が31度以上の場合は作業を中断する" + ] + }, + { + "step_name": "医療機関搬送・救急要請", + "description": "症状に応じて医療機関へ搬送するか救急要請を行う。", + "special_conditions": [ + "意識がない場合は119番通報する", + "応答が曖昧な場合は119番通報する", + "高熱が続く場合は119番通報する", + "けいれんなど重症の兆候がある場合は119番通報する" + ] + }, + { + "step_name": "事後対応・記録", + "description": "発生状況や対応内容を記録し、保存および定期的な見直しを行う。", + "special_conditions": [ + "発生日時", + "場所", + "WBGT値", + "気温", + "作業内容", + "作業時間", + "症状", + "初期対応内容", + "報告先", + "搬送有無", + "最終対応" + ] + } + ] +} diff --git a/benchmark/rub/truth/smartart_01.json b/benchmark/rub/truth/smartart_01.json new file mode 100644 index 0000000..2a22af1 --- /dev/null +++ b/benchmark/rub/truth/smartart_01.json @@ -0,0 +1,9 @@ +{ + "top_structure": { + "取締役会": { + "社長": ["企画管理部", "営業部", "開発部", "技術部", "生産部", "総務部"] + } + }, + "sales_departments": ["第1営業課", "第2営業課", "第3営業課", "海外営業課"], + "production_sites": ["愛知工場", "山形工場", "高知工場"] +} diff --git a/benchmark/rub/truth/tax_report_01.json b/benchmark/rub/truth/tax_report_01.json new file mode 100644 index 0000000..174476d --- /dev/null +++ b/benchmark/rub/truth/tax_report_01.json @@ -0,0 +1,29 @@ +{ + "income_items": [ + "事業(営業等)", + "事業(農業)", + "不動産", + "利子", + "配当", + "給与", + "公的年金等", + "業務", + "その他" + ], + "income_total": "合計", + "deduction_items": [ + "社会保険料控除", + "小規模企業共済等掛金控除", + "生命保険料控除", + "地震保険料控除", + "寡婦、ひとり親控除", + "勤労学生控除", + "配偶者(特別)控除", + "扶養控除", + "障害者控除", + "基礎控除", + "雑損控除", + "医療費控除" + ], + "deduction_total": "合計" +} diff --git a/benchmark/rub/truth/workflow_01.json b/benchmark/rub/truth/workflow_01.json new file mode 100644 index 0000000..ea7d3c3 --- /dev/null +++ b/benchmark/rub/truth/workflow_01.json @@ -0,0 +1,84 @@ +{ + "steps": [ + { + "actor": "お客様", + "step_name": "商品検索", + "next_steps": ["検討"] + }, + { + "actor": "当社", + "step_name": "商品情報を表示", + "next_steps": ["検討"] + }, + { + "actor": "お客様", + "step_name": "検討", + "next_steps": ["キャンセル", "カートに追加"] + }, + { + "actor": "お客様", + "step_name": "キャンセル", + "next_steps": [] + }, + { + "actor": "お客様", + "step_name": "カートに追加", + "next_steps": ["在庫確認"] + }, + { + "actor": "当社", + "step_name": "在庫確認", + "next_steps": ["レジに進む"] + }, + { + "actor": "お客様", + "step_name": "レジに進む", + "next_steps": ["支払い方法の選択"] + }, + { + "actor": "お客様", + "step_name": "支払い方法の選択", + "next_steps": ["支払いの案内"] + }, + { + "actor": "当社", + "step_name": "支払いの案内", + "next_steps": ["支払い処理"] + }, + { + "actor": "当社", + "step_name": "支払い処理", + "next_steps": ["注文の確定"] + }, + { + "actor": "お客様", + "step_name": "注文の確定", + "next_steps": ["配送先入力"] + }, + { + "actor": "お客様", + "step_name": "配送先入力", + "next_steps": ["配送先確認"] + }, + { + "actor": "当社", + "step_name": "配送先確認", + "next_steps": ["注文確認メールを送信"] + }, + { + "actor": "当社", + "step_name": "注文確認メールを送信", + "next_steps": ["商品を準備・発送"] + }, + { + "actor": "当社", + "step_name": "商品を準備・発送", + "next_steps": ["商品受取"] + }, + { + "actor": "お客様", + "step_name": "商品受取", + "next_steps": [] 
+ } + ] +} diff --git a/benchmark/src/bench/cli.py b/benchmark/src/bench/cli.py index b7f7a84..0e1db15 100644 --- a/benchmark/src/bench/cli.py +++ b/benchmark/src/bench/cli.py @@ -31,6 +31,11 @@ PROMPTS_DIR, RESPONSES_DIR, RESULTS_DIR, + RUB_MANIFEST, + RUB_OUT_DIR, + RUB_PROMPTS_DIR, + RUB_RESPONSES_DIR, + RUB_RESULTS_DIR, resolve_path, ) from .pipeline.common import ensure_dir, sha256_text, write_json @@ -39,6 +44,8 @@ from .pipeline.image_render import xlsx_to_pngs_via_pdf from .pipeline.openpyxl_pandas import extract_openpyxl from .pipeline.pdf_text import pdf_to_text, xlsx_to_pdf +from .rub.manifest import RubTask, load_rub_manifest +from .rub.score import score_exact app = typer.Typer(add_completion=False) console = Console() @@ -89,6 +96,39 @@ class MarkdownRecord(BaseModel): raw: dict[str, Any] +class RubResponseRecord(BaseModel): + """RUB response metadata saved for each request.""" + + task_id: str + source_case_id: str + method: str + model: str + temperature: float + prompt_hash: str + question: str + text: str + input_tokens: int + output_tokens: int + cost_usd: float + raw: dict[str, Any] + + +class RubResultRow(BaseModel): + """RUB evaluation row for CSV output.""" + + task_id: str + source_case_id: str + type: str + method: str + model: str | None + score: float + ok: bool + input_tokens: int + output_tokens: int + cost_usd: float + error: str | None + + class ResultRow(BaseModel): """Evaluation row for CSV output.""" @@ -162,6 +202,49 @@ def _select_methods(method: str) -> list[str]: return deduped +def _rub_manifest_path() -> Path: + """Return the path to the RUB manifest. + + Returns: + Path to rub/manifest.json. + """ + return RUB_MANIFEST + + +def _select_tasks(tasks: list[RubTask], task: str) -> list[RubTask]: + """Select RUB tasks by id list or all. + + Args: + tasks: Task list from the RUB manifest. + task: Comma-separated task ids or "all". + + Returns: + Filtered list of tasks. + """ + if task == "all": + return tasks + ids = {t.strip() for t in task.split(",") if t.strip()} + return [t for t in tasks if t.id in ids] + + +def _resolve_task_path(path_str: str, *, task_id: str, label: str) -> Path | None: + """Resolve a RUB manifest path, warning if missing. + + Args: + path_str: Path string from the manifest. + task_id: Task identifier for log messages. + label: Label for the path type (e.g., "truth"). + + Returns: + Resolved Path if it exists, otherwise None. + """ + resolved = resolve_path(path_str) + if resolved.exists(): + return resolved + print(f"[yellow]skip: missing {label} for {task_id}: {resolved}[/yellow]") + return None + + def _resolve_case_path(path_str: str, *, case_id: str, label: str) -> Path | None: """Resolve a manifest path, warning if missing. @@ -188,6 +271,14 @@ def _reset_case_outputs(case_id: str) -> None: path.unlink() +def _reset_rub_outputs(task_id: str) -> None: + """Delete existing RUB prompt/response logs for a task.""" + for directory in (RUB_PROMPTS_DIR, RUB_RESPONSES_DIR): + path = directory / f"{task_id}.jsonl" + if path.exists(): + path.unlink() + + def _reset_markdown_outputs(case_id: str) -> None: """Delete existing markdown logs for a case.""" path = MARKDOWN_RESPONSES_DIR / f"{case_id}.jsonl" @@ -469,6 +560,198 @@ def markdown( ) +@app.command() +def rub_ask( + task: str = "all", + method: str = "all", + model: str = "gpt-4o", + temperature: float = 0.0, +) -> None: + """Run RUB Stage B queries using Markdown outputs as context. + + Args: + task: Comma-separated task ids or "all". 
+ method: Comma-separated method names or "all". + model: OpenAI model name for Stage B queries. + temperature: Sampling temperature for the model. + """ + rub_manifest = load_rub_manifest(_rub_manifest_path()) + tasks = _select_tasks(rub_manifest.tasks, task) + if not tasks: + raise typer.BadParameter(f"No tasks matched: {task}") + methods = _select_methods(method) + + ensure_dir(RUB_OUT_DIR) + ensure_dir(RUB_PROMPTS_DIR) + ensure_dir(RUB_RESPONSES_DIR) + + client = OpenAIResponsesClient() + total_cost = 0.0 + total_calls = 0 + + for t in tasks: + console.rule(f"RUB {t.id}") + _reset_rub_outputs(t.id) + resp_file = RUB_RESPONSES_DIR / f"{t.id}.jsonl" + for m in methods: + md_path = MARKDOWN_DIR / t.source_case_id / f"{m}.md" + if not md_path.exists(): + print(f"[yellow]skip: missing markdown {t.id} {m}[/yellow]") + continue + context_text = md_path.read_text(encoding="utf-8") + prompt_hash = sha256_text(f"{t.question}\n{context_text}") + try: + res = client.ask_text( + model=model, + question=t.question, + context_text=context_text, + temperature=temperature, + ) + rec = RubResponseRecord( + task_id=t.id, + source_case_id=t.source_case_id, + method=m, + model=model, + temperature=temperature, + prompt_hash=prompt_hash, + question=t.question, + text=res.text, + input_tokens=res.input_tokens, + output_tokens=res.output_tokens, + cost_usd=res.cost_usd, + raw=res.raw, + ) + line = _dump_jsonl(rec) + with resp_file.open("a", encoding="utf-8") as f: + f.write(line + "\n") + total_cost += res.cost_usd + total_calls += 1 + print(f"[green]{t.id} {m} -> {resp_file}[/green]") + except Exception as exc: + print(f"[yellow]skip: rub {t.id} {m} ({exc})[/yellow]") + + print(f"[green]RUB cost: ${total_cost:.6f} ({total_calls} call(s))[/green]") + + +@app.command() +def rub_eval(task: str = "all", method: str = "all") -> None: + """Evaluate RUB responses and write results CSV. + + Args: + task: Comma-separated task ids or "all". + method: Comma-separated method names or "all". 
+ """ + rub_manifest = load_rub_manifest(_rub_manifest_path()) + tasks = _select_tasks(rub_manifest.tasks, task) + if not tasks: + raise typer.BadParameter(f"No tasks matched: {task}") + methods = _select_methods(method) + + rows: list[RubResultRow] = [] + for t in tasks: + truth_path = _resolve_task_path(t.truth, task_id=t.id, label="truth") + if not truth_path: + continue + truth = json.loads(truth_path.read_text(encoding="utf-8")) + + resp_file = RUB_RESPONSES_DIR / f"{t.id}.jsonl" + if not resp_file.exists(): + print(f"[yellow]skip: no RUB responses for {t.id}[/yellow]") + continue + latest: dict[str, dict[str, Any]] = {} + for line in resp_file.read_text(encoding="utf-8").splitlines(): + rec = json.loads(line) + if rec.get("method") in methods: + latest[rec["method"]] = rec + + for m, rec in latest.items(): + score = 0.0 + ok = False + err: str | None = None + try: + pred_obj = normalize_json_text(rec["text"]) + score_res = score_exact( + truth, pred_obj, unordered_paths=t.unordered_paths + ) + score = score_res.score + ok = score_res.ok + except Exception as exc: + err = str(exc) + + rows.append( + RubResultRow( + task_id=t.id, + source_case_id=t.source_case_id, + type=t.type, + method=m, + model=rec.get("model"), + score=score, + ok=ok, + input_tokens=int(rec.get("input_tokens", 0)), + output_tokens=int(rec.get("output_tokens", 0)), + cost_usd=float(rec.get("cost_usd", 0.0)), + error=err, + ) + ) + + out_csv = RUB_RESULTS_DIR / "rub_results.csv" + write_results_csv([row.model_dump() for row in rows], out_csv) + print(f"[green]Wrote {out_csv} ({len(rows)} rows)[/green]") + + +@app.command() +def rub_report() -> None: + """Generate a RUB Markdown report from the results CSV.""" + csv_path = RUB_RESULTS_DIR / "rub_results.csv" + if not csv_path.exists(): + raise typer.Exit(code=1) + + import pandas as pd + + df = pd.read_csv(csv_path) + agg: dict[str, tuple[str, str]] = { + "rus": ("score", "mean"), + "avg_in": ("input_tokens", "mean"), + "avg_cost": ("cost_usd", "mean"), + "n": ("task_id", "count"), + } + g = df.groupby("method").agg(**agg).reset_index() + + detail_dir = RUB_RESULTS_DIR / "detailed_reports" + detail_dir.mkdir(parents=True, exist_ok=True) + + md_lines: list[str] = [] + md_lines.append("# RUB Report") + md_lines.append("") + md_lines.append( + "This report summarizes Reconstruction Utility Benchmark (RUB) results." + ) + md_lines.append( + "Scores are computed on Stage B task accuracy using Markdown-only inputs." + ) + md_lines.append("") + md_lines.append("## Summary by method") + md_lines.append("") + md_lines.append(g.to_markdown(index=False)) + md_lines.append("") + + for task_id, task_df in df.groupby("task_id"): + task_path = detail_dir / f"report_{task_id}.md" + lines = [ + "# RUB Report", + "", + f"## Details: {task_id}", + "", + task_df.to_markdown(index=False), + "", + ] + task_path.write_text("\n".join(lines), encoding="utf-8") + + report_path = RUB_RESULTS_DIR / "report.md" + report_path.write_text("\n".join(md_lines), encoding="utf-8") + print(f"[green]Wrote {report_path}[/green]") + + @app.command() def eval(case: str = "all", method: str = "all") -> None: """Evaluate the latest responses and write results CSV. 
diff --git a/benchmark/src/bench/paths.py b/benchmark/src/bench/paths.py index d4214be..579b730 100644 --- a/benchmark/src/bench/paths.py +++ b/benchmark/src/bench/paths.py @@ -14,6 +14,14 @@ MARKDOWN_DIR = OUT_DIR / "markdown" MARKDOWN_RESPONSES_DIR = MARKDOWN_DIR / "responses" RESULTS_DIR = OUT_DIR / "results" +RUB_DIR = ROOT / "rub" +RUB_MANIFEST = RUB_DIR / "manifest.json" +RUB_TRUTH_DIR = RUB_DIR / "truth" +RUB_SCHEMA_DIR = RUB_DIR / "schemas" +RUB_OUT_DIR = OUT_DIR / "rub" +RUB_PROMPTS_DIR = RUB_OUT_DIR / "prompts" +RUB_RESPONSES_DIR = RUB_OUT_DIR / "responses" +RUB_RESULTS_DIR = RUB_OUT_DIR / "results" def resolve_path(path: str | Path) -> Path: diff --git a/benchmark/src/bench/rub/__init__.py b/benchmark/src/bench/rub/__init__.py new file mode 100644 index 0000000..6c1f1c2 --- /dev/null +++ b/benchmark/src/bench/rub/__init__.py @@ -0,0 +1 @@ +"""RUB (Reconstruction Utility Benchmark) helpers.""" diff --git a/benchmark/src/bench/rub/manifest.py b/benchmark/src/bench/rub/manifest.py new file mode 100644 index 0000000..9eda762 --- /dev/null +++ b/benchmark/src/bench/rub/manifest.py @@ -0,0 +1,37 @@ +from __future__ import annotations + +import json +from pathlib import Path + +from pydantic import BaseModel, Field + + +class RubTask(BaseModel): + """RUB task definition.""" + + id: str + source_case_id: str = Field(..., description="Case id for Stage A Markdown.") + type: str + question: str + truth: str + schema_path: str | None = None + unordered_paths: list[str] | None = None + + +class RubManifest(BaseModel): + """RUB manifest container.""" + + tasks: list[RubTask] + + +def load_rub_manifest(path: Path) -> RubManifest: + """Load a RUB manifest file. + + Args: + path: Path to rub/manifest.json. + + Returns: + Parsed RubManifest. + """ + data = json.loads(path.read_text(encoding="utf-8")) + return RubManifest(**data) diff --git a/benchmark/src/bench/rub/normalize.py b/benchmark/src/bench/rub/normalize.py new file mode 100644 index 0000000..1243635 --- /dev/null +++ b/benchmark/src/bench/rub/normalize.py @@ -0,0 +1,104 @@ +from __future__ import annotations + +import json +import re +from typing import Any + +from pydantic import BaseModel + + +class NormalizedPayload(BaseModel): + """Normalized JSON payload for deterministic comparison.""" + + value: Any + + +def _normalize_text(value: str) -> str: + """Normalize a string for comparison. + + Args: + value: Raw string value. + + Returns: + Normalized string. + """ + text = value.replace("\r\n", "\n").replace("\r", "\n").strip() + text = re.sub(r"\s+", " ", text) + return text + + +def _maybe_parse_number(value: str) -> int | float | str: + """Parse a numeric string when possible. + + Args: + value: String value. + + Returns: + int/float when value is numeric, otherwise original string. + """ + if re.fullmatch(r"-?\d+", value): + return int(value) + if re.fullmatch(r"-?\d+\.\d+", value): + return float(value) + return value + + +def _canonical_json(value: Any) -> str: + """Return a canonical JSON string for sorting. + + Args: + value: JSON-serializable value. + + Returns: + Canonical JSON string. + """ + return json.dumps(value, ensure_ascii=False, sort_keys=True, separators=(",", ":")) + + +def _normalize_value(value: Any, *, unordered_paths: set[str], path: str) -> Any: + """Normalize a JSON-like value recursively. + + Args: + value: Input value. + unordered_paths: Set of list paths to sort. + path: Dot-separated path for the current value. + + Returns: + Normalized value. 
+ """ + if isinstance(value, dict): + normalized: dict[str, Any] = {} + for key in sorted(value.keys()): + child_path = f"{path}.{key}" if path else key + normalized[key] = _normalize_value( + value[key], unordered_paths=unordered_paths, path=child_path + ) + return normalized + if isinstance(value, list): + normalized_items = [ + _normalize_value(item, unordered_paths=unordered_paths, path=path) + for item in value + ] + if path in unordered_paths: + normalized_items.sort(key=_canonical_json) + return normalized_items + if isinstance(value, str): + return _maybe_parse_number(_normalize_text(value)) + return value + + +def normalize_payload( + payload: Any, *, unordered_paths: list[str] | None = None +) -> NormalizedPayload: + """Normalize a JSON payload with deterministic rules. + + Args: + payload: Raw JSON object. + unordered_paths: Dot paths for lists that should be treated as unordered. + + Returns: + NormalizedPayload with normalized value. + """ + path_set = set(unordered_paths or []) + normalized = _normalize_value(payload, unordered_paths=path_set, path="") + return NormalizedPayload(value=normalized) diff --git a/benchmark/src/bench/rub/score.py b/benchmark/src/bench/rub/score.py new file mode 100644 index 0000000..826b6b0 --- /dev/null +++ b/benchmark/src/bench/rub/score.py @@ -0,0 +1,34 @@ +from __future__ import annotations + +from typing import Any + +from pydantic import BaseModel + +from .normalize import normalize_payload + + +class RubScore(BaseModel): + """Score result for a RUB task.""" + + score: float + ok: bool + error: str | None = None + + +def score_exact( + truth: Any, pred: Any, *, unordered_paths: list[str] | None = None +) -> RubScore: + """Compute exact-match score after normalization. + + Args: + truth: Ground-truth JSON object. + pred: Predicted JSON object. + unordered_paths: Dot paths for unordered list comparison. + + Returns: + RubScore with 1.0 for match, 0.0 otherwise. 
+ """ + truth_norm = normalize_payload(truth, unordered_paths=unordered_paths).value + pred_norm = normalize_payload(pred, unordered_paths=unordered_paths).value + ok = truth_norm == pred_norm + return RubScore(score=1.0 if ok else 0.0, ok=ok) From dc0539054e611db54aa51117fde1505a7f1a6b78 Mon Sep 17 00:00:00 2001 From: harumiWeb Date: Mon, 26 Jan 2026 22:45:59 +0900 Subject: [PATCH 30/38] feat: Add RUB lite support with manifest and evaluation tasks --- benchmark/README.md | 17 +++ benchmark/docs/spec.md | 142 ++++++++++-------- benchmark/rub/manifest_lite.json | 116 ++++++++++++++ benchmark/rub/truth_lite/basic_01.json | 7 + benchmark/rub/truth_lite/basic_form_01.json | 11 ++ .../certificate_of_employment_01.json | 18 +++ benchmark/rub/truth_lite/ffr_425_01.json | 13 ++ benchmark/rub/truth_lite/flowchart_01.json | 14 ++ benchmark/rub/truth_lite/flowchart_02.json | 12 ++ .../truth_lite/food_inspection_record_01.json | 7 + benchmark/rub/truth_lite/gantt_01.json | 8 + .../rub/truth_lite/heatstroke_flow_01.json | 9 ++ benchmark/rub/truth_lite/smartart_01.json | 13 ++ benchmark/rub/truth_lite/tax_report_01.json | 13 ++ benchmark/rub/truth_lite/workflow_01.json | 20 +++ benchmark/src/bench/cli.py | 20 ++- benchmark/src/bench/rub/manifest.py | 2 +- 17 files changed, 376 insertions(+), 66 deletions(-) create mode 100644 benchmark/rub/manifest_lite.json create mode 100644 benchmark/rub/truth_lite/basic_01.json create mode 100644 benchmark/rub/truth_lite/basic_form_01.json create mode 100644 benchmark/rub/truth_lite/certificate_of_employment_01.json create mode 100644 benchmark/rub/truth_lite/ffr_425_01.json create mode 100644 benchmark/rub/truth_lite/flowchart_01.json create mode 100644 benchmark/rub/truth_lite/flowchart_02.json create mode 100644 benchmark/rub/truth_lite/food_inspection_record_01.json create mode 100644 benchmark/rub/truth_lite/gantt_01.json create mode 100644 benchmark/rub/truth_lite/heatstroke_flow_01.json create mode 100644 benchmark/rub/truth_lite/smartart_01.json create mode 100644 benchmark/rub/truth_lite/tax_report_01.json create mode 100644 benchmark/rub/truth_lite/workflow_01.json diff --git a/benchmark/README.md b/benchmark/README.md index 3378a07..bdcfcf0 100644 --- a/benchmark/README.md +++ b/benchmark/README.md @@ -90,6 +90,23 @@ If you want a deterministic renderer without LLM calls: python -m bench.cli markdown --case all --method all --use-llm false ``` +## RUB (lite) + +RUB lite evaluates reconstruction utility using Markdown-only inputs. + +Run Stage B tasks with the lite manifest: + +```bash +python -m bench.cli rub-ask --task all --method all --manifest rub/manifest_lite.json +python -m bench.cli rub-eval --manifest rub/manifest_lite.json +python -m bench.cli rub-report +``` + +Outputs: + +- outputs/rub/results/rub_results.csv +- outputs/rub/results/report.md + ## Evaluation protocol (public) To ensure reproducibility and fair comparison, follow these fixed settings: diff --git a/benchmark/docs/spec.md b/benchmark/docs/spec.md index 35d85db..bdf10aa 100644 --- a/benchmark/docs/spec.md +++ b/benchmark/docs/spec.md @@ -1,77 +1,93 @@ # Reconstruction Utility Benchmark (RUB) Specification -## 1. 目的 +## 0. ????v0.1 / lite? -RUB は「再構築された Markdown が後続タスクにどれだけ耐えるか」を機械的に測る。 -Markdown 文字列の一致ではなく、構造利用性(Reconstruction Utility)を評価対象とする。 +???????? **RUB lite (v0.1)** ?????? +???????????????????? 0 ???????????????? -## 2. 評価対象 +RUB lite ??????? 
-- 入力: 同一 Excel 文書 -- 手法: pdf / image_vlm / exstruct / html / openpyxl -- 出力: 各手法で生成した Markdown -- 評価: Markdown のみを入力にした構造クエリの解答精度 +- benchmark/rub/manifest_lite.json +- benchmark/rub/truth_lite/*.json -## 3. 評価フロー(2段階) +????v1??????????????? -### Stage A: 再構築 +## 1. ?? -各手法で Markdown を生成する。 +RUB ???????? Markdown ???????????????????????? +Markdown ?????????????????Reconstruction Utility?????????? -- pdf: soffice → pdf → テキスト抽出 → Markdown -- image_vlm: 画像レンダリング → VLM → Markdown -- exstruct: exstruct JSON → LLM → Markdown -- html / openpyxl: テキスト抽出 → Markdown +## 2. ???? -### Stage B: 利用(採点対象) +- ??: ?? Excel ?? +- ??: pdf / image_vlm / exstruct / html / openpyxl +- ??: ???????? Markdown +- ??: Markdown ?????????????????? -Stage A の Markdown だけを入力に、構造クエリを解かせる。 +## 3. ??????2??? -- 出力は JSON のみ -- JSON はスキーマ固定 -- 採点は正規化後の完全一致(deterministic) +### Stage A: ??? -## 4. タスク設計方針 +???? Markdown ?????? -- 文字列一致に依存せず、構造理解を問う -- 手法間で不公平にならないよう、入力は Markdown のみ -- 各タスクは以下のいずれかに分類する - - 集合問題(項目一覧) - - グラフ問題(ノード/エッジ) - - 階層問題(親子関係) - - 表問題(行列の対応) -- 丸数字や装飾記号など、表記ゆれが大きい要素は避ける +- pdf: soffice ? pdf ? ?????? ? Markdown +- image_vlm: ???????? ? VLM ? Markdown +- exstruct: exstruct JSON ? LLM ? Markdown +- html / openpyxl: ?????? ? Markdown -## 5. 正規化(決定的) +### Stage B: ???????? -採点前に以下の正規化を行う。 +Stage A ? Markdown ?????????????????? -- 文字列: 前後空白削除、連続空白を 1 つに、改行は \n に統一 -- 辞書: キーソート(canonicalization) -- 配列: 順序が意味を持たないタスクはソート -- 数値: 可能な範囲で数値化(例: "012" → 12) +- ??? JSON ?? +- JSON ??????? +- ?????????????deterministic? -## 6. 採点指標 +## 4. ??????? -### 6.1 主指標: RUS +- ?????????????????? +- ?????????????????? Markdown ?? +- ????????????????? + - ?????????? + - ?????????/???? + - ?????????? + - ?????????? +- ????????????????????????? -RUS = 正解数 / 問題数 +## 5. ???????? -### 6.2 副指標 +?????????????? + +- ???: ???????????? 1 ?????? + ??? +- ??: ??????canonicalization? +- ??: ????????????????? +- ??: ???????????: "012" ? 12? + +## 6. ???? + +### 6.1 ???: RUS + +RUS = ??? / ??? + +### 6.2 ??? - Cost-normalized RUS = RUS / cost_usd - Token-normalized RUS = RUS / input_tokens -- Stage A failure rate = Markdown 生成失敗率 +- Stage A failure rate = Markdown ????? -## 7. データ構成 +## 7. ????? ``` benchmark/ rub/ README.md BENCHMARK_SPEC.md - manifest.json - truth/ + manifest.json # ??? (v1) + manifest_lite.json # ??? (v0.1 / lite) + truth/ # ??? (v1) + *.json + truth_lite/ # ??? (v0.1 / lite) *.json schemas/ *.schema.json @@ -83,28 +99,34 @@ benchmark/ scoring_flow.mmd ``` -## 8. manifest 仕様(案) +## 8. manifest ????? + +- id: ???ID +- type: ????? +- xlsx: ??????? +- question: Stage B ??? +- truth: ?? JSON ?? +- sheet_scope: ??????null ????? +- render: ??????? + +## 8.1 RUB lite ???? -- id: ケースID -- type: タスク種別 -- xlsx: 元ファイルパス -- question: Stage B クエリ -- truth: 正解 JSON パス -- sheet_scope: 対象シート(null なら全体) -- render: 画像レンダ設定 +- ?????????????? +- ??????unordered_paths????????????? +- 0/1 ???????????????????????? -## 9. 再現性 +## 9. ??? -- モデル名、温度、実行日時を記録 -- 正規化ルールと採点コードを完全公開 -- ランダム性がある工程は温度 0 固定 +- ??????????????? +- ????????????????? +- ????????????? 0 ?? -## 10. 公開時の注意 +## 10. ?????? -- 「Markdown 一致」は補助指標としてのみ扱う -- RUS(利用可能性)を主指標として説明する -- 用途が異なる手法を同一スコアで殴らない +- ?Markdown ??????????????? +- RUS?????????????????? +- ??????????????????? --- -この仕様は v1 とし、変更時は履歴を残す。 +????? v0.1?lite?????????????????? 
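As a worked illustration of the normalization rules in section 5 above, the following sketch runs `normalize_payload` from `benchmark/src/bench/rub/normalize.py`; the import again assumes `benchmark/src` is on the Python path:

```python
# Minimal sketch of the deterministic normalization from spec section 5.
from bench.rub.normalize import normalize_payload

payload = {
    "b": "  hello\r\nworld  ",  # whitespace trimmed/collapsed, newlines unified
    "a": ["012", "2", "10"],  # numeric strings parsed into ints
}

norm = normalize_payload(payload, unordered_paths=["a"])
# Keys are sorted, "012" becomes 12, and the "a" list is sorted by its
# canonical JSON form because it is declared unordered.
print(norm.value)  # {'a': [10, 12, 2], 'b': 'hello world'}
```

Note that unordered lists sort by canonical JSON text, so `[10, 12, 2]` is the expected (string-sorted) order; the point is determinism of comparison, not numeric ordering.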
diff --git a/benchmark/rub/manifest_lite.json b/benchmark/rub/manifest_lite.json new file mode 100644 index 0000000..5029568 --- /dev/null +++ b/benchmark/rub/manifest_lite.json @@ -0,0 +1,116 @@ +{ + "tasks": [ + { + "id": "ffr_425_01", + "source_case_id": "ffr_425_01", + "type": "application_form", + "question": "Extract section headers. JSON only: {\"section_headers\":[\"...\"]}", + "truth": "rub/truth_lite/ffr_425_01.json", + "unordered_paths": [ + "section_headers" + ] + }, + { + "id": "certificate_of_employment_01", + "source_case_id": "certificate_of_employment_01", + "type": "application_form", + "question": "Extract section names. JSON only: {\"sections\":[\"...\"]}", + "truth": "rub/truth_lite/certificate_of_employment_01.json", + "unordered_paths": [ + "sections" + ] + }, + { + "id": "tax_report_01", + "source_case_id": "tax_report_01", + "type": "application_form", + "question": "Extract income item labels. JSON only: {\"income_items\":[\"...\"]}", + "truth": "rub/truth_lite/tax_report_01.json", + "unordered_paths": [ + "income_items" + ] + }, + { + "id": "basic_01", + "source_case_id": "basic_01", + "type": "mixed_document", + "question": "Extract chart series names. JSON only: {\"chart_series\":[\"...\"]}", + "truth": "rub/truth_lite/basic_01.json", + "unordered_paths": [ + "chart_series" + ] + }, + { + "id": "heatstroke_flow_01", + "source_case_id": "heatstroke_flow_01", + "type": "flowchart", + "question": "Extract step names in order. JSON only: {\"steps\":[\"...\"]}", + "truth": "rub/truth_lite/heatstroke_flow_01.json" + }, + { + "id": "workflow_01", + "source_case_id": "workflow_01", + "type": "workflow", + "question": "Extract node names. JSON only: {\"nodes\":[\"...\"]}", + "truth": "rub/truth_lite/workflow_01.json", + "unordered_paths": [ + "nodes" + ] + }, + { + "id": "flowchart_02", + "source_case_id": "flowchart_02", + "type": "flowchart", + "question": "Extract node names. JSON only: {\"nodes\":[\"...\"]}", + "truth": "rub/truth_lite/flowchart_02.json", + "unordered_paths": [ + "nodes" + ] + }, + { + "id": "food_inspection_record_01", + "source_case_id": "food_inspection_record_01", + "type": "inspection_log", + "question": "Extract first date per sheet. JSON only: {\"dates_by_sheet\": {\"sheet\": \"date\"}}", + "truth": "rub/truth_lite/food_inspection_record_01.json" + }, + { + "id": "basic_form_01", + "source_case_id": "basic_form_01", + "type": "application_form", + "question": "Extract applicant field labels. JSON only: {\"applicant_fields\":[\"...\"]}", + "truth": "rub/truth_lite/basic_form_01.json", + "unordered_paths": [ + "applicant_fields" + ] + }, + { + "id": "flowchart_01", + "source_case_id": "flowchart_01", + "type": "flowchart", + "question": "Extract flowchart step names in order. JSON only: {\"steps\":[\"...\"]}", + "truth": "rub/truth_lite/flowchart_01.json" + }, + { + "id": "gantt_01", + "source_case_id": "gantt_01", + "type": "gantt", + "question": "Extract task names. JSON only: {\"task_names\":[\"...\"]}", + "truth": "rub/truth_lite/gantt_01.json", + "unordered_paths": [ + "task_names" + ] + }, + { + "id": "smartart_01", + "source_case_id": "smartart_01", + "type": "organization_chart", + "question": "Extract sales_departments and production_sites. 
JSON only: {\"sales_departments\":[\"...\"],\"production_sites\":[\"...\"]}", + "truth": "rub/truth_lite/smartart_01.json", + "unordered_paths": [ + "sales_departments", + "production_sites" + ] + } + ] +} \ No newline at end of file diff --git a/benchmark/rub/truth_lite/basic_01.json b/benchmark/rub/truth_lite/basic_01.json new file mode 100644 index 0000000..b4d2709 --- /dev/null +++ b/benchmark/rub/truth_lite/basic_01.json @@ -0,0 +1,7 @@ +{ + "chart_series": [ + "製品A", + "製品B", + "製品C" + ] +} \ No newline at end of file diff --git a/benchmark/rub/truth_lite/basic_form_01.json b/benchmark/rub/truth_lite/basic_form_01.json new file mode 100644 index 0000000..b65c3a5 --- /dev/null +++ b/benchmark/rub/truth_lite/basic_form_01.json @@ -0,0 +1,11 @@ +{ + "applicant_fields": [ + "フリガナ", + "被保険者氏名", + "生年月日", + "住所", + "連絡先", + "入所(院)した介護保険施設の所在地及び名称", + "入所(院)年月日" + ] +} \ No newline at end of file diff --git a/benchmark/rub/truth_lite/certificate_of_employment_01.json b/benchmark/rub/truth_lite/certificate_of_employment_01.json new file mode 100644 index 0000000..2cf13e4 --- /dev/null +++ b/benchmark/rub/truth_lite/certificate_of_employment_01.json @@ -0,0 +1,18 @@ +{ + "sections": [ + "業種", + "本人氏名", + "雇用(予定)期間等", + "本人就労先事業所", + "雇用の形態", + "就労時間(固定就労の場合)", + "就労時間(変則就労の場合)", + "就労実績", + "産前・産後休業の取得", + "育児休業の取得", + "産休・育休以外の休業の取得", + "復職(予定)年月日", + "育児のための短時間勤務制度利用有無", + "保育士等としての勤務実態の有無" + ] +} \ No newline at end of file diff --git a/benchmark/rub/truth_lite/ffr_425_01.json b/benchmark/rub/truth_lite/ffr_425_01.json new file mode 100644 index 0000000..dba5797 --- /dev/null +++ b/benchmark/rub/truth_lite/ffr_425_01.json @@ -0,0 +1,13 @@ +{ + "section_headers": [ + "Federal Agency and Organizational Element to Which Report is Submitted", + "Federal Grant or Other Identifying Number Assigned by Federal Agency", + "Recipient Organization (Name and complete address including Zip code)", + "DUNS Number", + "Recipient Account Number or Identifying Number", + "Report Type", + "Basis of Accounting", + "Project/Grant Period", + "Reporting Period End Date" + ] +} \ No newline at end of file diff --git a/benchmark/rub/truth_lite/flowchart_01.json b/benchmark/rub/truth_lite/flowchart_01.json new file mode 100644 index 0000000..b3b354c --- /dev/null +++ b/benchmark/rub/truth_lite/flowchart_01.json @@ -0,0 +1,14 @@ +{ + "steps": [ + "S", + "要件抽出", + "ヒアリング", + "非機能要件", + "思考実験", + "再検証", + "まとめ", + "文書作成", + "締結", + "E" + ] +} \ No newline at end of file diff --git a/benchmark/rub/truth_lite/flowchart_02.json b/benchmark/rub/truth_lite/flowchart_02.json new file mode 100644 index 0000000..3c2ecac --- /dev/null +++ b/benchmark/rub/truth_lite/flowchart_02.json @@ -0,0 +1,12 @@ +{ + "nodes": [ + "ログイン画面", + "登録情報を入力", + "入力内容は正しいか", + "再入力を提示", + "サーバーに認証リクエストを送信", + "認証に成功か", + "認証エラーを提示", + "操作画面に遷移" + ] +} \ No newline at end of file diff --git a/benchmark/rub/truth_lite/food_inspection_record_01.json b/benchmark/rub/truth_lite/food_inspection_record_01.json new file mode 100644 index 0000000..cf2f64f --- /dev/null +++ b/benchmark/rub/truth_lite/food_inspection_record_01.json @@ -0,0 +1,7 @@ +{ + "dates_by_sheet": { + "検食簿(1)": "12月1日(月)", + "検食簿 (2)": "12月8日(月)", + "検食簿 (3)": "12月15日(月)" + } +} \ No newline at end of file diff --git a/benchmark/rub/truth_lite/gantt_01.json b/benchmark/rub/truth_lite/gantt_01.json new file mode 100644 index 0000000..e9823e9 --- /dev/null +++ b/benchmark/rub/truth_lite/gantt_01.json @@ -0,0 +1,8 @@ +{ + "task_names": [ + "Core Feature Dev", + "Edge Case Handling", + 
"Integration Work", + "Internal Review" + ] +} \ No newline at end of file diff --git a/benchmark/rub/truth_lite/heatstroke_flow_01.json b/benchmark/rub/truth_lite/heatstroke_flow_01.json new file mode 100644 index 0000000..beb233c --- /dev/null +++ b/benchmark/rub/truth_lite/heatstroke_flow_01.json @@ -0,0 +1,9 @@ +{ + "steps": [ + "発見", + "報告", + "初期対応", + "医療機関搬送・救急要請", + "事後対応・記録" + ] +} \ No newline at end of file diff --git a/benchmark/rub/truth_lite/smartart_01.json b/benchmark/rub/truth_lite/smartart_01.json new file mode 100644 index 0000000..051fd53 --- /dev/null +++ b/benchmark/rub/truth_lite/smartart_01.json @@ -0,0 +1,13 @@ +{ + "sales_departments": [ + "第1営業課", + "第2営業課", + "第3営業課", + "海外営業課" + ], + "production_sites": [ + "愛知工場", + "山形工場", + "高知工場" + ] +} \ No newline at end of file diff --git a/benchmark/rub/truth_lite/tax_report_01.json b/benchmark/rub/truth_lite/tax_report_01.json new file mode 100644 index 0000000..369bd8e --- /dev/null +++ b/benchmark/rub/truth_lite/tax_report_01.json @@ -0,0 +1,13 @@ +{ + "income_items": [ + "事業(営業等)", + "事業(農業)", + "不動産", + "利子", + "配当", + "給与", + "公的年金等", + "業務", + "その他" + ] +} \ No newline at end of file diff --git a/benchmark/rub/truth_lite/workflow_01.json b/benchmark/rub/truth_lite/workflow_01.json new file mode 100644 index 0000000..e1afa6e --- /dev/null +++ b/benchmark/rub/truth_lite/workflow_01.json @@ -0,0 +1,20 @@ +{ + "nodes": [ + "商品検索", + "商品情報を表示", + "検討", + "キャンセル", + "カートに追加", + "在庫確認", + "レジに進む", + "支払い方法の選択", + "支払いの案内", + "支払い処理", + "注文の確定", + "配送先入力", + "配送先確認", + "注文確認メールを送信", + "商品を準備・発送", + "商品受取" + ] +} \ No newline at end of file diff --git a/benchmark/src/bench/cli.py b/benchmark/src/bench/cli.py index 0e1db15..de3eb24 100644 --- a/benchmark/src/bench/cli.py +++ b/benchmark/src/bench/cli.py @@ -202,12 +202,17 @@ def _select_methods(method: str) -> list[str]: return deduped -def _rub_manifest_path() -> Path: +def _rub_manifest_path(manifest_path: str | None) -> Path: """Return the path to the RUB manifest. + Args: + manifest_path: Optional override path from CLI. + Returns: - Path to rub/manifest.json. + Path to the RUB manifest file. """ + if manifest_path: + return resolve_path(manifest_path) return RUB_MANIFEST @@ -566,6 +571,7 @@ def rub_ask( method: str = "all", model: str = "gpt-4o", temperature: float = 0.0, + manifest: str | None = None, ) -> None: """Run RUB Stage B queries using Markdown outputs as context. @@ -574,8 +580,9 @@ def rub_ask( method: Comma-separated method names or "all". model: OpenAI model name for Stage B queries. temperature: Sampling temperature for the model. + manifest: Optional RUB manifest path override. """ - rub_manifest = load_rub_manifest(_rub_manifest_path()) + rub_manifest = load_rub_manifest(_rub_manifest_path(manifest)) tasks = _select_tasks(rub_manifest.tasks, task) if not tasks: raise typer.BadParameter(f"No tasks matched: {task}") @@ -634,14 +641,17 @@ def rub_ask( @app.command() -def rub_eval(task: str = "all", method: str = "all") -> None: +def rub_eval( + task: str = "all", method: str = "all", manifest: str | None = None +) -> None: """Evaluate RUB responses and write results CSV. Args: task: Comma-separated task ids or "all". method: Comma-separated method names or "all". + manifest: Optional RUB manifest path override. 
""" - rub_manifest = load_rub_manifest(_rub_manifest_path()) + rub_manifest = load_rub_manifest(_rub_manifest_path(manifest)) tasks = _select_tasks(rub_manifest.tasks, task) if not tasks: raise typer.BadParameter(f"No tasks matched: {task}") diff --git a/benchmark/src/bench/rub/manifest.py b/benchmark/src/bench/rub/manifest.py index 9eda762..4708a33 100644 --- a/benchmark/src/bench/rub/manifest.py +++ b/benchmark/src/bench/rub/manifest.py @@ -33,5 +33,5 @@ def load_rub_manifest(path: Path) -> RubManifest: Returns: Parsed RubManifest. """ - data = json.loads(path.read_text(encoding="utf-8")) + data = json.loads(path.read_text(encoding="utf-8-sig")) return RubManifest(**data) From 522c902fe5047dc16f76cbe80bf8bc180f0763b1 Mon Sep 17 00:00:00 2001 From: harumiWeb Date: Tue, 27 Jan 2026 21:58:58 +0900 Subject: [PATCH 31/38] feat: Enhance Markdown functionality with full-document generation and scoring metrics --- README.ja.md | 2 +- README.md | 2 +- benchmark/src/bench/cli.py | 131 ++++++++++++++++++++- benchmark/src/bench/eval/markdown_score.py | 18 ++- benchmark/src/bench/llm/openai_client.py | 96 +++++++++++++++ benchmark/src/bench/paths.py | 2 + benchmark/src/bench/rub/normalize.py | 17 ++- benchmark/src/bench/rub/score.py | 93 +++++++++++++++ docs/README.en.md | 2 +- docs/README.ja.md | 2 +- 10 files changed, 354 insertions(+), 11 deletions(-) diff --git a/README.ja.md b/README.ja.md index 122a1f9..d1b8583 100644 --- a/README.ja.md +++ b/README.ja.md @@ -1,6 +1,6 @@ # ExStruct — Excel 構造化抽出エンジン -[![PyPI version](https://badge.fury.io/py/exstruct.svg)](https://pypi.org/project/exstruct/) [![PyPI Downloads](https://static.pepy.tech/personalized-badge/exstruct?period=total&units=INTERNATIONAL_SYSTEM&left_color=BLACK&right_color=GREEN&left_text=downloads)](https://pepy.tech/projects/exstruct) ![Licence: BSD-3-Clause](https://img.shields.io/badge/license-BSD--3--Clause-blue?style=flat-square) [![pytest](https://github.com/harumiWeb/exstruct/actions/workflows/pytest.yml/badge.svg)](https://github.com/harumiWeb/exstruct/actions/workflows/pytest.yml) [![Codacy Badge](https://app.codacy.com/project/badge/Grade/e081cb4f634e4175b259eb7c34f54f60)](https://app.codacy.com/gh/harumiWeb/exstruct/dashboard?utm_source=gh&utm_medium=referral&utm_content=&utm_campaign=Badge_grade) [![codecov](https://codecov.io/gh/harumiWeb/exstruct/graph/badge.svg?token=2XI1O8TTA9)](https://codecov.io/gh/harumiWeb/exstruct) +[![PyPI version](https://badge.fury.io/py/exstruct.svg)](https://pypi.org/project/exstruct/) [![PyPI Downloads](https://static.pepy.tech/personalized-badge/exstruct?period=total&units=INTERNATIONAL_SYSTEM&left_color=BLACK&right_color=GREEN&left_text=downloads)](https://pepy.tech/projects/exstruct) ![Licence: BSD-3-Clause](https://img.shields.io/badge/license-BSD--3--Clause-blue?style=flat-square) [![pytest](https://github.com/harumiWeb/exstruct/actions/workflows/pytest.yml/badge.svg)](https://github.com/harumiWeb/exstruct/actions/workflows/pytest.yml) [![Codacy Badge](https://app.codacy.com/project/badge/Grade/e081cb4f634e4175b259eb7c34f54f60)](https://app.codacy.com/gh/harumiWeb/exstruct/dashboard?utm_source=gh&utm_medium=referral&utm_content=&utm_campaign=Badge_grade) [![codecov](https://codecov.io/gh/harumiWeb/exstruct/graph/badge.svg?token=2XI1O8TTA9)](https://codecov.io/gh/harumiWeb/exstruct) [![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/harumiWeb/exstruct) ![ExStruct Image](docs/assets/icon.webp) diff --git a/README.md b/README.md index 1d78e3e..6f0aeb2 100644 --- 
a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # ExStruct — Excel Structured Extraction Engine -[![PyPI version](https://badge.fury.io/py/exstruct.svg)](https://pypi.org/project/exstruct/) [![PyPI Downloads](https://static.pepy.tech/personalized-badge/exstruct?period=total&units=INTERNATIONAL_SYSTEM&left_color=BLACK&right_color=GREEN&left_text=downloads)](https://pepy.tech/projects/exstruct) ![Licence: BSD-3-Clause](https://img.shields.io/badge/license-BSD--3--Clause-blue?style=flat-square) [![pytest](https://github.com/harumiWeb/exstruct/actions/workflows/pytest.yml/badge.svg)](https://github.com/harumiWeb/exstruct/actions/workflows/pytest.yml) [![Codacy Badge](https://app.codacy.com/project/badge/Grade/e081cb4f634e4175b259eb7c34f54f60)](https://app.codacy.com/gh/harumiWeb/exstruct/dashboard?utm_source=gh&utm_medium=referral&utm_content=&utm_campaign=Badge_grade) [![codecov](https://codecov.io/gh/harumiWeb/exstruct/graph/badge.svg?token=2XI1O8TTA9)](https://codecov.io/gh/harumiWeb/exstruct) +[![PyPI version](https://badge.fury.io/py/exstruct.svg)](https://pypi.org/project/exstruct/) [![PyPI Downloads](https://static.pepy.tech/personalized-badge/exstruct?period=total&units=INTERNATIONAL_SYSTEM&left_color=BLACK&right_color=GREEN&left_text=downloads)](https://pepy.tech/projects/exstruct) ![Licence: BSD-3-Clause](https://img.shields.io/badge/license-BSD--3--Clause-blue?style=flat-square) [![pytest](https://github.com/harumiWeb/exstruct/actions/workflows/pytest.yml/badge.svg)](https://github.com/harumiWeb/exstruct/actions/workflows/pytest.yml) [![Codacy Badge](https://app.codacy.com/project/badge/Grade/e081cb4f634e4175b259eb7c34f54f60)](https://app.codacy.com/gh/harumiWeb/exstruct/dashboard?utm_source=gh&utm_medium=referral&utm_content=&utm_campaign=Badge_grade) [![codecov](https://codecov.io/gh/harumiWeb/exstruct/graph/badge.svg?token=2XI1O8TTA9)](https://codecov.io/gh/harumiWeb/exstruct) [![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/harumiWeb/exstruct) ![ExStruct Image](docs/assets/icon.webp) diff --git a/benchmark/src/bench/cli.py b/benchmark/src/bench/cli.py index de3eb24..bcdb443 100644 --- a/benchmark/src/bench/cli.py +++ b/benchmark/src/bench/cli.py @@ -27,6 +27,8 @@ DATA_DIR, EXTRACTED_DIR, MARKDOWN_DIR, + MARKDOWN_FULL_DIR, + MARKDOWN_FULL_RESPONSES_DIR, MARKDOWN_RESPONSES_DIR, PROMPTS_DIR, RESPONSES_DIR, @@ -45,7 +47,7 @@ from .pipeline.openpyxl_pandas import extract_openpyxl from .pipeline.pdf_text import pdf_to_text, xlsx_to_pdf from .rub.manifest import RubTask, load_rub_manifest -from .rub.score import score_exact +from .rub.score import RubPartialScore, score_exact, score_partial app = typer.Typer(add_completion=False) console = Console() @@ -122,6 +124,9 @@ class RubResultRow(BaseModel): method: str model: str | None score: float + partial_precision: float | None = None + partial_recall: float | None = None + partial_f1: float | None = None ok: bool input_tokens: int output_tokens: int @@ -291,6 +296,13 @@ def _reset_markdown_outputs(case_id: str) -> None: path.unlink() +def _reset_markdown_full_outputs(case_id: str) -> None: + """Delete existing full-markdown logs for a case.""" + path = MARKDOWN_FULL_RESPONSES_DIR / f"{case_id}.jsonl" + if path.exists(): + path.unlink() + + def _dump_jsonl(obj: BaseModel) -> str: """Serialize a record for JSONL output. 
@@ -565,12 +577,109 @@ def markdown( ) +@app.command() +def markdown_full( + case: str = "all", + method: str = "all", + model: str = "gpt-4o", + temperature: float = 0.0, +) -> None: + """Generate full-document Markdown from extracted contexts. + + Args: + case: Comma-separated case ids or "all". + method: Comma-separated method names or "all". + model: OpenAI model name for Markdown conversion. + temperature: Sampling temperature for the model. + """ + mf = load_manifest(_manifest_path()) + cases = _select_cases(mf.cases, case) + if not cases: + raise typer.BadParameter(f"No cases matched: {case}") + methods = _select_methods(method) + + client = OpenAIResponsesClient() + ensure_dir(MARKDOWN_FULL_DIR) + ensure_dir(MARKDOWN_FULL_RESPONSES_DIR) + total_cost = 0.0 + total_calls = 0 + + for c in cases: + console.rule(f"MARKDOWN FULL {c.id}") + _reset_markdown_full_outputs(c.id) + case_dir = MARKDOWN_FULL_DIR / c.id + ensure_dir(case_dir) + md_file = MARKDOWN_FULL_RESPONSES_DIR / f"{c.id}.jsonl" + + for m in methods: + try: + if m == "image_vlm": + img_dir = EXTRACTED_DIR / "image_vlm" / c.id + images_json = img_dir / "images.json" + if not images_json.exists(): + print(f"[yellow]skip: missing images for {c.id}[/yellow]") + continue + imgs = json.loads(images_json.read_text(encoding="utf-8"))["images"] + img_paths = [Path(p) for p in imgs] + if not img_paths: + print(f"[yellow]skip: no images for {c.id}[/yellow]") + continue + prompt_hash = sha256_text("|".join([p.name for p in img_paths])) + res = client.ask_markdown_images( + model=model, image_paths=img_paths, temperature=temperature + ) + else: + txt_path = EXTRACTED_DIR / m / f"{c.id}.txt" + if not txt_path.exists(): + print( + f"[yellow]skip: missing context for {c.id} ({m})[/yellow]" + ) + continue + context_text = txt_path.read_text(encoding="utf-8") + prompt_hash = sha256_text(context_text) + res = client.ask_markdown_from_text( + model=model, + context_text=context_text, + temperature=temperature, + ) + + md_text = res.text + md_rec = MarkdownRecord( + case_id=c.id, + method=m, + model=model, + temperature=temperature, + prompt_hash=prompt_hash, + text=md_text, + input_tokens=res.input_tokens, + output_tokens=res.output_tokens, + cost_usd=res.cost_usd, + raw=res.raw, + ) + total_cost += res.cost_usd + total_calls += 1 + line = _dump_jsonl(md_rec) + with md_file.open("a", encoding="utf-8") as f: + f.write(line + "\n") + + out_md = case_dir / f"{m}.md" + out_md.write_text(md_text, encoding="utf-8") + print(f"[green]{c.id} {m} -> {out_md}[/green]") + except Exception as exc: + print(f"[yellow]skip: markdown full {c.id} {m} ({exc})[/yellow]") + + print( + f"[green]Markdown full cost: ${total_cost:.6f} ({total_calls} call(s))[/green]" + ) + + @app.command() def rub_ask( task: str = "all", method: str = "all", model: str = "gpt-4o", temperature: float = 0.0, + context: str = "partial", manifest: str | None = None, ) -> None: """Run RUB Stage B queries using Markdown outputs as context. @@ -580,6 +689,7 @@ def rub_ask( method: Comma-separated method names or "all". model: OpenAI model name for Stage B queries. temperature: Sampling temperature for the model. + context: Markdown source ("partial" or "full"). manifest: Optional RUB manifest path override. 
""" rub_manifest = load_rub_manifest(_rub_manifest_path(manifest)) @@ -587,6 +697,10 @@ def rub_ask( if not tasks: raise typer.BadParameter(f"No tasks matched: {task}") methods = _select_methods(method) + context_key = context.lower().strip() + if context_key not in {"partial", "full"}: + raise typer.BadParameter(f"Invalid context: {context}") + md_root = MARKDOWN_DIR if context_key == "partial" else MARKDOWN_FULL_DIR ensure_dir(RUB_OUT_DIR) ensure_dir(RUB_PROMPTS_DIR) @@ -601,7 +715,7 @@ def rub_ask( _reset_rub_outputs(t.id) resp_file = RUB_RESPONSES_DIR / f"{t.id}.jsonl" for m in methods: - md_path = MARKDOWN_DIR / t.source_case_id / f"{m}.md" + md_path = md_root / t.source_case_id / f"{m}.md" if not md_path.exists(): print(f"[yellow]skip: missing markdown {t.id} {m}[/yellow]") continue @@ -677,6 +791,7 @@ def rub_eval( for m, rec in latest.items(): score = 0.0 ok = False + partial: RubPartialScore | None = None err: str | None = None try: pred_obj = normalize_json_text(rec["text"]) @@ -685,6 +800,9 @@ def rub_eval( ) score = score_res.score ok = score_res.ok + partial = score_partial( + truth, pred_obj, unordered_paths=t.unordered_paths + ) except Exception as exc: err = str(exc) @@ -696,6 +814,9 @@ def rub_eval( method=m, model=rec.get("model"), score=score, + partial_precision=partial.precision if partial else None, + partial_recall=partial.recall if partial else None, + partial_f1=partial.f1 if partial else None, ok=ok, input_tokens=int(rec.get("input_tokens", 0)), output_tokens=int(rec.get("output_tokens", 0)), @@ -725,6 +846,12 @@ def rub_report() -> None: "avg_cost": ("cost_usd", "mean"), "n": ("task_id", "count"), } + if "partial_precision" in df.columns and df["partial_precision"].notna().any(): + agg["partial_precision"] = ("partial_precision", "mean") + if "partial_recall" in df.columns and df["partial_recall"].notna().any(): + agg["partial_recall"] = ("partial_recall", "mean") + if "partial_f1" in df.columns and df["partial_f1"].notna().any(): + agg["partial_f1"] = ("partial_f1", "mean") g = df.groupby("method").agg(**agg).reset_index() detail_dir = RUB_RESULTS_DIR / "detailed_reports" diff --git a/benchmark/src/bench/eval/markdown_score.py b/benchmark/src/bench/eval/markdown_score.py index 4e165a6..9178cdc 100644 --- a/benchmark/src/bench/eval/markdown_score.py +++ b/benchmark/src/bench/eval/markdown_score.py @@ -6,6 +6,15 @@ _TABLE_SEPARATOR = re.compile(r"^[\s|:-]+$") _WS_PATTERN = re.compile(r"\s+") _NUMERIC_PATTERN = re.compile(r"[+-]?\d+(?:[.,]\d+)?") +_DOT_SEPARATORS = re.compile(r"[\u30fb\uff65\u00b7\u2022\u2219]") +_ZERO_WIDTH_PATTERN = re.compile(r"[\u200b\u200c\u200d\ufeff]") +_WEEKDAY_PAREN = re.compile( + r"(?:\uFF08|\()" + r"(?:\u6708|\u706B|\u6C34|\u6728|\u91D1|\u571F|\u65E5)" + r"(?:\uFF09|\))" +) +_PAREN = re.compile(r"[\uFF08\uFF09()]") +_NON_ASCII_SPACE_PATTERN = re.compile(r"(?<=[^\x00-\x7F])\s+(?=[^\x00-\x7F])") def markdown_coverage_score(truth_md: str, pred_md: str) -> float: @@ -77,8 +86,13 @@ def _normalize_line(line: str) -> str: text = text.replace("*", "") text = text.replace(">", "") text = unicodedata.normalize("NFKC", text) - text = text.replace("窶サ", "") - text = _WS_PATTERN.sub("", text) + text = text.replace("\u3000", " ") + text = _ZERO_WIDTH_PATTERN.sub("", text) + text = _WEEKDAY_PAREN.sub("", text) + text = _PAREN.sub("", text) + text = _DOT_SEPARATORS.sub("", text) + text = _WS_PATTERN.sub(" ", text) + text = _NON_ASCII_SPACE_PATTERN.sub("", text) return text.strip() diff --git a/benchmark/src/bench/llm/openai_client.py 
b/benchmark/src/bench/llm/openai_client.py index da3106d..b3a8c7e 100644 --- a/benchmark/src/bench/llm/openai_client.py +++ b/benchmark/src/bench/llm/openai_client.py @@ -200,3 +200,99 @@ def ask_markdown( cost_usd=cost, raw=raw, ) + + def ask_markdown_from_text( + self, *, model: str, context_text: str, temperature: float + ) -> LLMResult: + """Call Responses API to convert raw text into Markdown. + + Args: + model: OpenAI model name (e.g., "gpt-4o"). + context_text: Extracted document text to format. + temperature: Sampling temperature for the response. + + Returns: + LLMResult containing the model output and usage metadata. + """ + instructions = ( + "You are a strict Markdown formatter. Output Markdown only.\n" + "Rules:\n" + "- Preserve all content from the input.\n" + "- Use headings and lists when they are clearly implied.\n" + "- Use tables when a row/column structure is evident.\n" + "- Do not add or invent information.\n" + ) + resp = self.client.responses.create( + model=model, + temperature=temperature, + input=[ + { + "role": "user", + "content": [ + {"type": "input_text", "text": instructions}, + {"type": "input_text", "text": f"[TEXT]\n{context_text}"}, + ], + } + ], + ) + + text = resp.output_text + usage = getattr(resp, "usage", None) + in_tok, out_tok = _extract_usage_tokens(usage) + cost = estimate_cost_usd(model, in_tok, out_tok) + + raw = json.loads(resp.model_dump_json()) + return LLMResult( + text=text, + input_tokens=in_tok, + output_tokens=out_tok, + cost_usd=cost, + raw=raw, + ) + + def ask_markdown_images( + self, *, model: str, image_paths: list[Path], temperature: float + ) -> LLMResult: + """Call Responses API to convert images into Markdown. + + Args: + model: OpenAI model name (e.g., "gpt-4o"). + image_paths: PNG image paths to include as vision input. + temperature: Sampling temperature for the response. + + Returns: + LLMResult containing the model output and usage metadata. + """ + instructions = ( + "You are a strict Markdown formatter. 
Output Markdown only.\n" + "Rules:\n" + "- Preserve all visible content from the images.\n" + "- Use headings and lists when they are clearly implied.\n" + "- Use tables when a row/column structure is evident.\n" + "- Do not add or invent information.\n" + ) + content: list[dict[str, Any]] = [ + {"type": "input_text", "text": instructions}, + ] + for p in image_paths: + content.append({"type": "input_image", "image_url": _png_to_data_url(p)}) + + resp = self.client.responses.create( + model=model, + temperature=temperature, + input=[{"role": "user", "content": content}], + ) + + text = resp.output_text + usage = getattr(resp, "usage", None) + in_tok, out_tok = _extract_usage_tokens(usage) + cost = estimate_cost_usd(model, in_tok, out_tok) + + raw = json.loads(resp.model_dump_json()) + return LLMResult( + text=text, + input_tokens=in_tok, + output_tokens=out_tok, + cost_usd=cost, + raw=raw, + ) diff --git a/benchmark/src/bench/paths.py b/benchmark/src/bench/paths.py index 579b730..cba6fac 100644 --- a/benchmark/src/bench/paths.py +++ b/benchmark/src/bench/paths.py @@ -13,6 +13,8 @@ RESPONSES_DIR = OUT_DIR / "responses" MARKDOWN_DIR = OUT_DIR / "markdown" MARKDOWN_RESPONSES_DIR = MARKDOWN_DIR / "responses" +MARKDOWN_FULL_DIR = OUT_DIR / "markdown_full" +MARKDOWN_FULL_RESPONSES_DIR = MARKDOWN_FULL_DIR / "responses" RESULTS_DIR = OUT_DIR / "results" RUB_DIR = ROOT / "rub" RUB_MANIFEST = RUB_DIR / "manifest.json" diff --git a/benchmark/src/bench/rub/normalize.py b/benchmark/src/bench/rub/normalize.py index 1243635..d3a4a12 100644 --- a/benchmark/src/bench/rub/normalize.py +++ b/benchmark/src/bench/rub/normalize.py @@ -2,6 +2,7 @@ import json import re +import unicodedata from typing import Any from pydantic import BaseModel @@ -13,6 +14,11 @@ class NormalizedPayload(BaseModel): value: Any +_WS_PATTERN = re.compile(r"\s+") +_ZERO_WIDTH_PATTERN = re.compile(r"[\u200b\u200c\u200d\ufeff]") +_NON_ASCII_SPACE_PATTERN = re.compile(r"(?<=[^\x00-\x7F])\s+(?=[^\x00-\x7F])") + + def _normalize_text(value: str) -> str: """Normalize a string for comparison. @@ -22,9 +28,14 @@ def _normalize_text(value: str) -> str: Returns: Normalized string. """ - text = value.replace("\r\n", "\n").replace("\r", "\n").strip() - text = re.sub(r"\s+", " ", text) - return text + text = value.replace("\r\n", "\n").replace("\r", "\n") + text = unicodedata.normalize("NFKC", text) + text = text.replace("\u3000", " ") + text = _ZERO_WIDTH_PATTERN.sub("", text) + text = text.strip() + text = _WS_PATTERN.sub(" ", text) + text = _NON_ASCII_SPACE_PATTERN.sub("", text) + return text.strip() def _maybe_parse_number(value: str) -> int | float | str: diff --git a/benchmark/src/bench/rub/score.py b/benchmark/src/bench/rub/score.py index 826b6b0..bc7f9c7 100644 --- a/benchmark/src/bench/rub/score.py +++ b/benchmark/src/bench/rub/score.py @@ -1,5 +1,6 @@ from __future__ import annotations +from collections import Counter from typing import Any from pydantic import BaseModel @@ -15,6 +16,55 @@ class RubScore(BaseModel): error: str | None = None +class RubPartialScore(BaseModel): + """Partial match score for a RUB task.""" + + precision: float + recall: float + f1: float + + +def _tokenize_scalar(value: Any) -> str | None: + """Convert a scalar to a comparable token. + + Args: + value: Scalar value. + + Returns: + Token string or None for empty values. 
+ """ + if value is None: + return None + if isinstance(value, str): + token = value.strip() + return token or None + return str(value) + + +def _flatten_tokens(value: Any) -> list[str]: + """Flatten a JSON-like value into scalar tokens. + + Args: + value: Normalized JSON value. + + Returns: + List of scalar tokens. + """ + tokens: list[str] = [] + if isinstance(value, dict): + for v in value.values(): + tokens.extend(_flatten_tokens(v)) + return tokens + if isinstance(value, list): + for item in value: + tokens.extend(_flatten_tokens(item)) + return tokens + token = _tokenize_scalar(value) + if token is not None: + tokens.append(token) + return tokens + + def score_exact( truth: Any, pred: Any, *, unordered_paths: list[str] | None = None ) -> RubScore: @@ -32,3 +82,46 @@ def score_exact( pred_norm = normalize_payload(pred, unordered_paths=unordered_paths).value ok = truth_norm == pred_norm return RubScore(score=1.0 if ok else 0.0, ok=ok) + + +def score_partial( + truth: Any, pred: Any, *, unordered_paths: list[str] | None = None +) -> RubPartialScore: + """Compute partial-match precision/recall/F1 after normalization. + + Args: + truth: Ground-truth JSON object. + pred: Predicted JSON object. + unordered_paths: Dot paths for unordered list comparison. + + Returns: + RubPartialScore with precision/recall/F1. + """ + truth_norm = normalize_payload(truth, unordered_paths=unordered_paths).value + pred_norm = normalize_payload(pred, unordered_paths=unordered_paths).value + + truth_tokens = _flatten_tokens(truth_norm) + pred_tokens = _flatten_tokens(pred_norm) + + truth_counts = Counter(truth_tokens) + pred_counts = Counter(pred_tokens) + overlap = sum((truth_counts & pred_counts).values()) + + truth_total = sum(truth_counts.values()) + pred_total = sum(pred_counts.values()) + + if pred_total == 0: + precision = 1.0 if truth_total == 0 else 0.0 + else: + precision = overlap / pred_total + if truth_total == 0: + recall = 1.0 if pred_total == 0 else 0.0 + else: + recall = overlap / truth_total + + if precision + recall == 0: + f1 = 0.0 + else: + f1 = 2 * precision * recall / (precision + recall) + + return RubPartialScore(precision=precision, recall=recall, f1=f1) diff --git a/docs/README.en.md b/docs/README.en.md index c63ea03..1110e14 100644 --- a/docs/README.en.md +++ b/docs/README.en.md @@ -1,6 +1,6 @@ # ExStruct — Excel Structured Extraction Engine -[![PyPI version](https://badge.fury.io/py/exstruct.svg)](https://pypi.org/project/exstruct/) [![PyPI Downloads](https://static.pepy.tech/personalized-badge/exstruct?period=total&units=INTERNATIONAL_SYSTEM&left_color=BLACK&right_color=GREEN&left_text=downloads)](https://pepy.tech/projects/exstruct) ![Licence: BSD-3-Clause](https://img.shields.io/badge/license-BSD--3--Clause-blue?style=flat-square) [![pytest](https://github.com/harumiWeb/exstruct/actions/workflows/pytest.yml/badge.svg)](https://github.com/harumiWeb/exstruct/actions/workflows/pytest.yml) [![Codacy Badge](https://app.codacy.com/project/badge/Grade/e081cb4f634e4175b259eb7c34f54f60)](https://app.codacy.com/gh/harumiWeb/exstruct/dashboard?utm_source=gh&utm_medium=referral&utm_content=&utm_campaign=Badge_grade) [![codecov](https://codecov.io/gh/harumiWeb/exstruct/graph/badge.svg?token=2XI1O8TTA9)](https://codecov.io/gh/harumiWeb/exstruct) +[![PyPI version](https://badge.fury.io/py/exstruct.svg)](https://pypi.org/project/exstruct/) [![PyPI 
Downloads](https://static.pepy.tech/personalized-badge/exstruct?period=total&units=INTERNATIONAL_SYSTEM&left_color=BLACK&right_color=GREEN&left_text=downloads)](https://pepy.tech/projects/exstruct) ![Licence: BSD-3-Clause](https://img.shields.io/badge/license-BSD--3--Clause-blue?style=flat-square) [![pytest](https://github.com/harumiWeb/exstruct/actions/workflows/pytest.yml/badge.svg)](https://github.com/harumiWeb/exstruct/actions/workflows/pytest.yml) [![Codacy Badge](https://app.codacy.com/project/badge/Grade/e081cb4f634e4175b259eb7c34f54f60)](https://app.codacy.com/gh/harumiWeb/exstruct/dashboard?utm_source=gh&utm_medium=referral&utm_content=&utm_campaign=Badge_grade) [![codecov](https://codecov.io/gh/harumiWeb/exstruct/graph/badge.svg?token=2XI1O8TTA9)](https://codecov.io/gh/harumiWeb/exstruct) [![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/harumiWeb/exstruct) ![ExStruct Image](assets/icon.webp) diff --git a/docs/README.ja.md b/docs/README.ja.md index 17595ef..69e02df 100644 --- a/docs/README.ja.md +++ b/docs/README.ja.md @@ -1,6 +1,6 @@ # ExStruct — Excel 構造化抽出エンジン -[![PyPI version](https://badge.fury.io/py/exstruct.svg)](https://pypi.org/project/exstruct/) [![PyPI Downloads](https://static.pepy.tech/personalized-badge/exstruct?period=total&units=INTERNATIONAL_SYSTEM&left_color=BLACK&right_color=GREEN&left_text=downloads)](https://pepy.tech/projects/exstruct) ![Licence: BSD-3-Clause](https://img.shields.io/badge/license-BSD--3--Clause-blue?style=flat-square) [![pytest](https://github.com/harumiWeb/exstruct/actions/workflows/pytest.yml/badge.svg)](https://github.com/harumiWeb/exstruct/actions/workflows/pytest.yml) [![Codacy Badge](https://app.codacy.com/project/badge/Grade/e081cb4f634e4175b259eb7c34f54f60)](https://app.codacy.com/gh/harumiWeb/exstruct/dashboard?utm_source=gh&utm_medium=referral&utm_content=&utm_campaign=Badge_grade) [![codecov](https://codecov.io/gh/harumiWeb/exstruct/graph/badge.svg?token=2XI1O8TTA9)](https://codecov.io/gh/harumiWeb/exstruct) +[![PyPI version](https://badge.fury.io/py/exstruct.svg)](https://pypi.org/project/exstruct/) [![PyPI Downloads](https://static.pepy.tech/personalized-badge/exstruct?period=total&units=INTERNATIONAL_SYSTEM&left_color=BLACK&right_color=GREEN&left_text=downloads)](https://pepy.tech/projects/exstruct) ![Licence: BSD-3-Clause](https://img.shields.io/badge/license-BSD--3--Clause-blue?style=flat-square) [![pytest](https://github.com/harumiWeb/exstruct/actions/workflows/pytest.yml/badge.svg)](https://github.com/harumiWeb/exstruct/actions/workflows/pytest.yml) [![Codacy Badge](https://app.codacy.com/project/badge/Grade/e081cb4f634e4175b259eb7c34f54f60)](https://app.codacy.com/gh/harumiWeb/exstruct/dashboard?utm_source=gh&utm_medium=referral&utm_content=&utm_campaign=Badge_grade) [![codecov](https://codecov.io/gh/harumiWeb/exstruct/graph/badge.svg?token=2XI1O8TTA9)](https://codecov.io/gh/harumiWeb/exstruct) [![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/harumiWeb/exstruct) ![ExStruct Image](/assets/icon.webp) From f48afd14b2cc8d7f6a5799db8d0ab3030b868cf2 Mon Sep 17 00:00:00 2001 From: harumiWeb Date: Tue, 27 Jan 2026 22:25:37 +0900 Subject: [PATCH 32/38] feat: Refactor cost estimation to use a pricing dictionary for model support --- benchmark/src/bench/llm/pricing.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/benchmark/src/bench/llm/pricing.py b/benchmark/src/bench/llm/pricing.py index e409eef..7124021 100644 --- 
a/benchmark/src/bench/llm/pricing.py
+++ b/benchmark/src/bench/llm/pricing.py
@@ -8,12 +8,18 @@
 GPT4O_INPUT_PER_1M = 2.50
 GPT4O_OUTPUT_PER_1M = 10.00
 
+_PRICING_PER_1M: dict[str, tuple[float, float]] = {
+    "gpt-4o": (GPT4O_INPUT_PER_1M, GPT4O_OUTPUT_PER_1M),
+}
 
-def estimate_cost_usd(model: str, input_tokens: int, output_tokens: int) -> float:
-    if model != "gpt-4o":
-        # The bench assumes a single model; turn this into a table to extend.
-        raise ValueError(f"Unsupported model for cost table: {model}")
-    return (input_tokens / 1_000_000) * GPT4O_INPUT_PER_1M + (
+def estimate_cost_usd(model: str, input_tokens: int, output_tokens: int) -> float:
+    """Estimate USD cost for a model run when pricing is known."""
+    pricing = _PRICING_PER_1M.get(model)
+    if pricing is None:
+        # Pricing unknown; keep run going and report 0.0 cost.
+        return 0.0
+    input_per_1m, output_per_1m = pricing
+    return (input_tokens / 1_000_000) * input_per_1m + (
         output_tokens / 1_000_000
-    ) * GPT4O_OUTPUT_PER_1M
+    ) * output_per_1m

From 17780b9e465aca9df7fd689ad04a6b97577c8e0f Mon Sep 17 00:00:00 2001
From: harumiWeb
Date: Thu, 29 Jan 2026 21:22:58 +0900
Subject: [PATCH 33/38] feat: Add public report generation with charts and
 update functionality

---
 benchmark/README.md                  |  11 ++
 benchmark/REPORT.md                  |  83 +++++++++
 benchmark/pyproject.toml             |   1 +
 benchmark/scripts/reproduce.ps1      |   1 +
 benchmark/scripts/reproduce.sh       |   1 +
 benchmark/src/bench/cli.py           |  19 ++
 benchmark/src/bench/paths.py         |   2 +
 benchmark/src/bench/report_public.py | 265 +++++++++++++++++++++++++++
 8 files changed, 383 insertions(+)
 create mode 100644 benchmark/REPORT.md
 create mode 100644 benchmark/src/bench/report_public.py

diff --git a/benchmark/README.md b/benchmark/README.md
index bdcfcf0..d40f901 100644
--- a/benchmark/README.md
+++ b/benchmark/README.md
@@ -73,6 +73,17 @@ Outputs:
 - outputs/results/results.csv
 - outputs/results/report.md
 
+## Public report (REPORT.md)
+
+Generate chart images and update `REPORT.md` in the benchmark root:
+
+```bash
+python -m bench.cli report-public
+```
+
+This command writes plots under `outputs/plots/` and inserts them into
+`REPORT.md` between the chart markers.
+
 ## Markdown conversion (optional)
 
 Generate Markdown from the latest JSON responses:
diff --git a/benchmark/REPORT.md b/benchmark/REPORT.md
new file mode 100644
index 0000000..4bfb840
--- /dev/null
+++ b/benchmark/REPORT.md
@@ -0,0 +1,83 @@
+# Benchmark Summary (Public)
+
+This summary consolidates the latest results for the Excel document benchmark and
+RUB (structure query track). Use this file as a public-facing overview and link
+full reports for reproducibility.
+ +Sources: +- outputs/results/report.md (core benchmark) +- outputs/rub/results/report.md (RUB structure_query) + +## Charts + +![Core Benchmark Summary](outputs/plots/core_benchmark.png) +![Markdown Evaluation Summary](outputs/plots/markdown_quality.png) +![RUB Structure Query Summary](outputs/plots/rub_structure_query.png) + +## Scope + +- Cases: 12 Excel documents +- Methods: exstruct, openpyxl, pdf, html, image_vlm +- Model: gpt-4o (Responses API) +- Temperature: 0.0 +- Note: record the run date/time when publishing + +## Core Benchmark (extraction + scoring) + +Key metrics from outputs/results/report.md: + +- Exact accuracy (acc): best = pdf 0.607551, exstruct = 0.583802 +- Normalized accuracy (acc_norm): best = pdf 0.856642, exstruct = 0.835538 +- Raw coverage (acc_raw): best = exstruct 0.876495 (tie for top) +- Raw precision: best = exstruct 0.933691 +- Markdown coverage (acc_md): best = pdf 0.700094, exstruct = 0.697269 +- Markdown precision: best = exstruct 0.796101 + +Interpretation: +- pdf leads in Exact/Normalized, especially when literal string match matters. +- exstruct is strongest on Raw coverage/precision and Markdown precision, + indicating robust capture and downstream-friendly structure. + +## RUB (structure_query track) + +RUB evaluates Stage B questions using Markdown-only inputs. Current track is +"structure_query" (paths selection). + +Summary from outputs/rub/results/report.md: + +- RUS: exstruct 0.166667 (tie for top with openpyxl 0.166667) +- Partial F1: exstruct 0.436772 (best among methods) + +Interpretation: +- exstruct is competitive for structure queries, but the margin is not large. +- This track is sensitive to question design; it rewards selection accuracy + more than raw reconstruction. + +## Positioning for RAG/LLM Preprocessing + +Practical strengths shown by the current benchmark: +- High Raw coverage/precision (exstruct best) +- High Markdown precision (exstruct best) +- Near-top normalized accuracy + +Practical caveats: +- Exact/normalized top spot is often pdf +- RUB structure_query shows only a modest advantage + +Recommended public framing: +- exstruct is a strong option when the goal is structured reuse (JSON/Markdown) + for downstream LLM/RAG pipelines. +- pdf/VLM methods can be stronger for literal string fidelity or visual layout + recovery. + +## Known Limitations + +- Absolute RUS values are low in some settings (task design sensitive). +- Results vary by task type (forms/flows/diagrams vs tables). +- Model changes (e.g., gpt-4.1) require separate runs and reporting. + +## Next Steps (optional) + +- Add a reconstruction track that scores “structure rebuild” directly. +- Add task-specific structure queries (not only path selection). +- Publish run date, model version, and normalization rules with results. diff --git a/benchmark/pyproject.toml b/benchmark/pyproject.toml index a8043ca..ac8f62c 100644 --- a/benchmark/pyproject.toml +++ b/benchmark/pyproject.toml @@ -8,6 +8,7 @@ dependencies = [ "beautifulsoup4>=4.14.3", "exstruct", "lxml>=6.0.2", + "matplotlib>=3.8.0", "openai>=2.15.0", "openpyxl>=3.1.5", "pandas>=2.3.3", diff --git a/benchmark/scripts/reproduce.ps1 b/benchmark/scripts/reproduce.ps1 index 6f3c2d7..22812c9 100644 --- a/benchmark/scripts/reproduce.ps1 +++ b/benchmark/scripts/reproduce.ps1 @@ -55,6 +55,7 @@ try { Write-Info "Generating reports." 
& $python -m bench.cli report + & $python -m bench.cli report-public } finally { Pop-Location } diff --git a/benchmark/scripts/reproduce.sh b/benchmark/scripts/reproduce.sh index 7c8affd..e838e91 100644 --- a/benchmark/scripts/reproduce.sh +++ b/benchmark/scripts/reproduce.sh @@ -59,3 +59,4 @@ echo "[reproduce] Evaluating results." echo "[reproduce] Generating reports." "$python_bin" -m bench.cli report +"$python_bin" -m bench.cli report-public diff --git a/benchmark/src/bench/cli.py b/benchmark/src/bench/cli.py index bcdb443..35769fc 100644 --- a/benchmark/src/bench/cli.py +++ b/benchmark/src/bench/cli.py @@ -46,6 +46,7 @@ from .pipeline.image_render import xlsx_to_pngs_via_pdf from .pipeline.openpyxl_pandas import extract_openpyxl from .pipeline.pdf_text import pdf_to_text, xlsx_to_pdf +from .report_public import generate_charts, load_report_data, update_public_report from .rub.manifest import RubTask, load_rub_manifest from .rub.score import RubPartialScore, score_exact, score_partial @@ -121,6 +122,7 @@ class RubResultRow(BaseModel): task_id: str source_case_id: str type: str + track: str method: str model: str | None score: float @@ -811,6 +813,7 @@ def rub_eval( task_id=t.id, source_case_id=t.source_case_id, type=t.type, + track=t.track, method=m, model=rec.get("model"), score=score, @@ -872,6 +875,13 @@ def rub_report() -> None: md_lines.append(g.to_markdown(index=False)) md_lines.append("") + if "track" in df.columns: + md_lines.append("## Summary by track") + md_lines.append("") + g_track = df.groupby(["track", "method"]).agg(**agg).reset_index() + md_lines.append(g_track.to_markdown(index=False)) + md_lines.append("") + for task_id, task_df in df.groupby("task_id"): task_path = detail_dir / f"report_{task_id}.md" lines = [ @@ -889,6 +899,15 @@ def rub_report() -> None: print(f"[green]Wrote {report_path}[/green]") +@app.command() +def report_public() -> None: + """Generate chart images and update the public REPORT.md.""" + data = load_report_data() + chart_paths = generate_charts(data) + report_path = update_public_report(chart_paths) + print(f"[green]Wrote {report_path}[/green]") + + @app.command() def eval(case: str = "all", method: str = "all") -> None: """Evaluate the latest responses and write results CSV. 
diff --git a/benchmark/src/bench/paths.py b/benchmark/src/bench/paths.py index cba6fac..ee100e8 100644 --- a/benchmark/src/bench/paths.py +++ b/benchmark/src/bench/paths.py @@ -16,6 +16,8 @@ MARKDOWN_FULL_DIR = OUT_DIR / "markdown_full" MARKDOWN_FULL_RESPONSES_DIR = MARKDOWN_FULL_DIR / "responses" RESULTS_DIR = OUT_DIR / "results" +PLOTS_DIR = OUT_DIR / "plots" +PUBLIC_REPORT = ROOT / "REPORT.md" RUB_DIR = ROOT / "rub" RUB_MANIFEST = RUB_DIR / "manifest.json" RUB_TRUTH_DIR = RUB_DIR / "truth" diff --git a/benchmark/src/bench/report_public.py b/benchmark/src/bench/report_public.py new file mode 100644 index 0000000..a4645b6 --- /dev/null +++ b/benchmark/src/bench/report_public.py @@ -0,0 +1,265 @@ +from __future__ import annotations + +from pathlib import Path +from typing import Iterable + +import matplotlib +import matplotlib.pyplot as plt +import pandas as pd +from pydantic import BaseModel + +from .paths import PLOTS_DIR, PUBLIC_REPORT, RESULTS_DIR, RUB_RESULTS_DIR + +matplotlib.use("Agg") + + +class MethodScore(BaseModel): + """Aggregated benchmark scores for a method.""" + + method: str + acc_norm: float + acc_raw: float + acc_md: float + md_precision: float + avg_cost: float + + +class RubScore(BaseModel): + """Aggregated RUB scores for a method.""" + + method: str + rus: float + partial_f1: float + avg_cost: float + + +class ReportData(BaseModel): + """Combined benchmark report data for plotting.""" + + core: list[MethodScore] + rub: list[RubScore] + + +class ChartPaths(BaseModel): + """Generated chart image paths for public report.""" + + core_chart: Path + markdown_chart: Path + rub_chart: Path + + +def _select_methods(methods: Iterable[str]) -> list[str]: + order = ["exstruct", "pdf", "image_vlm", "html", "openpyxl"] + available = {m for m in methods} + return [m for m in order if m in available] + + +def load_report_data() -> ReportData: + """Load aggregated metrics from results CSV files. + + Returns: + ReportData containing core and RUB aggregates. 
+ """ + core_csv = RESULTS_DIR / "results.csv" + if not core_csv.exists(): + raise FileNotFoundError(core_csv) + + core_df = pd.read_csv(core_csv) + core_grouped = ( + core_df.groupby("method") + .agg( + acc_norm=("score_norm", "mean"), + acc_raw=("score_raw", "mean"), + acc_md=("score_md", "mean"), + md_precision=("score_md_precision", "mean"), + avg_cost=("cost_usd", "mean"), + ) + .reset_index() + ) + core_grouped = core_grouped.fillna(0.0) + + core_methods = _select_methods(core_grouped["method"].tolist()) + core_scores = [ + MethodScore( + method=row["method"], + acc_norm=float(row["acc_norm"]), + acc_raw=float(row["acc_raw"]), + acc_md=float(row["acc_md"]), + md_precision=float(row["md_precision"]), + avg_cost=float(row["avg_cost"]), + ) + for _, row in core_grouped.iterrows() + if row["method"] in core_methods + ] + core_scores.sort(key=lambda m: core_methods.index(m.method)) + + rub_csv = RUB_RESULTS_DIR / "rub_results.csv" + if not rub_csv.exists(): + raise FileNotFoundError(rub_csv) + + rub_df = pd.read_csv(rub_csv) + if "track" in rub_df.columns and (rub_df["track"] == "structure_query").any(): + rub_df = rub_df[rub_df["track"] == "structure_query"] + + rub_grouped = ( + rub_df.groupby("method") + .agg( + rus=("score", "mean"), + partial_f1=("partial_f1", "mean"), + avg_cost=("cost_usd", "mean"), + ) + .reset_index() + ) + rub_grouped = rub_grouped.fillna(0.0) + + rub_methods = _select_methods(rub_grouped["method"].tolist()) + rub_scores = [ + RubScore( + method=row["method"], + rus=float(row["rus"]), + partial_f1=float(row["partial_f1"]), + avg_cost=float(row["avg_cost"]), + ) + for _, row in rub_grouped.iterrows() + if row["method"] in rub_methods + ] + rub_scores.sort(key=lambda m: rub_methods.index(m.method)) + + return ReportData(core=core_scores, rub=rub_scores) + + +def _plot_grouped_bar( + *, + title: str, + ylabel: str, + categories: list[str], + series: dict[str, list[float]], + out_path: Path, +) -> None: + """Plot a grouped bar chart. + + Args: + title: Chart title. + ylabel: Y-axis label. + categories: X-axis category labels. + series: Mapping of series label to values. + out_path: Output image path. + """ + num_series = len(series) + width = 0.18 if num_series > 4 else 0.22 + centers = list(range(len(categories))) + + fig, ax = plt.subplots(figsize=(9, 4.5)) + for idx, (label, values) in enumerate(series.items()): + offset = (idx - (num_series - 1) / 2) * width + ax.bar([c + offset for c in centers], values, width=width, label=label) + + ax.set_title(title) + ax.set_ylabel(ylabel) + ax.set_xticks(centers) + ax.set_xticklabels(categories, rotation=0) + ax.set_ylim(0.0, 1.0) + ax.grid(axis="y", linestyle=":", alpha=0.4) + ax.legend(ncol=num_series) + + out_path.parent.mkdir(parents=True, exist_ok=True) + fig.tight_layout() + fig.savefig(out_path, dpi=160) + plt.close(fig) + + +def generate_charts(data: ReportData) -> ChartPaths: + """Generate chart images for the public report. + + Args: + data: Aggregated report data. + + Returns: + ChartPaths with generated image locations. 
+    """
+    core_chart = PLOTS_DIR / "core_benchmark.png"
+    markdown_chart = PLOTS_DIR / "markdown_quality.png"
+    rub_chart = PLOTS_DIR / "rub_structure_query.png"
+
+    methods = [m.method for m in data.core]
+    _plot_grouped_bar(
+        title="Core Benchmark Summary",
+        ylabel="Score",
+        categories=methods,
+        series={
+            "acc_norm": [m.acc_norm for m in data.core],
+            "acc_raw": [m.acc_raw for m in data.core],
+            "acc_md": [m.acc_md for m in data.core],
+        },
+        out_path=core_chart,
+    )
+
+    _plot_grouped_bar(
+        title="Markdown Evaluation Summary",
+        ylabel="Score",
+        categories=methods,
+        series={
+            "acc_md": [m.acc_md for m in data.core],
+            "md_precision": [m.md_precision for m in data.core],
+        },
+        out_path=markdown_chart,
+    )
+
+    rub_methods = [m.method for m in data.rub]
+    _plot_grouped_bar(
+        title="RUB Structure Query Summary",
+        ylabel="Score",
+        categories=rub_methods,
+        series={
+            "rus": [m.rus for m in data.rub],
+            "partial_f1": [m.partial_f1 for m in data.rub],
+        },
+        out_path=rub_chart,
+    )
+
+    return ChartPaths(
+        core_chart=core_chart,
+        markdown_chart=markdown_chart,
+        rub_chart=rub_chart,
+    )
+
+
+def update_public_report(chart_paths: ChartPaths) -> Path:
+    """Insert chart images into REPORT.md.
+
+    Args:
+        chart_paths: Generated chart paths.
+
+    Returns:
+        Path to updated report.
+    """
+    report_path = PUBLIC_REPORT
+    report_text = (
+        report_path.read_text(encoding="utf-8") if report_path.exists() else ""
+    )
+
+    rel_core = chart_paths.core_chart.relative_to(report_path.parent)
+    rel_markdown = chart_paths.markdown_chart.relative_to(report_path.parent)
+    rel_rub = chart_paths.rub_chart.relative_to(report_path.parent)
+
+    # Delimiters for the auto-inserted chart block. The exact marker strings
+    # are an assumption; any stable pair of HTML comment markers works.
+    begin_marker = "<!-- charts:begin -->"
+    end_marker = "<!-- charts:end -->"
+
+    block_lines = [
+        begin_marker,
+        "## Charts",
+        "",
+        f"![Core Benchmark Summary]({rel_core.as_posix()})",
+        f"![Markdown Evaluation Summary]({rel_markdown.as_posix()})",
+        f"![RUB Structure Query Summary]({rel_rub.as_posix()})",
+        "",
+        end_marker,
+    ]
+    block = "\n".join(block_lines)
+
+    if begin_marker in report_text and end_marker in report_text:
+        pre, _ = report_text.split(begin_marker, 1)
+        _, post = report_text.split(end_marker, 1)
+        new_text = pre.rstrip() + "\n" + block + "\n" + post.lstrip()
+    else:
+        new_text = report_text.rstrip() + "\n\n" + block
+
+    report_path.write_text(new_text, encoding="utf-8")
+    return report_path

From e582213af9332131627102b22b8212af44b60711 Mon Sep 17 00:00:00 2001
From: harumiWeb
Date: Thu, 29 Jan 2026 21:27:41 +0900
Subject: [PATCH 34/38] Add benchmark reports and publicize scripts

- Created RUB summary comparison report for gpt-4o vs gpt-4.1.
- Added detailed benchmark report summarizing extraction accuracy across methods.
- Introduced RUB report for Stage B task accuracy using Markdown-only inputs.
- Added separate RUB reports for gpt-4o and gpt-4.1.
- Implemented publicize scripts (Python and PowerShell) to automate the copying
  of benchmark results and reports to the public directory; a minimal sketch of
  the copy step follows below.
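For orientation, this is the essence of what the publicize step does, as a
minimal sketch assuming the layout in the diffstat below. The `publicize`
helper and the file selection here are illustrative, not the actual
`scripts/publicize.py` implementation:

```python
from pathlib import Path
import shutil

BENCH_ROOT = Path("benchmark")  # assumes running from the repository root
PUBLIC_DIR = BENCH_ROOT / "public"


def publicize() -> None:
    """Copy the public report, plots, and detail reports into public/."""
    (PUBLIC_DIR / "plots").mkdir(parents=True, exist_ok=True)
    (PUBLIC_DIR / "reports").mkdir(parents=True, exist_ok=True)

    # Top-level summary generated by `bench.cli report-public`.
    shutil.copy2(BENCH_ROOT / "REPORT.md", PUBLIC_DIR / "REPORT.md")

    # Chart images referenced by REPORT.md.
    for png in (BENCH_ROOT / "outputs" / "plots").glob("*.png"):
        shutil.copy2(png, PUBLIC_DIR / "plots" / png.name)

    # One of the detail reports; the renaming mirrors the file list below.
    shutil.copy2(
        BENCH_ROOT / "outputs" / "results" / "report.md",
        PUBLIC_DIR / "reports" / "results_report.md",
    )


if __name__ == "__main__":
    publicize()
```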
--- benchmark/README.md | 14 ++ benchmark/public/INDEX.md | 10 ++ benchmark/public/REPORT.md | 83 ++++++++++++ benchmark/public/plots/core_benchmark.png | Bin 0 -> 34460 bytes benchmark/public/plots/markdown_quality.png | Bin 0 -> 33872 bytes .../public/plots/rub_structure_query.png | Bin 0 -> 31177 bytes .../public/reports/compare_gpt4o_gpt41.md | 9 ++ benchmark/public/reports/results_report.md | 122 ++++++++++++++++++ benchmark/public/reports/rub_report.md | 24 ++++ benchmark/public/reports/rub_report_gpt41.md | 14 ++ benchmark/public/reports/rub_report_gpt4o.md | 14 ++ benchmark/scripts/publicize.ps1 | 13 ++ benchmark/scripts/publicize.py | 67 ++++++++++ benchmark/scripts/publicize.sh | 12 ++ 14 files changed, 382 insertions(+) create mode 100644 benchmark/public/INDEX.md create mode 100644 benchmark/public/REPORT.md create mode 100644 benchmark/public/plots/core_benchmark.png create mode 100644 benchmark/public/plots/markdown_quality.png create mode 100644 benchmark/public/plots/rub_structure_query.png create mode 100644 benchmark/public/reports/compare_gpt4o_gpt41.md create mode 100644 benchmark/public/reports/results_report.md create mode 100644 benchmark/public/reports/rub_report.md create mode 100644 benchmark/public/reports/rub_report_gpt41.md create mode 100644 benchmark/public/reports/rub_report_gpt4o.md create mode 100644 benchmark/scripts/publicize.ps1 create mode 100644 benchmark/scripts/publicize.py create mode 100644 benchmark/scripts/publicize.sh diff --git a/benchmark/README.md b/benchmark/README.md index d40f901..10f3138 100644 --- a/benchmark/README.md +++ b/benchmark/README.md @@ -84,6 +84,20 @@ python -m bench.cli report-public This command writes plots under `outputs/plots/` and inserts them into `REPORT.md` between the chart markers. +## Public bundle (for publishing) + +Create a clean, shareable bundle under `benchmark/public/`: + +```bash +python scripts/publicize.py +``` + +Windows PowerShell: + +```powershell +.\scripts\publicize.ps1 +``` + ## Markdown conversion (optional) Generate Markdown from the latest JSON responses: diff --git a/benchmark/public/INDEX.md b/benchmark/public/INDEX.md new file mode 100644 index 0000000..df4f313 --- /dev/null +++ b/benchmark/public/INDEX.md @@ -0,0 +1,10 @@ +# Public Bundle + +This directory contains the public-ready benchmark artifacts. + +## Contents +- REPORT.md +- reports/ +- plots/ + +Generated by scripts/publicize.py. diff --git a/benchmark/public/REPORT.md b/benchmark/public/REPORT.md new file mode 100644 index 0000000..4bfb840 --- /dev/null +++ b/benchmark/public/REPORT.md @@ -0,0 +1,83 @@ +# Benchmark Summary (Public) + +This summary consolidates the latest results for the Excel document benchmark and +RUB (structure query track). Use this file as a public-facing overview and link +full reports for reproducibility. 
+ +Sources: +- outputs/results/report.md (core benchmark) +- outputs/rub/results/report.md (RUB structure_query) + +## Charts + +![Core Benchmark Summary](outputs/plots/core_benchmark.png) +![Markdown Evaluation Summary](outputs/plots/markdown_quality.png) +![RUB Structure Query Summary](outputs/plots/rub_structure_query.png) + +## Scope + +- Cases: 12 Excel documents +- Methods: exstruct, openpyxl, pdf, html, image_vlm +- Model: gpt-4o (Responses API) +- Temperature: 0.0 +- Note: record the run date/time when publishing + +## Core Benchmark (extraction + scoring) + +Key metrics from outputs/results/report.md: + +- Exact accuracy (acc): best = pdf 0.607551, exstruct = 0.583802 +- Normalized accuracy (acc_norm): best = pdf 0.856642, exstruct = 0.835538 +- Raw coverage (acc_raw): best = exstruct 0.876495 (tie for top) +- Raw precision: best = exstruct 0.933691 +- Markdown coverage (acc_md): best = pdf 0.700094, exstruct = 0.697269 +- Markdown precision: best = exstruct 0.796101 + +Interpretation: +- pdf leads in Exact/Normalized, especially when literal string match matters. +- exstruct is strongest on Raw coverage/precision and Markdown precision, + indicating robust capture and downstream-friendly structure. + +## RUB (structure_query track) + +RUB evaluates Stage B questions using Markdown-only inputs. Current track is +"structure_query" (paths selection). + +Summary from outputs/rub/results/report.md: + +- RUS: exstruct 0.166667 (tie for top with openpyxl 0.166667) +- Partial F1: exstruct 0.436772 (best among methods) + +Interpretation: +- exstruct is competitive for structure queries, but the margin is not large. +- This track is sensitive to question design; it rewards selection accuracy + more than raw reconstruction. + +## Positioning for RAG/LLM Preprocessing + +Practical strengths shown by the current benchmark: +- High Raw coverage/precision (exstruct best) +- High Markdown precision (exstruct best) +- Near-top normalized accuracy + +Practical caveats: +- Exact/normalized top spot is often pdf +- RUB structure_query shows only a modest advantage + +Recommended public framing: +- exstruct is a strong option when the goal is structured reuse (JSON/Markdown) + for downstream LLM/RAG pipelines. +- pdf/VLM methods can be stronger for literal string fidelity or visual layout + recovery. + +## Known Limitations + +- Absolute RUS values are low in some settings (task design sensitive). +- Results vary by task type (forms/flows/diagrams vs tables). +- Model changes (e.g., gpt-4.1) require separate runs and reporting. + +## Next Steps (optional) + +- Add a reconstruction track that scores “structure rebuild” directly. +- Add task-specific structure queries (not only path selection). +- Publish run date, model version, and normalization rules with results. 
diff --git a/benchmark/public/plots/core_benchmark.png b/benchmark/public/plots/core_benchmark.png
new file mode 100644
index 0000000000000000000000000000000000000000..9e84603b8dd7a3731d61fa76211e7e37d26a7658
Binary files /dev/null and b/benchmark/public/plots/core_benchmark.png differ
diff --git a/benchmark/public/plots/markdown_quality.png b/benchmark/public/plots/markdown_quality.png
new file mode 100644
index 0000000000000000000000000000000000000000..721dc14cbbec9b78304658f7c4fd84a7c88f398b
Binary files /dev/null and b/benchmark/public/plots/markdown_quality.png differ
diff --git a/benchmark/public/plots/rub_structure_query.png b/benchmark/public/plots/rub_structure_query.png
new file mode 100644
index 0000000000000000000000000000000000000000..a4ff435c34ae253b50550c06a031424f97ba834c
Binary files /dev/null and b/benchmark/public/plots/rub_structure_query.png differ
z--hU+%d_xj(l>|ak*KK{fz5X#NFf7CV*fE@x`tXU?BQ|dBND8M;U(s$7jupxtDlYlU3_eMp>x@3ET6V&-^4;BK>1?#!^ zFQhW*21xq*+Hjqr%%q}{Z++3;q zBZH`B`Uw5F;{d$C9S4z<=asq97flZ)-hTi4{H&3_%9lrZX|fAYxE)+s2jXuT;%A{e z43CR_G&EcL7Lo-vMG0chq^xTNRq_kneWyQ*B6u60>E3o60db7FXep9#$4814!StGD zz-p3=?Ren)TX7Fmer$V|!O-3ALxHuhR?FVKjhGz*c!aWcpFxIhgkL2{m1nbhN2j6*YGqNnQ0UApzwTCz?o-TwCPm8j`hFSf>25zu@Wo1 zZa0G0BZFxADxWc9xLV%%_@~S7VSKqXAXd6M;>8y@kpeFH?(+juP@4RwlN}ANT?J(e zJN1#bWr+9MPJ`E}R@HteO5s%8sY!o8`5f<1av2gsVb1#VVCNrD0r9h}m&uv_BPpRV z)I}iUc(e$T8y zz~~vXD+s3bvpArEy(IvnlZQFsjDkb|+lo`5K(%?hC` z44vO(tD3%wr5P;b-FH0eH)kzb4rsuL-TNEaoXlE@$sH9iI9%@h>g_iL7-Dk3sx^v> z>_aP9$L>74EI%vH2RM|xm#zYh0GN}*MlqPiUbLA9&SzKxTV~wQB*mvY|FdiZ_y&^$h)k>m;TYF3^IQ+*2%|9$y zs%3S308osUmllm$@G4#BMpm~P+Q?s1qkNLvh+(7uC7 ztF$_@crs1=@1Naqa-(=uxyK|P;m!lP&(pTAZ$0QXO4=XX+(s`Gzu$6V2O|tkXiNeVuVv?=5{yk>@Zm2N-e(e7B zhyVLr`kyo&5A=A|Blx8DIK2LRAlrX=DcfVR7YnpNpPJyJ_Z;B4prrlu;O9}O5ZVZT zjCGYdJ%=9TFV&ILmodts&RVrBS>(a)};khlmSqx0A_I;1vmK|(qZ`_s-rkR z7#Uw#N!C|W32w2^E53;0n$y$yps_Om%V(N7mhBJjM!qiK_TWE!EjS*YRSYCb7NllQ152}*RF-;tWvH^K2=;;DaVroY5g*OHV|DK<+I)l`-v;F?p{`@) zUg-B2iXMd=mk+VsP)GK-J|iGM!NWa;Q43qoQ+qW~xsP)R$mJ?;xY~X>;P$-`N^@du?6N&%I1bJ?3D8Usf8kYjU#(AJ|5HQFS}-{?D3$armwB! zsjbVo!;29D_8OJE`=^9IDo^2M)4BG~6z^VIF75(TQe+S-eWNkU;E!QoKoZMB`>n3w zrv~a`Gf#0%$ISwQjmjhfjgmdRC(Ld4u-Eb;bDVG$M1!!B-7!Yrf;{mQhX-B7U7I7a zZ)qHn9uA?3g;2w>Z81-6tgY?f+_yLtV?<%$Vr9pP&r2g~l5h>^!#5$|S}xE6>#m+gq!-Ghjo~2e?5N)FmY1t^f9; zS!pt1+DX2tQrE`&f6*;*08iQekS!-Djody$3Z(D%QPghZq(Gg>v~QB}YP z$EyqRDyAhqqlxic+l%RVfScgK!Q!U(&d)lY^fN9;ui>sB_R40<(!waP7} zRWo^vCMUn5kCuDU7pa?(u&YOo`po?jlvxI)PKQVNmpT+(oGC=v$e_{1g{!hg+KJxG z1CsV%j7@Z*8l6*wz%K^{3i~qJWPpvJ{5%2pcqT_*ft247#tiy&48ev|j`Mmp+_3tf z_((v$+3YvY-{ZRr$?bouMK&v1%|F2|j@E_w*B|}?L;p1%Xwv)}EdOsh9+S zpvQlku9_6`Z>>;;&&1z)eRiy`+J6j5g}yUi$Pf{U%z(UOX9H^MAcreW#O<(fZ9@wl znI>sma$k2PLYqJn#T{Kxz-(Ghd3m{_+g2?hw+sBLKAvM8d1jPbMNNj#7(wEg`w=2Z zNP^{fV0aa8IWI4?+k)Ux6|F(@>_%4LI&Q%@Wa6T}g$ximGC=9~$9JjNwkG?iICj7B zvgFieb)BJbfDm`Uhadc6HJEgy9faZ{BZ;NX1B8TQP z^%9?Z-!ds-f)-X!Rs|Jfn&9~pl$gs)-hjQm@F;X#%iczY{t`p{4q60OD(>0?!yi-s zDZ0u&=SA4R6kZao8Efe1ypr0rf!W0`0Vdfrm5X&y$0*fS{9CS(R8s*VWX}hSu zLVBn78%*yX-U(d~UKG$JrwK6DY0BbbPfa?0gaEV(StVm%`eIu1jIYCkp$jFu@tN)d z{q8im8tX${n;RZ^9V)%=2}{7?V2E5T!Pdn_pkMZjqekT{;4fy@Ypy9_xsD)Gx%sys z%DA&_efceK)+5^I+3ViZ^V@w?=Y58bl_qKdWge^LcauMC3fDS|Bo~=?IheKDbi8$W>fKbbC zA2oiW(;Z~23;^kBG@vrrng?6LBqou#iFig8N`<6@%2wR0%xix8gd)%pmv0eZuIDvH zoaUf4SBXH=T^>KTt0V`5gHmu<=V1HA@&cL3pD`xJ#?elaMUBU1nvI@YUY_MZuw>~` zAOO$p^ML@!>1YIQLb1?!9OzkqilQi&K>niB>J=-awaGfg>72JH#!DooAD)MlSmfUZ zq6pBRiB+7)e83eW0PVu_EP!me%q32%5Dxr=#QzAV!cVM90zYw@G4iMHr`^GWUBN=` zm&PC=Ghc<|DWobofH&h7!z-K4W=Ac#|ys=7$s;1i(QnhsH57 ztiL2~)w%Bk=-V5%Y%_Ro4SjQZQYA=y?j98Wf^Hc`NEg`J2jHA8ww1s&GC_3R#k35c z7Hm7r0?%1b#d|g(b;tp{qfy|Q9uyEsIZzHlI&lz*ir1i`X@RQkec}o>Ey>SOEFU9f zoH`?18z#_6(Onth2PecIE4kZH-2}Kwd@?XjqXS>KKT!idJJOnEg z>6?Igh(j!B6m~`1Xt|Zo=Uy`c8oIJLWXRC+L%Och?W&QTD(*kO?r`AHW(@6k0LoRx zc7P0kpz&$yI80o`M9p$nh`nn7f2#_P5l8buJT-EdLne}GwqGzfXtfyyTMH6vs7J9g z2+hQQyp2G|;E;NXdRkd1UHx6T8j^oxGNoJpl6ubodPCqUJgbgX?L*T(8He7(F8t7Vo61sX`n;4#I4 z)Nj|#t80b~Mzy7>2#$u*Y}cB6Vjc&^NM6r&bkC9l`Px^PS-*_=zQKx3=_H{vbvSqt z4Ad+MYKt__@i!p7rX#}@Dcy;RJ{-%~w!pM_Jp=SemjWv6ps$Ocw@Qqx?6WeZ+g6Q{r1?v3NU!|eb`@jQdcnsiEJQ$hIi&WaIZ*=&0h{-2ALm8S&lLH;7%zv1@@jY8Ltmus zGX5)YbM5y)6d+avugwS=n8={FJw`D@pO7vo+P!UhpHyBb+5ZFi72ENQXp=K^8{W~u z!g;c630f5iZi>UN6DliV1$WAtax=TVkQaUu#3j%{Y2sW`^0O>^&bX-8ZA_K~H^zKD z3ksFpZTuU_;WDs@O#kiLS>Pg`vtp5AN#JYueOE?OAU|wk{E(>$E#au@0q%n8%ai`g zIw&7DMoRy2Lvxafdgh@zxQJ8aINwRRgINTP1yx2luY8h}BaV){r#K!8>tL3AS2D`gXBBgW9QUao6JA_sD6$uQ!a 
zAg3avtzbJA*M(~Lk6UAaEYTiQedHAoJ|BqZcc0op7Y0{$y~;!K8k?0 zTEq;XhgaO~bl^I!`c8=Y*wZpNKS`)2dB-OqGuZ%clmuEYcpHty;7zWaochl68(`Py zjiR1TB=?Txaogh&p#bDk4Mmi-m^f$PW7!}W7-v#CTK{5m*9_~%q65Ry)1=o4&@j!dc zeN)a1Rj=5X42=t9)R>$G46#g^hQ5N%cP9Tf`v2iXkn%r^F2QGfN$4n`SI%eZ+R^~Owr*G{ zYyTD9Ml*9`GBg=Hk^6Q%5+SDc3Xe$w0PjqYibbI^_~K(#V7-H97%hp)dJ%z>G)*Rs zU0B6{j-2RONOHS^Y@P{|LLOCg?9Z=-qfmm+AbTB7u^2O#WCJtRMk|RsiE{%-G*>ae zs(%~=?l^LXgke>TR~mGCIiuS?fr1HLvs1LtV*b zopc~Oae8Wtj2reGbffOw2^v0{;#QIfrUJgs_rRlgZEdCQOrXfZuHe>KbLX?qL7$I* zQOvmvA&txp$#npI3u2+|cO5lExHXzC_tDtIA{zCum@j*MeRZ{|6;8_m+!LM(Tvo(? z;Z7Bjqyya|loIWLhD|npQlr?6FmvekqkWtFcw_IXJI{MO;2fVo@^h3Ln zieET^#dJW}Y%Ayr{R-OsW!9#I^nDaQa!S*X+#`Y5^*@i4{)p6{W*@!!o_8j?7KF3R zkKPv?KXf@!nXkHoc1lkxPg!DjE9KO6_BUn<%m%gG{~nopF_5`dpj_L{3iG)W))AR}U( z_hP5M$gF9;GHpOOl!`SPHhZH?BdWd}#1^S8YoAZdneLmE_%$@GY(zB^(~m#C+&_ec z*B{W2?=3d*qQozDG{-s%l+Us%xf-G}KOVbgdZcI{{&2=e7pj|r8?=CRX{R(viCcc) zb+|N9Bv`xFn z@cFNroBkfOM8kV>J zGh}>yv?KBCTc*ra2WKpoiiT>YPc8_N2j<~ZSx4UJX%}T=%1mXqbhV16SG`PUd^S6% z>pN3~omyxMk64Z;J!(-RXete?{iem^+Z!}*XnLC#mEj*pDva2^aAWoPq@1ys2W4j8 zoZ$El)Gv^Rn&sc|w?GrXbw$|a$siEmPe0;o8=01;KYPO!|F-N1$#uo*f1791mqP;U zlU9F0GLMt>>2Sek1rUU5L#V3#O#ve^2-35aK344yEdc{if2>%q1hs#%(LzqFi?zg! zD$i8G;;Hkq(8hC_x`c9%!ieae0U+9Q zE7k)kxvP6-EYNWb%KA~?#dPhp3+z^kN(K~_A8C<#^__^`14V6FZ#Y+J!a(%ZNd%1k zYzVH%YCGf-NZJbLIIQtXvn$A{SCP20W=pMlAk-O4?r7*0OhiE|8_@yabQI9w?u>@vj0XAnAT0`)CYL{TdMp&#LWNohkp&wKh2M} z(bXdmR!u|U_L<2(wV8!E>Z~3AbFSO3~`qy zLEJ*mo$8LEK4HO=5d!iODgZgK&;>jy=|8W-Q5xYelCP@^IG#{4ZS)@zB2}#`C_+TN zJqP7W2c1U7X;QI;=%%Q=mlw(nl@C9+=V&?_KcVAlaujTU29;D^0L2W{$zmO@kkQ)7 zsadTtAbhhxv}x=V#d}^WV>-;A*=!t-&OZ0 z#s(7$w`fB_G^^AzzW|#2_rW2+%|{VQ!h&9ga<}vaAH-Oy@Niz-#BNTF1>yZ6MzQ|d zJxC>qp{DJb<{}`_opBzPnwKM5!C_oTKwp5|sQ-vPoAZF}IR73l#{po}fwC6aH%s)hzJQA#Bl& zrbFw;|8tM(|1|-06{2fyl>aOO^zUHjB1Dfy+0ioj%ebNHn-`m6sb1AB53L9<*n7Mv zX%nG7Z9%<`jLeGyIN-kh++fW7c@j z@TdQ5XO(dc)TX23AVTHf)G|MjL*oSzB5JDoq0FrApwhnP$*W~t(l-XZ`iTG*>z31A zxGBikSd^#~lWADUTN5s2UNgd7kzSwDl?aq_2ZBAy@xSv zIh5o;PV(QMrDvSvM7CN=;i8K(=H-Z(x0g)Bk<) zE41_1DS;SvE|K_iV4r(bGjfOk-wZF&#*EUT!zV34$qp@%ySwZ>rQhpOlHtaP${`#Y!Q z2j|CSAo)G7u$G^z!xOdGl2L0DI>15ZTFnCmzi`h+nEZ(?Vhf}C{@Q)>%^rp>A(pK# zrgOImeZpEL3HEbSnOv(a1TQ&8rNZUO@pl$7O12LlUZ%^7bnkXiAaZ z7WLprJ;j4oC>DiI3Ki3juxvv~|0-WFkJ9}XtFj6}DEtIq^Kd!<@5<(BWHFAP(wbWc zR$Dl+qUxU40rvkBjz9F;^`6i6HXJC~@WL4>!QlLG2p#CSG0n#Z9`Q$Tj;n&!Gcpi! 
zLu(dL;dpltdN_J{?-!()R<%<>#n6RDUNJ^7N1-5`N<0w2{UMmA1;r~e98VOEUdp9Xe#puy@R4Mw~m!hwY6PVr@r}AHyi2he5 z|G#PYC*b7&AAgu;>9rlmFQJhiZ*w}E`0*vZ6zI;#$iFZ-n#2ExC`sa>0CZdz<$9#9 zA!^S+1qGc#j665gHdkiaI}o9U2Ra^s`gvtQKga|lR8a^7%u!&j4zndIwN)OI_XgXq1sAUh;JhoJDqGXcYC#po(u)?YBq*-S;^< zRp^F4N9)GWP6hKMqhlTg*9DJ6n|J-QtSSRC^8s-eIAf7ZLK1g*65cjH3tN|3L>`9` z*j6k0U>4Cy3gZaZ@@&17w4$tr&4XvtFq5bg5g$9msot&^gj55B;Y5@wOE{;8KS30y zZqnQpXJX}B>{Ic=rup+UXD}6W9J2#Mg&ST#-vd|kHXG^p5XFLCf!Y`a(T2l2LF`Yp z(JXfzeY(TSXUa$DTEK_PMK^}3U*_aY6XujMv>@EX%4h;3Q|*RctgK|jMwi#QrJ_Dh z%iYc%+@Z8s6nvBJ7UpI2Txz_iy5d55d$EqT^v!gcak`Yqz+{{-J*xI@!X({DoUG<= zendV-1hAtV#&#wVd5kT{xjFgSIx~+h7%zT%R8-y=n`2}= zrZ`S^-h0FSnWS<0d1+(g zmLpYM{`~D}Gg)wcLS6&S&HGR}%^{m>CQnQJOHggc&<*Swmy)Yf5=P@a z8HYc)mkJ#P(?_);{y#3obOp>n>Joo#AKir<-;JO}ihO1)|lO0J&& zk~SN!)i}z`Y+>>`*jV6=uLz*?2>6-2FFveq6*`f-y>QQkb>ivR=tFQ$uPLog*6niB z%$tlO_HBOj%;sHk)1^k*q2eW;OG@rM7V*EN?^Eb9)PX;oG`4l5##o9fv1Ze_@|bG3 znr%><0OH0r!5tRN)~(5=cMyT;`w^lh#%Gx#6&Q5lC}%LgiCvjgj@T zR17z4@mThMyR;fqF@j8=7KXji^ki_sDwXS>9{VgpJ_EWrMyp7bVW`!1eimGOs+SjM zF$fvM(uAY0n|v0*y^}dE_8qnQ6$En&DRvE=Di{V6KfiWh*ybOvj&1%GnF0rsi1wkL z)bVaLC^_BMWf0;2zluBWpr*5Sk0Y|`zTzsPqCh}JRs}Vn^o~*lq$x#eR9+Vp0qHfM zi-nfBHejWrAVm`+QVl^}APUlvE)fC*0+>@XkAT=FXiv_m4Ys|8+*e~;&y_;)-72e0S<#O@c{wuHkvfacBSs7;VD ziV5ztEp#NiPXNDmMgkY*Pq01U5e}}6mK+`|c=mrQuC2RXZ?%kt&=>U_)!95NcjSWu z|3w-qv0crXDDZ$-B+qnz69WAVIKmNnG64=(QVA7OLpp8%--kc?h$5EQK%idC6<@qM zxddVRCy1PlG^LL}T>G>#?4j;Q#90Sq2{O9CCV*nB&mhV(`d_HCHIWi5(#@CE5dJaX zp%r0(oP28}TBX$*nPT8+zczR8Bdq=7*-X@z_}8e@>EA+!|5c~$Mu_i2OEoj#DJFib za(=J})-(juP$pl2NV^%V%}uizP@ukSxANMIM#kl_Xi+RpYd>7OS|7b}%kbxwgVCv4^f;Sv662> zI)rXG=$UO1I-2K_s4Fhin!7eg8}>%wF2IdzLS@gI!;sr?E$jd&1{H>It-{cwZ8b!j zBM$WOGXOPtok)gnqstd2_CX5f*IW024p()B?O0=un$2sv1q25=xcb)xEpzG55kRZM zfGr^{=D|W97JnEtEu`iE6ykP0QHr9!@C6^SMs9~CV50u~Z}kp*&9w+{jMF5|;t$MddW;-L$nT^C96Kf$5nD4hkRu#2% z+L643fmpG|**Ye&QjhifyY;%O4rKG$Lh2WpP)N(B9esSb}3F z4<369^zy7x=g@oW10-lICc_Ctql^h?cePyhQ_zpB;S&5taEDY5=#?-vegizIsXwyw zIs;z+083u)7kfMs1;MySwH~b#r=Qwix)987|dtjRE>5JQFtj zh_c2PQ5cksp-TszS!Fz0@LRJ0!|ew%R6i=$B-#@B7oxW^Q%N&DKWU|*p|1}dI13^2 z&eymElLOS(nwQx;0P{ya2V~c8>L~`-VoPg0YSlvFAoqL?`G`C%9ViXfq*0C<7c-8o zO+Zl}c5jKNGkmHY_f2K~w1{43DbagMTDF$uHc_Hg#!Ng`btq;`Z7zI|Q$3jPo*>im za`GCA#97wKV}(IU zI7@~8DQ+Gz0FE$1=IG!7FmH@WIR@Dw*GvwStDkr=^ToKA?w%RJGs~0#WJU*=9fuw zO*E|*Pr)8;lI@>D8mqa>DQ;!;Wuh5Q_^ij=%H17AFhx)?By=yti7;UDK6XUTjoadZ zNN~gfG5N)iZ#;7DYq2Al4S#xs;m9b90<(x6M8KB$tB8x1{c#QW zq*3M{Acnj^Vw5BOS@<3RQhAfr`otWxg|NU)uzD^t8W`Cz_?Qp^)zlFR_&6H|cbXjL zu*P9`zp4IVTG&gZ6`6;qt#dUbMaOtLM7Jt{bvXI}aSq@QXrn=Co;T8i z$O0gg*9lg_{sU*(n{4 z(2L9-T(OxL2^i%z!3yE7cO?d`s$m1@N*bLN_}>L*z#9h_?a2lYrrbVQ#y{$t0ZN3j^1>X)~Rb|C%gC- z>H%chh|9g2rJ6Ws?9V{r+mVnaGocl@$D?>Kd`#^IdYj=$cv7;uDh+14b%%UdD5X`+ zg9@^bzlw|^qgS}MkLB0Bcb|Uf@^~Xo*ejJMq!q#L+=ZJn@9zQOy(4RNlLeCo-Gm6@ z>Q8MQN-`33%;VhWfoE!fGzHU#)e|hGcKiH>*0xxM{4C&?w=7f?VfSW=BY~59{o350 zHoj#i<gIg6)0&owxd+xRA)(2m6^e$;+Nk@N3Z7HK8s&$=!fy5CCSXC9~v-^ zc#iL=iOy4{C~Pq5L#7EVVxvq5bxXB#1^iXGKS>GCiDWdE?qtrO`+o#kz;j8e?nBzX ze*t49FLnoBf<=*hLYIv!RWJD=ohc;iPK%@`Ur+`OC5GDvG``(fKNCEWS~#-ytX}Z{ zY=?U*C0_bBbMbi&PnF$evhqiq$VFB8+tNTB#|rI8vHi}S>4d7e zEnodyZuKf+x;gMB1qa-D1try(@~`N@MUI2+3P0%gw0M|zf}g8658?+WWH&Fvp{D2| zK_$p_Td%>j8txu<%|ZK5a4U%or$^b=qKbmIoHEV*EePM3$Sj0w7`k?A%du_Ij$OG$ zd4r=FHB*0iXe)mK47m=Rz`8Io$B41k4VN| zvvWr_G9BwFFU4up+)2DRwT(H_BMT|4nVYV8ho~_~myR$r%tcX}*t!srZbw&tW5g`8 zs#XYFlr+lC*YB;ru+N}>+xiD;YSjTYo$b?iWKO(&Z+0_3@kXAW*nnD4jOds+Tk4O4 z8)ga9vj8srr=M)&PlQGkeA2O&4(%Kq^mzB7J+RAxu^@x1FmH^%xy069jeqzz5vCG$`w#5N zcOEXc#?Jl2M@pN7{pzUw&1YQsG0#3)E#>w7@o!x=YTIL_x>E?B8)^BJkYwUTg?=!} 
z564Ae*2^U$Z@y+@+=XyBBo%~)&o-&!&`o7C!J=N$`&K|H*8gp`qlz80scvXhPqw*AiP-?PbnAB5ui5 zzaAB>o=?L$?c(!Y1U(1)!q8Il?RTeYWysNVpuUbdO#U}=xZJj zH|WSQB4@Q*w|i_=xXn(C~(Hsl*GBz))7Qte4#$ZTeI02C-EW zcIZ7mb7RDnF4!xadS-BgBAC1>bA8Gn{ z#^G?w-!|%>*5`k+p22p>dGJ~KKmKgYuERD-ZO5!*DkBSXrRA+Q+kgXFA|_<7zFltr z7`H+3xhP#UDr6fXr7o+)$F}8Aw29=9|C~~SB*Ah^NN%n1)WBbmTCw*FLs#sml!GN3 zJ31N#;5lkh!{u3>7Dax1TXnT02+Dpc?S1mJ+TgEMSYp()bgiNvUxPgIXvBBQO82gs zTDZlES}xU>+P>h7!=^7v-@`hT)}_?7w}8EfSJPdo-w=h2E5Z5no*j9Am+y#@kdMAx z22rvRY^iM^7pmK%JZhKAo<14+d~3)fA{V*MBP`A*NoyVl7|yivvt81Co5NNM$qk|Q z3Ci-r|GYjL=J>exbQ>MiVz4dFkL|o~T1#%8jPx0TBvrJ`>^`f2AJ7G82yYIBo3A#+ z$ghIziRIjP->DYcG#qKy`2aEYjg_K4i9%0)k&n`EAH zwmqTM{KWV=rkapz5G}fE$85c|rG@^)CO{UTD08sdOo>g>GM>lx9?2L|+@hhPJ}l}! zv0lD;#(s5R0c!ftSbfWQ_hc^3h+I%X%*wM%;O!HBG3JYHeT6E1)eVxO+~UNZ&^kjW z>1&UTzLxD?R=HY^?2xML&oszR1kQPH%^rQUVIp2xUV(l$PV%-`SA8(vJb2sFXWdW* z*<{LO3DB)QGaV-|Z6NgDS)7*v)iu`K7%~jhnJ`E3pFEfRFp; z^-QUQelS9)vTL+Qd#^ohD*Y`D7V zEA!gbQ>Db*(6iKJWG1;6EZyP*|J$PZTcF$_T zMycvFCe#TyzPa8ef_svGSIxr5Hh&ZS1vh`2ov}~rts7~ezhvRujhf0d`nSG0it4`A zVTpq;J>C+Q?r_7UvcC8aYczzY?l3SV=Pkb8CAd?8p)0gaYFN2HrX@X&P))9lFKSQu zxG9zyK0K>K)P=sy+vU6hdTx3>f=aW#f^p z6d(HZA$2hZS8?OtiWyM}+tfB}B>tl4FTIWNgITe6;V13@cd_rDH__wPxR-b%u{o;k zyT+aP2ie>V`3+varzZ|X#qply-0YGoj>p}gNA8l>Hs3#RZ~H~A_rgadNyUq&8(|3_ zGRl@*zsZE0j-ml~sg#HJmJISv_)qSUQ#<;(d5x@OWA zN(s^M_Wn6SpJm&_Lm{ukbWEpN<8F3u%e+Mfm0NA1_9S6*5kybxbS$Y<#z9B01Ini~ zRN5>L1tQlh8b~35x8yJMyH+~K^hQAbxBtN4!*QpbK1X*hl3(~?wS5#%k zG~u$)?6Kws`ulb`GtScF%~y;n(-)0RZrPRAy|+&{NLaB;S4yPUmhGvQLwKu};f~g& z;o3Dnll7v>d8wKQ3Ewn7jHJ5vS`?oBEQQ;RE&$QVo}f<>K$I$JJxE3Llj_nLbW^jY7}3fcp6F z8YJ@a?Vnl$_M66WnB2zt#n1O%ONs3_^{iZ^$l!R;A?!*4DBiG?A^`L!EG zxMxnutA2o1le=E?m2ru|`({ROEov^@n>4@QZA8x;KP!uU{y;fl_U?{L&^0)N zI!^qz;+R+zr*6Gt)bgyaRFcVAJSj00I2Vxc8bV$e!{vq&yNCl3N}IE^Gkss8fqovw1TF@F6bm~Rf+nyZ)x8}$+oc}g None: + dest.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(src, dest) + + +def _copy_glob(src_dir: Path, pattern: str, dest_dir: Path) -> None: + for path in src_dir.glob(pattern): + if path.is_file(): + _copy_file(path, dest_dir / path.name) + + +def main() -> int: + root = Path(__file__).resolve().parents[1] + public_dir = root / "public" + + report_src = root / "REPORT.md" + if report_src.exists(): + _copy_file(report_src, public_dir / "REPORT.md") + + plots_dir = root / "outputs" / "plots" + if plots_dir.exists(): + _copy_glob(plots_dir, "*.png", public_dir / "plots") + + results_report = root / "outputs" / "results" / "report.md" + if results_report.exists(): + _copy_file(results_report, public_dir / "reports" / "results_report.md") + + rub_results_dir = root / "outputs" / "rub" / "results" + if rub_results_dir.exists(): + for name in [ + "report.md", + "report_gpt4o.md", + "report_gpt41.md", + "compare_gpt4o_gpt41.md", + ]: + src = rub_results_dir / name + if src.exists(): + dest_name = name.replace("report", "rub_report", 1) + _copy_file(src, public_dir / "reports" / dest_name) + + index_path = public_dir / "INDEX.md" + lines = [ + "# Public Bundle", + "", + "This directory contains the public-ready benchmark artifacts.", + "", + "## Contents", + "- REPORT.md", + "- reports/", + "- plots/", + "", + "Generated by scripts/publicize.py.", + "", + ] + index_path.write_text("\n".join(lines), encoding="utf-8") + + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/benchmark/scripts/publicize.sh b/benchmark/scripts/publicize.sh new file mode 100644 index 0000000..f062326 --- /dev/null +++ b/benchmark/scripts/publicize.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash +set -euo pipefail + 
+script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +bench_dir="$(dirname "$script_dir")" + +python_bin="$bench_dir/.venv/bin/python" +if [[ -f "$python_bin" ]]; then + "$python_bin" "$script_dir/publicize.py" +else + python "$script_dir/publicize.py" +fi From cbd3abaa4ace5a8240e7e0bb9762f4642d36636c Mon Sep 17 00:00:00 2001 From: harumiWeb Date: Thu, 29 Jan 2026 21:30:08 +0900 Subject: [PATCH 35/38] feat: Add note about initial benchmark and future expansion --- benchmark/REPORT.md | 1 + benchmark/public/REPORT.md | 1 + 2 files changed, 2 insertions(+) diff --git a/benchmark/REPORT.md b/benchmark/REPORT.md index 4bfb840..aca6354 100644 --- a/benchmark/REPORT.md +++ b/benchmark/REPORT.md @@ -21,6 +21,7 @@ Sources: - Model: gpt-4o (Responses API) - Temperature: 0.0 - Note: record the run date/time when publishing +- This is an initial benchmark (n=12) and will be expanded in future releases. ## Core Benchmark (extraction + scoring) diff --git a/benchmark/public/REPORT.md b/benchmark/public/REPORT.md index 4bfb840..aca6354 100644 --- a/benchmark/public/REPORT.md +++ b/benchmark/public/REPORT.md @@ -21,6 +21,7 @@ Sources: - Model: gpt-4o (Responses API) - Temperature: 0.0 - Note: record the run date/time when publishing +- This is an initial benchmark (n=12) and will be expanded in future releases. ## Core Benchmark (extraction + scoring) From 275d4587847fc2bd3279833388ffa354210557ba Mon Sep 17 00:00:00 2001 From: harumiWeb Date: Thu, 29 Jan 2026 21:46:10 +0900 Subject: [PATCH 36/38] feat: Add benchmark section with reports and charts to documentation --- README.ja.md | 9 +++++++++ README.md | 9 +++++++++ docs/README.en.md | 9 +++++++++ docs/README.ja.md | 9 +++++++++ 4 files changed, 36 insertions(+) diff --git a/README.ja.md b/README.ja.md index d1b8583..bd63662 100644 --- a/README.ja.md +++ b/README.ja.md @@ -17,6 +17,15 @@ ExStruct は Excel ワークブックを読み取り、構造化データ(セ - **CLI レンダリング**(Excel 必須): PDF とシート画像を生成可能。 - **安全なフォールバック**: Excel COM 不在でもプロセスは落ちず、セル+テーブル候補+印刷範囲に切り替え(図形・チャートは空)。 +## ベンチマーク + +![Benchmark Chart](benchmark/outputs/plots/markdown_quality.png) + +このリポジトリには、ExcelドキュメントのRAG/LLM前処理に焦点を当てたベンチマークレポートが含まれています。 +私たちは2つの視点から追跡しています。(1) コア抽出精度と (2) 下流構造クエリのための再構築ユーティリティ (RUB) です。 +作業サマリーについては`benchmark/REPORT.md`を、公開バンドルについては`benchmark/public/REPORT.md`を参照してください。 +現在の結果はn=12のケースに基づいており、今後さらに拡張される予定です。 + ## インストール ```bash diff --git a/README.md b/README.md index 6f0aeb2..3e11687 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,15 @@ ExStruct reads Excel workbooks and outputs structured data (cells, table candida - **CLI rendering** (Excel required): optional PDF and per-sheet PNGs. - **Graceful fallback**: if Excel COM is unavailable, extraction falls back to cells + table candidates without crashing. +## Benchmark + +![Benchmark Chart](benchmark/outputs/plots/markdown_quality.png) + +This repository includes benchmark reports focused on RAG/LLM preprocessing of Excel documents. +We track two perspectives: (1) core extraction accuracy and (2) reconstruction utility for downstream structure queries (RUB). +See `benchmark/REPORT.md` for the working summary and `benchmark/public/REPORT.md` for the public bundle. +Current results are based on n=12 cases and will be expanded. 
+ ## Installation ```bash diff --git a/docs/README.en.md b/docs/README.en.md index 1110e14..c94cd26 100644 --- a/docs/README.en.md +++ b/docs/README.en.md @@ -19,6 +19,15 @@ ExStruct reads Excel workbooks and outputs structured data (cells, table candida - **CLI rendering** (Excel required): optional PDF and per-sheet PNGs. - **Graceful fallback**: if Excel COM is unavailable, extraction falls back to cells + table candidates without crashing. +## Benchmark + +![Benchmark Chart](../benchmark/outputs/plots/markdown_quality.png) + +This repository includes benchmark reports focused on RAG/LLM preprocessing of Excel documents. +We track two perspectives: (1) core extraction accuracy and (2) reconstruction utility for downstream structure queries (RUB). +See `benchmark/REPORT.md` for the working summary and `benchmark/public/REPORT.md` for the public bundle. +Current results are based on n=12 cases and will be expanded. + ## Installation ```bash diff --git a/docs/README.ja.md b/docs/README.ja.md index 69e02df..f6dec91 100644 --- a/docs/README.ja.md +++ b/docs/README.ja.md @@ -17,6 +17,15 @@ ExStruct は Excel ワークブックを読み取り、構造化データ(セ - **CLI レンダリング**(Excel 必須): PDF とシート画像を生成可能。 - **安全なフォールバック**: Excel COM 不在でもプロセスは落ちず、セル+テーブル候補+印刷範囲に切り替え(図形・チャートは空)。 +## ベンチマーク + +![Benchmark Chart](../benchmark/outputs/plots/markdown_quality.png) + +このリポジトリには、ExcelドキュメントのRAG/LLM前処理に焦点を当てたベンチマークレポートが含まれています。 +私たちは2つの視点から追跡しています。(1) コア抽出精度と (2) 下流構造クエリのための再構築ユーティリティ (RUB) です。 +作業サマリーについては`benchmark/REPORT.md`を、公開バンドルについては`benchmark/public/REPORT.md`を参照してください。 +現在の結果はn=12のケースに基づいており、今後さらに拡張される予定です。 + ## インストール ```bash From 552977d857004df3b78e02afa6757996453ea76b Mon Sep 17 00:00:00 2001 From: harumiWeb Date: Thu, 29 Jan 2026 21:51:19 +0900 Subject: [PATCH 37/38] feat: Exclude benchmark directory from coverage and linting checks --- .codacy.yml | 2 ++ codecov.yml | 2 ++ pyproject.toml | 2 ++ 3 files changed, 6 insertions(+) create mode 100644 .codacy.yml diff --git a/.codacy.yml b/.codacy.yml new file mode 100644 index 0000000..da332f7 --- /dev/null +++ b/.codacy.yml @@ -0,0 +1,2 @@ +exclude_paths: + - "benchmark/**" diff --git a/codecov.yml b/codecov.yml index 8998011..4b9abbd 100644 --- a/codecov.yml +++ b/codecov.yml @@ -8,6 +8,8 @@ coverage: default: target: auto threshold: 1% +ignore: + - "benchmark/**" flags: unit: paths: diff --git a/pyproject.toml b/pyproject.toml index 535ba96..128e8ed 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -74,6 +74,7 @@ omit = [ target-version = "py311" src = ["exstruct"] fix = true +exclude = ["benchmark/**"] # 静的解析ルール [tool.ruff.lint] @@ -113,6 +114,7 @@ max-complexity = 12 [tool.mypy] packages = ["exstruct"] python_version = "3.11" +exclude = "benchmark/.*" # 外部ライブラリの型情報がない場合は無視 ignore_missing_imports = true From 53890f1ae8d006570acdb1212afa2f4b7c4add2d Mon Sep 17 00:00:00 2001 From: harumiWeb Date: Thu, 29 Jan 2026 22:33:08 +0900 Subject: [PATCH 38/38] fix: Update benchmark chart paths in documentation and scripts for consistency --- README.ja.md | 2 +- README.md | 2 +- benchmark/REPORT.md | 2 +- benchmark/data/manifest.json | 6 +- benchmark/docs/spec.md | 143 ++++++++---------- benchmark/public/REPORT.md | 2 +- benchmark/rub/manifest.json | 46 +++--- benchmark/rub/manifest_lite.json | 40 +++-- benchmark/scripts/publicize.ps1 | 3 +- benchmark/scripts/publicize.sh | 2 +- benchmark/scripts/reproduce.ps1 | 6 +- benchmark/src/bench/eval/markdown_score.py | 7 +- benchmark/src/bench/llm/openai_client.py | 5 + benchmark/src/bench/pipeline/image_render.py | 19 ++- 
.../src/bench/pipeline/openpyxl_pandas.py | 67 ++++---- benchmark/src/bench/pipeline/pdf_text.py | 17 ++- benchmark/src/bench/rub/manifest.py | 1 + docs/README.en.md | 2 +- docs/README.ja.md | 2 +- 19 files changed, 203 insertions(+), 171 deletions(-) diff --git a/README.ja.md b/README.ja.md index bd63662..9071fe2 100644 --- a/README.ja.md +++ b/README.ja.md @@ -19,7 +19,7 @@ ExStruct は Excel ワークブックを読み取り、構造化データ(セ ## ベンチマーク -![Benchmark Chart](benchmark/outputs/plots/markdown_quality.png) +![Benchmark Chart](benchmark/public/plots/markdown_quality.png) このリポジトリには、ExcelドキュメントのRAG/LLM前処理に焦点を当てたベンチマークレポートが含まれています。 私たちは2つの視点から追跡しています。(1) コア抽出精度と (2) 下流構造クエリのための再構築ユーティリティ (RUB) です。 diff --git a/README.md b/README.md index 3e11687..20e5283 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ ExStruct reads Excel workbooks and outputs structured data (cells, table candida ## Benchmark -![Benchmark Chart](benchmark/outputs/plots/markdown_quality.png) +![Benchmark Chart](benchmark/public/plots/markdown_quality.png) This repository includes benchmark reports focused on RAG/LLM preprocessing of Excel documents. We track two perspectives: (1) core extraction accuracy and (2) reconstruction utility for downstream structure queries (RUB). diff --git a/benchmark/REPORT.md b/benchmark/REPORT.md index aca6354..e146d3a 100644 --- a/benchmark/REPORT.md +++ b/benchmark/REPORT.md @@ -1,4 +1,4 @@ -# Benchmark Summary (Public) +# Benchmark Summary (Public) This summary consolidates the latest results for the Excel document benchmark and RUB (structure query track). Use this file as a public-facing overview and link diff --git a/benchmark/data/manifest.json b/benchmark/data/manifest.json index 70a53f1..7bedfbf 100644 --- a/benchmark/data/manifest.json +++ b/benchmark/data/manifest.json @@ -135,8 +135,8 @@ { "id": "food_inspection_record_01", "type": "inspection_log", - "xlsx": "data/raw/food_inspection record_01.xlsx", - "question": "This workbook contains three sheets (\"検食簿(1)\", \"検食簿 (2)\", \"検食簿 (3)\"). For the first date on each sheet, extract the lunch menu items and snack items and return JSON in the following format:\n\n{\n \"sheets\": {\n \"検食簿(1)\": {\"date\": \"...\", \"lunch_menu\": [\"...\"], \"snacks\": [\"...\"]},\n \"検食簿 (2)\": {\"date\": \"...\", \"lunch_menu\": [\"...\"], \"snacks\": [\"...\"]},\n \"検食簿 (3)\": {\"date\": \"...\", \"lunch_menu\": [\"...\"], \"snacks\": [\"...\"]},\n }\n}\n\nJSON only.", + "xlsx": "data/raw/food_inspection_record_01.xlsx", + "question": "This workbook contains three sheets (\"検食簿(1)\", \"検食簿 (2)\", \"検食簿 (3)\"). For the first date on each sheet, extract the lunch menu items and snack items and return JSON in the following format:\n\n{\n \"sheets\": {\n \"検食簿(1)\": {\"date\": \"...\", \"lunch_menu\": [\"...\"], \"snacks\": [\"...\"]},\n \"検食簿 (2)\": {\"date\": \"...\", \"lunch_menu\": [\"...\"], \"snacks\": [\"...\"]},\n \"検食簿 (3)\": {\"date\": \"...\", \"lunch_menu\": [\"...\"], \"snacks\": [\"...\"]}\n }\n}\n\nJSON only.", "truth": "data/truth/food_inspection_record_01.json", "sheet_scope": null, "render": { @@ -145,4 +145,4 @@ } } ] -} \ No newline at end of file +} diff --git a/benchmark/docs/spec.md b/benchmark/docs/spec.md index bdf10aa..2f52117 100644 --- a/benchmark/docs/spec.md +++ b/benchmark/docs/spec.md @@ -1,93 +1,87 @@ -# Reconstruction Utility Benchmark (RUB) Specification +# Reconstruction Utility Benchmark (RUB) Specification -## 0. ????v0.1 / lite? +## 0. Scope (v0.1 / lite vs v1) -???????? **RUB lite (v0.1)** ?????? -???????????????????? 
0 ???????????????? +RUB lite (v0.1) is a small, fast-running subset intended for quick checks. +The full RUB (v1) is the primary benchmark for public reporting. -RUB lite ??????? +RUB lite assets: - benchmark/rub/manifest_lite.json - benchmark/rub/truth_lite/*.json -????v1??????????????? +Full RUB assets: -## 1. ?? +- benchmark/rub/manifest.json +- benchmark/rub/truth/*.json -RUB ???????? Markdown ???????????????????????? -Markdown ?????????????????Reconstruction Utility?????????? +## 1. Goal -## 2. ???? +RUB measures how useful reconstructed Markdown is for downstream structure-aware +queries. The target is reconstruction utility rather than raw string similarity. -- ??: ?? Excel ?? -- ??: pdf / image_vlm / exstruct / html / openpyxl -- ??: ???????? Markdown -- ??: Markdown ?????????????????? +## 2. Inputs and outputs -## 3. ??????2??? +- Input: Excel workbooks (.xlsx) +- Methods: pdf, image_vlm, exstruct, html, openpyxl +- Stage A output: reconstructed Markdown +- Stage B output: JSON-only answers to structure queries -### Stage A: ??? +## 3. Two-stage evaluation -???? Markdown ?????? +### Stage A: Reconstruction -- pdf: soffice ? pdf ? ?????? ? Markdown -- image_vlm: ???????? ? VLM ? Markdown -- exstruct: exstruct JSON ? LLM ? Markdown -- html / openpyxl: ?????? ? Markdown +Each method produces Markdown from the same source workbook. -### Stage B: ???????? +- pdf: soffice -> pdf -> text extraction -> Markdown +- image_vlm: render -> VLM -> Markdown +- exstruct: exstruct JSON -> LLM -> Markdown +- html / openpyxl: rule-based extraction -> Markdown -Stage A ? Markdown ?????????????????? +### Stage B: Structure queries -- ??? JSON ?? -- JSON ??????? -- ?????????????deterministic? +Only the Stage A Markdown is used as input to answer queries. -## 4. ??????? +- Output must be JSON only +- Scored by exact match after deterministic normalization -- ?????????????????? -- ?????????????????? Markdown ?? -- ????????????????? - - ?????????? - - ?????????/???? - - ?????????? - - ?????????? -- ????????????????????????? +## 4. Task design principles -## 5. ???????? +- Prefer tasks that require structure (blocks, hierarchy, adjacency) +- Avoid tasks that are solvable by surface text order alone +- Define canonical JSON outputs +- Use deterministic normalization for fairness -?????????????? +## 5. Scoring and normalization -- ???: ???????????? 1 ?????? - ??? -- ??: ??????canonicalization? -- ??: ????????????????? -- ??: ???????????: "012" ? 12? +- Normalize strings and JSON structure before comparison +- For unordered collections, compare as sorted sets +- Avoid ambiguous numbering in answers -## 6. ???? +## 6. Metrics -### 6.1 ???: RUS +### 6.1 Primary metric: RUS -RUS = ??? / ??? +RUS = correct_answers / total_questions -### 6.2 ??? +### 6.2 Secondary metrics - Cost-normalized RUS = RUS / cost_usd - Token-normalized RUS = RUS / input_tokens -- Stage A failure rate = Markdown ????? +- Stage A failure rate = failed Markdown reconstruction rate -## 7. ????? +## 7. Directory layout ``` benchmark/ rub/ README.md BENCHMARK_SPEC.md - manifest.json # ??? (v1) - manifest_lite.json # ??? (v0.1 / lite) - truth/ # ??? (v1) + manifest.json # full (v1) + manifest_lite.json # lite (v0.1) + truth/ # full (v1) *.json - truth_lite/ # ??? (v0.1 / lite) + truth_lite/ # lite (v0.1) *.json schemas/ *.schema.json @@ -99,34 +93,31 @@ benchmark/ scoring_flow.mmd ``` -## 8. manifest ????? +## 8. Manifest fields -- id: ???ID -- type: ????? -- xlsx: ??????? -- question: Stage B ??? -- truth: ?? JSON ?? 
-- sheet_scope: ??????null ????? -- render: ??????? +- id: task id +- type: task type +- xlsx: input workbook path +- question: Stage B query +- truth: ground-truth JSON path +- sheet_scope: optional sheet filter (null = all) +- render: render settings for image/pdf paths +- track: evaluation track name (default: reconstruction) -## 8.1 RUB lite ???? +## 8.1 RUB lite notes -- ?????????????? -- ??????unordered_paths????????????? -- 0/1 ???????????????????????? +- Smaller number of cases +- Unordered paths supported for strict but fair comparison +- Binary scoring (0/1) only -## 9. ??? +## 9. Evaluation notes -- ??????????????? -- ????????????????? -- ????????????? 0 ?? +- Do not use Markdown string similarity for RUB scoring +- Focus on task correctness and structure preservation +- Keep normalization deterministic and transparent -## 10. ?????? +## 10. Reporting -- ?Markdown ??????????????? -- RUS?????????????????? -- ??????????????????? - ---- - -????? v0.1?lite?????????????????? +- Public report focuses on reconstruction utility +- Show both primary and secondary metrics +- Clearly separate core extraction vs RUB results diff --git a/benchmark/public/REPORT.md b/benchmark/public/REPORT.md index aca6354..e146d3a 100644 --- a/benchmark/public/REPORT.md +++ b/benchmark/public/REPORT.md @@ -1,4 +1,4 @@ -# Benchmark Summary (Public) +# Benchmark Summary (Public) This summary consolidates the latest results for the Excel document benchmark and RUB (structure query track). Use this file as a public-facing overview and link diff --git a/benchmark/rub/manifest.json b/benchmark/rub/manifest.json index d6078ca..741d785 100644 --- a/benchmark/rub/manifest.json +++ b/benchmark/rub/manifest.json @@ -5,100 +5,112 @@ "source_case_id": "ffr_425_01", "type": "application_form", "question": "このExcel帳票(Federal Financial Report / SF-425)について、次の情報を抽出し、JSONのみで返してください。\n\n(1) checkbox_groups: チェックボックスのグループ名と、その選択肢ラベル一覧を抽出してください(\"Report Type\" と \"Basis of Accounting\" の2グループのみ)。\n(2) not_required_by_epa_scope: 赤字の注記 \"Not Required by EPA\" がかかっているセクション名を返してください(例: \"Federal Cash\")。\n(3) section_headers: 帳票上部の番号付きセクション見出し(1〜9)のうち、見出しテキストのみを配列で返してください(例: \"Federal Agency and Organizational Element to Which Report is Submitted\" など)。\n\n出力形式(厳守):\n{\n \"checkbox_groups\": {\n \"Report Type\": [\"Quarterly\", \"Semi-Annual\", \"Annual\", \"Final\"],\n \"Basis of Accounting\": [\"Cash\", \"Accrual\"]\n },\n \"not_required_by_epa_scope\": \"...\",\n \"section_headers\": [\"...\", \"...\", \"...\"]\n}\n\n注意:\n- チェックボックスの記号(□など)は含めないでください。ラベル文字列のみを返してください。\n- section_headers は表示順(上から左→右)で返してください。\n- 余分な説明文やコードフェンスは付けず、JSONのみを返してください。", - "truth": "rub\\truth\\ffr_425_01.json" + "truth": "rub/truth/ffr_425_01.json", + "track": "reconstruction" }, { "id": "flowchart_01", "source_case_id": "flowchart_01", "type": "flowchart", "question": "このフローチャートの開始から終了までの主要な処理ステップを順番に抽出し、次のJSON形式のみで返してください。\n\n出力形式(厳守):\n{\n \"steps\": [\"step1\", \"step2\", \"step3\", ...]\n}\n\n注意事項:\n- 開始ノードと終了ノードも含めてください\n- 分岐やループがある場合は、代表的な主経路として線形化してください\n- ステップ名は図中のラベル文字列をそのまま使用してください", - "truth": "rub\\truth\\flowchart_01.json" + "truth": "rub/truth/flowchart_01.json", + "track": "reconstruction" }, { "id": "gantt_01", "source_case_id": "gantt_01", "type": "gantt", "question": "このガントチャートのPhase3のタスク名とその開始日、終了日を抽出し、次のJSON形式のみで返してください: {\"tasks\":[{\"name\":\"...\",\"start_date\":\"YYYY-MM-DD\",\"end_date\":\"YYYY-MM-DD\"}, ...]}", - "truth": "rub\\truth\\gantt_01.json", + "truth": "rub/truth/gantt_01.json", "unordered_paths": [ "tasks" - ] + ], 
+ "track": "reconstruction" }, { "id": "certificate_of_employment_01", "source_case_id": "certificate_of_employment_01", "type": "application_form", "question": "このExcel帳票(就労証明書)について、次の3点を抽出し、JSONのみで返してください。\n\n(1) checkbox_groups: 以下の3つのチェックボックス項目について、それぞれの選択肢ラベルを抽出してください。\n - 業種\n - 雇用の形態\n - 雇用(予定)期間等(無期 / 有期)\n\n(2) numbered_sections: 帳票の「No.」列に対応する番号付き項目の見出し(1〜14)を、番号をキーとして抽出してください。\n\n(3) warning_text: 赤字で記載されている注意文を、そのまま1つの文字列として抽出してください。\n\n出力形式(厳守):\n{\n \"checkbox_groups\": {\n \"業種\": [\"...\", \"...\"],\n \"雇用の形態\": [\"...\", \"...\"],\n \"雇用(予定)期間等\": [\"...\", \"...\"]\n },\n \"numbered_sections\": {\n \"1\": \"...\",\n \"2\": \"...\",\n \"3\": \"...\"\n },\n \"warning_text\": \"...\"\n}\n\n注意:\n- チェックボックス記号(□など)は含めず、ラベル文字列のみを返してください。\n- numbered_sections は 1〜14 すべてを含めてください。\n- 余分な説明文やコードフェンスは付けず、JSONのみを返してください。", - "truth": "rub\\truth\\certificate_of_employment_01.json" + "truth": "rub/truth/certificate_of_employment_01.json", + "track": "reconstruction" }, { "id": "tax_report_01", "source_case_id": "tax_report_01", "type": "application_form", "question": "この市民税・県民税申告書の右側に配置されている縦方向の帳票構造を解析してください。\n\n次の条件をすべて満たすJSONを返してください。\n\n1. 「収入金額」ブロックに含まれる項目名を、上から順に配列で列挙してください。\n2. 上記項目群を視覚的にまとめている「合計」項目名を1つ指定してください。\n3. 「所得から差し引かれる金額」ブロックに含まれる項目名を、上から順に配列で列挙してください。\n4. 上記控除項目群を視覚的にまとめている「合計」項目名を1つ指定してください。\n\n制約:\n- 項目名は帳票に記載されている日本語表記をそのまま使用してください。\n- 数値、記号、注釈文は含めないでください。\n- 同一列・同一枠内にある項目同士の位置関係に基づいて判断してください。\n- JSONのみを返してください。\n\n出力形式:\n{\n \"income_items\": [\"...\", \"...\"],\n \"income_total\": \"...\",\n \"deduction_items\": [\"...\", \"...\"],\n \"deduction_total\": \"...\"\n}", - "truth": "rub\\truth\\tax_report_01.json" + "truth": "rub/truth/tax_report_01.json", + "track": "reconstruction" }, { "id": "smartart_01", "source_case_id": "smartart_01", "type": "organization_chart", "question": "このExcel帳票(SmartArtで作成された組織図)について、次の3点を抽出し、JSONのみで返してください。\n\n(1) top_structure: 最上位から第2階層までの組織構造を、親子関係が分かる形で抽出してください。\n\n(2) sales_departments: 「営業部」の直下にある課の名称を、上から順に配列で返してください。\n\n(3) production_sites: 「生産部」の直下にある工場名を、上から順に配列で返してください。\n\n出力形式(厳守):\n{\n \"top_structure\": {\n \"取締役会\": {\n \"社長\": [\"...\"]\n }\n },\n \"sales_departments\": [\"...\", \"...\"],\n \"production_sites\": [\"...\", \"...\"]\n}\n\n注意:\n- 図形の色や配置座標は含めないでください。テキスト内容と階層関係のみを対象とします。\n- 余分な説明文やコードフェンスは付けず、JSONのみを返してください。", - "truth": "rub\\truth\\smartart_01.json", + "truth": "rub/truth/smartart_01.json", "unordered_paths": [ "top_structure.取締役会.社長", "sales_departments", "production_sites" - ] + ], + "track": "reconstruction" }, { "id": "basic_01", "source_case_id": "basic_01", "type": "mixed_document", "question": "このExcel帳票について、次の3点を抽出し、JSONのみで返してください。\n\n(1) sales_table: 左上の売上表について、月をキーとして各製品の数値を抽出してください。\n\n(2) chart_series: 右上の折れ線グラフに含まれる系列名を、凡例の表示順で配列として返してください。\n\n(3) flowchart_paths: 下部のフローチャートについて、開始から終了までの処理パスを条件付きで2通り抽出してください。\n - format_valid = true の場合の処理パス\n - format_valid = false の場合の処理パス\n\n出力形式(厳守):\n{\n \"sales_table\": {\n \"Jan-25\": {\"製品A\": 0, \"製品B\": 0, \"製品C\": 0},\n \"Feb-25\": {\"製品A\": 0, \"製品B\": 0, \"製品C\": 0}\n },\n \"chart_series\": [\"...\", \"...\"],\n \"flowchart_paths\": {\n \"format_valid_true\": [\"...\", \"...\"],\n \"format_valid_false\": [\"...\", \"...\"]\n }\n}\n\n注意:\n- 数値は整数で返してください。\n- フローチャートのパスは、図形内の文言をそのまま順番に並べてください。\n- 余分な説明文やコードフェンスは付けず、JSONのみを返してください。", - "truth": "rub\\truth\\basic_01.json" + "truth": "rub/truth/basic_01.json", + "track": "reconstruction" }, { "id": "heatstroke_flow_01", "source_case_id": "heatstroke_flow_01", "type": "flowchart", "question": 
"このExcelに記載されている熱中症対応フローについて、上から順に各対応ステップを抽出してください。各ステップについて、step_name(工程名)、description(内容要約)、special_conditions(条件や注意事項がある場合のみ配列で記載)を含むJSONを、次の形式のみで返してください。\n\n{\n \"steps\": [\n {\n \"step_name\": \"...\",\n \"description\": \"...\",\n \"special_conditions\": [\"...\"]\n }\n ]\n}", - "truth": "rub\\truth\\heatstroke_flow_01.json" + "truth": "rub/truth/heatstroke_flow_01.json", + "track": "reconstruction" }, { "id": "workflow_01", "source_case_id": "workflow_01", "type": "workflow", "question": "このExcelに記載されている業務フロー図(ネット注文フローチャート)について、工程を上から順に整理してください。各工程について、actor(実行主体)、step_name(工程名)、next_steps(次に進む工程名の配列)を含むJSONを、次の形式のみで返してください。\n\n{\n \"steps\": [\n {\n \"actor\": \"お客様|当社\",\n \"step_name\": \"...\",\n \"next_steps\": [\"...\"]\n }\n ]\n}", - "truth": "rub\\truth\\workflow_01.json", + "truth": "rub/truth/workflow_01.json", "unordered_paths": [ "steps", "steps.next_steps" - ] + ], + "track": "reconstruction" }, { "id": "basic_form_01", "source_case_id": "basic_form_01", "type": "application_form", "question": "このExcel申請書に記載されている入力項目を、意味的なブロック単位で整理してください。申請者本人に関する項目、配偶者に関する項目、収入等に関する申告、預貯金等に関する申告の4分類に分け、それぞれに含まれる項目名を配列でまとめたJSONを、次の形式のみで返してください。\n\n{\n \"applicant\": [],\n \"spouse\": [],\n \"income_declaration\": [],\n \"asset_declaration\": []\n}", - "truth": "rub\\truth\\basic_form_01.json" + "truth": "rub/truth/basic_form_01.json", + "track": "reconstruction" }, { "id": "flowchart_02", "source_case_id": "flowchart_02", "type": "flowchart", "question": "このExcelに記載されているログイン処理フローについて、工程を上から順に整理してください。各工程について、step_name(工程名)、step_type(start|process|decision|end)、next_steps(条件付き遷移を含む次工程)を含むJSONを、次の形式のみで返してください。\n\n{\n \"steps\": [\n {\n \"step_name\": \"...\",\n \"step_type\": \"start|process|decision|end\",\n \"next_steps\": [\n {\n \"condition\": \"...\",\n \"next\": \"...\"\n }\n ]\n }\n ]\n}", - "truth": "rub\\truth\\flowchart_02.json", + "truth": "rub/truth/flowchart_02.json", "unordered_paths": [ "steps", "steps.next_steps" - ] + ], + "track": "reconstruction" }, { "id": "food_inspection_record_01", "source_case_id": "food_inspection_record_01", "type": "inspection_log", "question": "This workbook contains three sheets (\"検食簿(1)\", \"検食簿 (2)\", \"検食簿 (3)\"). 
For the first date on each sheet, extract the lunch menu items and snack items and return JSON in the following format:\n\n{\n \"sheets\": {\n \"検食簿(1)\": {\"date\": \"...\", \"lunch_menu\": [\"...\"], \"snacks\": [\"...\"]},\n \"検食簿 (2)\": {\"date\": \"...\", \"lunch_menu\": [\"...\"], \"snacks\": [\"...\"]},\n \"検食簿 (3)\": {\"date\": \"...\", \"lunch_menu\": [\"...\"], \"snacks\": [\"...\"]},\n }\n}\n\nJSON only.", - "truth": "rub\\truth\\food_inspection_record_01.json" + "truth": "rub/truth/food_inspection_record_01.json", + "track": "reconstruction" } ] -} \ No newline at end of file +} diff --git a/benchmark/rub/manifest_lite.json b/benchmark/rub/manifest_lite.json index 5029568..cc5ca99 100644 --- a/benchmark/rub/manifest_lite.json +++ b/benchmark/rub/manifest_lite.json @@ -1,4 +1,4 @@ -{ +{ "tasks": [ { "id": "ffr_425_01", @@ -8,7 +8,8 @@ "truth": "rub/truth_lite/ffr_425_01.json", "unordered_paths": [ "section_headers" - ] + ], + "track": "reconstruction" }, { "id": "certificate_of_employment_01", @@ -18,7 +19,8 @@ "truth": "rub/truth_lite/certificate_of_employment_01.json", "unordered_paths": [ "sections" - ] + ], + "track": "reconstruction" }, { "id": "tax_report_01", @@ -28,7 +30,8 @@ "truth": "rub/truth_lite/tax_report_01.json", "unordered_paths": [ "income_items" - ] + ], + "track": "reconstruction" }, { "id": "basic_01", @@ -38,14 +41,16 @@ "truth": "rub/truth_lite/basic_01.json", "unordered_paths": [ "chart_series" - ] + ], + "track": "reconstruction" }, { "id": "heatstroke_flow_01", "source_case_id": "heatstroke_flow_01", "type": "flowchart", "question": "Extract step names in order. JSON only: {\"steps\":[\"...\"]}", - "truth": "rub/truth_lite/heatstroke_flow_01.json" + "truth": "rub/truth_lite/heatstroke_flow_01.json", + "track": "reconstruction" }, { "id": "workflow_01", @@ -55,7 +60,8 @@ "truth": "rub/truth_lite/workflow_01.json", "unordered_paths": [ "nodes" - ] + ], + "track": "reconstruction" }, { "id": "flowchart_02", @@ -65,14 +71,16 @@ "truth": "rub/truth_lite/flowchart_02.json", "unordered_paths": [ "nodes" - ] + ], + "track": "reconstruction" }, { "id": "food_inspection_record_01", "source_case_id": "food_inspection_record_01", "type": "inspection_log", "question": "Extract first date per sheet. JSON only: {\"dates_by_sheet\": {\"sheet\": \"date\"}}", - "truth": "rub/truth_lite/food_inspection_record_01.json" + "truth": "rub/truth_lite/food_inspection_record_01.json", + "track": "reconstruction" }, { "id": "basic_form_01", @@ -82,14 +90,16 @@ "truth": "rub/truth_lite/basic_form_01.json", "unordered_paths": [ "applicant_fields" - ] + ], + "track": "reconstruction" }, { "id": "flowchart_01", "source_case_id": "flowchart_01", "type": "flowchart", "question": "Extract flowchart step names in order. 
JSON only: {\"steps\":[\"...\"]}", - "truth": "rub/truth_lite/flowchart_01.json" + "truth": "rub/truth_lite/flowchart_01.json", + "track": "reconstruction" }, { "id": "gantt_01", @@ -99,7 +109,8 @@ "truth": "rub/truth_lite/gantt_01.json", "unordered_paths": [ "task_names" - ] + ], + "track": "reconstruction" }, { "id": "smartart_01", @@ -110,7 +121,8 @@ "unordered_paths": [ "sales_departments", "production_sites" - ] + ], + "track": "reconstruction" } ] -} \ No newline at end of file +} diff --git a/benchmark/scripts/publicize.ps1 b/benchmark/scripts/publicize.ps1 index 7d57ec7..a84fc2e 100644 --- a/benchmark/scripts/publicize.ps1 +++ b/benchmark/scripts/publicize.ps1 @@ -1,4 +1,4 @@ -Set-StrictMode -Version Latest +Set-StrictMode -Version Latest $ErrorActionPreference = "Stop" $scriptDir = Split-Path -Parent $MyInvocation.MyCommand.Path @@ -11,3 +11,4 @@ if (Test-Path $venvPython) { } python (Join-Path $scriptDir "publicize.py") +exit $LASTEXITCODE diff --git a/benchmark/scripts/publicize.sh b/benchmark/scripts/publicize.sh index f062326..14280dd 100644 --- a/benchmark/scripts/publicize.sh +++ b/benchmark/scripts/publicize.sh @@ -1,4 +1,4 @@ -#!/usr/bin/env bash +#!/usr/bin/env bash set -euo pipefail script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" diff --git a/benchmark/scripts/reproduce.ps1 b/benchmark/scripts/reproduce.ps1 index 22812c9..a799cf4 100644 --- a/benchmark/scripts/reproduce.ps1 +++ b/benchmark/scripts/reproduce.ps1 @@ -1,6 +1,3 @@ -Set-StrictMode -Version Latest -$ErrorActionPreference = "Stop" - param( [string]$Case = "all", [string]$Method = "all", @@ -9,6 +6,9 @@ param( [switch]$SkipAsk ) +Set-StrictMode -Version Latest +$ErrorActionPreference = "Stop" + function Write-Info { param([string]$Message) Write-Host "[reproduce] $Message" diff --git a/benchmark/src/bench/eval/markdown_score.py b/benchmark/src/bench/eval/markdown_score.py index 9178cdc..56399d0 100644 --- a/benchmark/src/bench/eval/markdown_score.py +++ b/benchmark/src/bench/eval/markdown_score.py @@ -62,8 +62,13 @@ def markdown_precision_score(truth_md: str, pred_md: str) -> float: def _normalized_lines(markdown: str) -> list[str]: """Normalize Markdown into comparable text lines.""" lines: list[str] = [] + in_code_block = False for raw in markdown.splitlines(): - if raw.strip().startswith("```"): + stripped = raw.strip() + if stripped.startswith("```"): + in_code_block = not in_code_block + continue + if in_code_block: continue norm = _normalize_line(raw) if not norm: diff --git a/benchmark/src/bench/llm/openai_client.py b/benchmark/src/bench/llm/openai_client.py index b3a8c7e..fd374ef 100644 --- a/benchmark/src/bench/llm/openai_client.py +++ b/benchmark/src/bench/llm/openai_client.py @@ -2,6 +2,7 @@ import base64 import json +import os from pathlib import Path from typing import Any @@ -60,6 +61,10 @@ class OpenAIResponsesClient: def __init__(self) -> None: load_dotenv(dotenv_path=ROOT / ".env") + if not os.getenv("OPENAI_API_KEY"): + raise RuntimeError( + "OPENAI_API_KEY is not set. Add it to .env or your environment." 
+            )
         self.client = OpenAI()
 
     def ask_text(
diff --git a/benchmark/src/bench/pipeline/image_render.py b/benchmark/src/bench/pipeline/image_render.py
index ec99e2e..dab6c82 100644
--- a/benchmark/src/bench/pipeline/image_render.py
+++ b/benchmark/src/bench/pipeline/image_render.py
@@ -19,16 +19,16 @@ def xlsx_to_pngs_via_pdf(
     tmp_pdf = out_dir / f"{xlsx_path.stem}.pdf"
     xlsx_to_pdf(xlsx_path, tmp_pdf)
 
-    doc = fitz.open(tmp_pdf)
-    zoom = dpi / 72.0
-    mat = fitz.Matrix(zoom, zoom)
-    paths: list[Path] = []
-    for i in range(min(doc.page_count, max_pages)):
-        page = doc.load_page(i)
-        pix = page.get_pixmap(matrix=mat, alpha=False)
-        p = out_dir / f"page_{i + 1:02d}.png"
-        pix.save(p)
-        paths.append(p)
+    paths: list[Path] = []
+    with fitz.open(tmp_pdf) as doc:
+        zoom = dpi / 72.0
+        mat = fitz.Matrix(zoom, zoom)
+        for i in range(min(doc.page_count, max_pages)):
+            page = doc.load_page(i)
+            pix = page.get_pixmap(matrix=mat, alpha=False)
+            p = out_dir / f"page_{i + 1:02d}.png"
+            pix.save(p)
+            paths.append(p)
     return paths
 
 
diff --git a/benchmark/src/bench/pipeline/openpyxl_pandas.py b/benchmark/src/bench/pipeline/openpyxl_pandas.py
index ec73dfe..4f4d038 100644
--- a/benchmark/src/bench/pipeline/openpyxl_pandas.py
+++ b/benchmark/src/bench/pipeline/openpyxl_pandas.py
@@ -11,35 +11,38 @@ def extract_openpyxl(
     xlsx_path: Path, out_txt: Path, sheet_scope: list[str] | None = None
 ) -> None:
     wb = openpyxl.load_workbook(xlsx_path, data_only=True)
-    sheets = sheet_scope or wb.sheetnames
-
-    lines: list[str] = []
-    lines.append("[DOC_META]")
-    lines.append(f"source={xlsx_path.name}")
-    lines.append("method=openpyxl")
-    lines.append("")
-    lines.append("[CONTENT]")
-
-    for sname in sheets:
-        if sname not in wb.sheetnames:
-            continue
-        ws = wb[sname]
-        lines.append(f"\n# SHEET: {sname}")
-        max_row = ws.max_row or 1
-        max_col = ws.max_column or 1
-
-        for r in range(1, max_row + 1):
-            row_cells = []
-            for c in range(1, max_col + 1):
-                v = ws.cell(r, c).value
-                if v is None:
-                    continue
-                txt = str(v).strip()
-                if not txt:
-                    continue
-                # 座標付きで記録(後で人間が確認しやすい)
-                row_cells.append(f"R{r}C{c}:{txt}")
-            if row_cells:
-                lines.append(" | ".join(row_cells))
-
-    write_text(out_txt, "\n".join(lines).strip() + "\n")
+    try:
+        sheets = sheet_scope or wb.sheetnames
+
+        lines: list[str] = []
+        lines.append("[DOC_META]")
+        lines.append(f"source={xlsx_path.name}")
+        lines.append("method=openpyxl")
+        lines.append("")
+        lines.append("[CONTENT]")
+
+        for sname in sheets:
+            if sname not in wb.sheetnames:
+                continue
+            ws = wb[sname]
+            lines.append(f"\n# SHEET: {sname}")
+            max_row = ws.max_row or 1
+            max_col = ws.max_column or 1
+
+            for r in range(1, max_row + 1):
+                row_cells = []
+                for c in range(1, max_col + 1):
+                    v = ws.cell(r, c).value
+                    if v is None:
+                        continue
+                    txt = str(v).strip()
+                    if not txt:
+                        continue
+                    # 座標付きで記録(後で人間が確認しやすい)
+                    row_cells.append(f"R{r}C{c}:{txt}")
+                if row_cells:
+                    lines.append(" | ".join(row_cells))
+
+        write_text(out_txt, "\n".join(lines).strip() + "\n")
+    finally:
+        wb.close()
diff --git a/benchmark/src/bench/pipeline/pdf_text.py b/benchmark/src/bench/pipeline/pdf_text.py
index 73c3786..21d9742 100644
--- a/benchmark/src/bench/pipeline/pdf_text.py
+++ b/benchmark/src/bench/pipeline/pdf_text.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
-from pathlib import Path
 import subprocess
+from pathlib import Path
 
 import fitz  # PyMuPDF
 
@@ -23,18 +23,21 @@ def xlsx_to_pdf(xlsx_path: Path, out_pdf: Path) -> None:
         str(out_pdf.parent),
         str(xlsx_path),
     ]
-    subprocess.run(cmd, check=True)
+    try:
+        subprocess.run(cmd, check=True, timeout=300)
+    except subprocess.TimeoutExpired as exc:
+        raise RuntimeError(f"soffice timed out after 300s: {xlsx_path}") from exc
 
     produced = out_pdf.parent / (xlsx_path.stem + ".pdf")
     produced.replace(out_pdf)
 
 
 def pdf_to_text(pdf_path: Path, out_txt: Path) -> None:
-    doc = fitz.open(pdf_path)
     parts: list[str] = []
-    for i in range(doc.page_count):
-        page = doc.load_page(i)
-        parts.append(f"\n# PAGE {i + 1}")
-        parts.append(page.get_text("text"))
+    with fitz.open(pdf_path) as doc:
+        for i in range(doc.page_count):
+            page = doc.load_page(i)
+            parts.append(f"\n# PAGE {i + 1}")
+            parts.append(page.get_text("text"))
 
     text = "\n".join(parts).strip()
     lines: list[str] = []
diff --git a/benchmark/src/bench/rub/manifest.py b/benchmark/src/bench/rub/manifest.py
index 4708a33..503c20c 100644
--- a/benchmark/src/bench/rub/manifest.py
+++ b/benchmark/src/bench/rub/manifest.py
@@ -10,6 +10,7 @@ class RubTask(BaseModel):
     """RUB task definition."""
 
     id: str
+    track: str = Field("reconstruction", description="Evaluation track name.")
    source_case_id: str = Field(..., description="Case id for Stage A Markdown.")
     type: str
     question: str
diff --git a/docs/README.en.md b/docs/README.en.md
index c94cd26..c7a99cb 100644
--- a/docs/README.en.md
+++ b/docs/README.en.md
@@ -21,7 +21,7 @@ ExStruct reads Excel workbooks and outputs structured data (cells, table candida
 
 ## Benchmark
 
-![Benchmark Chart](../benchmark/outputs/plots/markdown_quality.png)
+![Benchmark Chart](../benchmark/public/plots/markdown_quality.png)
 
 This repository includes benchmark reports focused on RAG/LLM preprocessing of Excel documents.
 We track two perspectives: (1) core extraction accuracy and (2) reconstruction utility for downstream structure queries (RUB).
diff --git a/docs/README.ja.md b/docs/README.ja.md
index f6dec91..f19b011 100644
--- a/docs/README.ja.md
+++ b/docs/README.ja.md
@@ -19,7 +19,7 @@ ExStruct は Excel ワークブックを読み取り、構造化データ(セ
 
 ## ベンチマーク
 
-![Benchmark Chart](../benchmark/outputs/plots/markdown_quality.png)
+![Benchmark Chart](../benchmark/public/plots/markdown_quality.png)
 
 このリポジトリには、ExcelドキュメントのRAG/LLM前処理に焦点を当てたベンチマークレポートが含まれています。
 私たちは2つの視点から追跡しています。(1) コア抽出精度と (2) 下流構造クエリのための再構築ユーティリティ (RUB) です。