From bfdd4359cf7aae9758d2a05249a2c7410449b761 Mon Sep 17 00:00:00 2001 From: harumiWeb Date: Thu, 22 Jan 2026 22:05:08 +0900 Subject: [PATCH 01/12] =?UTF-8?q?=E6=95=B0=E5=BC=8F=E5=8F=96=E5=BE=97?= =?UTF-8?q?=E6=A9=9F=E8=83=BD=E3=81=AE=E4=BB=95=E6=A7=98=E3=82=92=E8=BF=BD?= =?UTF-8?q?=E5=8A=A0=E3=81=97=E3=80=81=E9=96=A2=E9=80=A3=E3=82=BF=E3=82=B9?= =?UTF-8?q?=E3=82=AF=E3=82=92=E6=9B=B4=E6=96=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/agents/FEATURE_SPEC.md | 19 +++++++++++++++++++ docs/agents/TASKS.md | 13 +++++++++---- 2 files changed, 28 insertions(+), 4 deletions(-) diff --git a/docs/agents/FEATURE_SPEC.md b/docs/agents/FEATURE_SPEC.md index 55753aa..ecd2332 100644 --- a/docs/agents/FEATURE_SPEC.md +++ b/docs/agents/FEATURE_SPEC.md @@ -4,6 +4,25 @@ --- +## 数式取得機能追加 + +- 新たに数式文字列をそのまま取得する機能を追加 +- `SheetData`モデルに`formulas_map`を新設予定`formulas_map: dict[str, list[tuple[int,int]]]` +- 数式の値は定義されている数式をそのまま取得する +- セル座標はcolors_mapと同じようにr,cの数値で表記 +- デフォルトはverboseモード以上で出力、もしくはオプションからONにする +- 定義されている数式文字列をシンプルに取得する実装 +- 数式の表記形式は「=A1」のようにユーザーが見るままの数式文字列にする +- 共有数式や配列数式は一旦は展開しない実装にする +- 空文字は除外、=だけのセルも数式文字として取得 +- formulas_mapのキーは「式文字列(先頭=を含む)」で固定する +- 既存の値はSheetData.rowsにあり、数式はSheetData.formulas_mapにあることで共存する +- データ取得時はformulas_map が ON のときだけ data_only=False で再読込 +- オプションは`StructOptions`にて`include_formulas_map: bool = False`で設定を受け付ける +- `.xls`形式かつ数式取得ONの時は処理が遅くなるという警告を出しつつ、COMで取得処理をする。 + +--- + ## 今後のオプション検討メモ - 表検知スコアリングの閾値を CLI/環境変数で調整可能にする diff --git a/docs/agents/TASKS.md b/docs/agents/TASKS.md index 742e3b4..2984de8 100644 --- a/docs/agents/TASKS.md +++ b/docs/agents/TASKS.md @@ -2,7 +2,12 @@ 未完了 [ ], 完了 [x] -- [x] 仕様確認: 画像出力は DPI を維持しつつ、メモリリーク/クラッシュ回避のためサブプロセス化で処理する方針を明記 -- [x] 実装方針: シートごとに PDF を分割 → サブプロセスで PDF ページを PNG へ変換 → 終了時にメモリを解放する設計(親は進捗/結果を集約) -- [x] 実装方針: 子プロセスは `pypdfium2` をロードしてページごとにレンダリングし、書き込み済みパスを親に返す -- [x] 実装方針: 例外時は子プロセスでエラーを返し、親が RenderError として集約して返す +## 数式取得機能追加 + +- [ ] 
`SheetData`に`formulas_map`フィールドを追加し、シリアライズ対象に含める +- [ ] `StructOptions`に`include_formulas_map: bool = False`を追加し、verbose時の既定挙動と整合させる +- [ ] openpyxlで`data_only=False`の読み取りパスを追加し、`formulas_map`用の走査処理を実装する +- [ ] `.xls`かつ数式取得ONの場合はCOM経由で`formulas_map`を取得し、遅延警告を出す +- [ ] `formulas_map`の仕様(=付きの式文字列、空文字除外、=のみ許可、共有/配列は未展開)に沿った抽出ロジックを追加 +- [ ] CLI/ドキュメント/READMEの出力モード説明に`formulas_map`の条件を追記する +- [ ] テスト要件に`formulas_map`関連(ON/OFF、verbose既定、.xls COM分岐)を追加する From 27721f0ccd431f5c97476693e98870756a0071b9 Mon Sep 17 00:00:00 2001 From: harumiWeb Date: Thu, 22 Jan 2026 23:21:20 +0900 Subject: [PATCH 02/12] Add support for extracting formulas in ExStructEngine - Introduced `include_formulas_map` option in `StructOptions` to control formula extraction. - Updated `ExStructEngine` to handle formulas map during extraction. - Enhanced `SheetData` model to include `formulas_map` for storing formula locations. - Modified tests across various modules to validate the new formula extraction feature. - Added sample Excel file with formulas for testing purposes. 
--- docs/README.en.md | 4 +- docs/README.ja.md | 4 +- docs/agents/DATA_MODEL.md | 4 +- docs/agents/FEATURE_SPEC.md | 1 + docs/agents/TASKS.md | 1 + sample/formula/formula.xlsx | Bin 0 -> 10400 bytes src/exstruct/__init__.py | 2 + src/exstruct/core/backends/base.py | 5 +- src/exstruct/core/backends/com_backend.py | 22 +- .../core/backends/openpyxl_backend.py | 16 + src/exstruct/core/cells.py | 150 ++++++++ src/exstruct/core/integrate.py | 3 + src/exstruct/core/modeling.py | 3 + src/exstruct/core/pipeline.py | 124 ++++++- src/exstruct/engine.py | 4 + src/exstruct/models/__init__.py | 7 + src/exstruct/render/__init__.py | 335 +++++++++++++++--- tests/backends/test_auto_page_breaks.py | 1 + tests/com/test_render_smoke.py | 13 +- tests/core/test_cells_utils.py | 14 +- tests/core/test_pipeline.py | 19 + tests/core/test_pipeline_fallbacks.py | 3 + tests/engine/test_engine.py | 1 + tests/integration/test_integrate_raw_data.py | 17 +- tests/models/test_modeling.py | 1 + tests/render/test_render_init.py | 4 +- 26 files changed, 687 insertions(+), 71 deletions(-) create mode 100644 sample/formula/formula.xlsx diff --git a/docs/README.en.md b/docs/README.en.md index 38fa5b0..97e63de 100644 --- a/docs/README.en.md +++ b/docs/README.en.md @@ -11,7 +11,7 @@ ExStruct reads Excel workbooks and outputs structured data (cells, table candida ## Features - **Excel → Structured JSON**: cells, shapes, charts, smartart, table candidates, print areas/views, and auto page-break areas per sheet. -- **Output modes**: `light` (cells + table candidates + print areas; no COM, shapes/charts empty), `standard` (texted shapes + arrows, charts, smartart, merged cell ranges, print areas), `verbose` (all shapes with width/height, charts with size, merged cell ranges, print areas). Verbose also emits cell hyperlinks and `colors_map`. Size output is flag-controlled. 
+- **Output modes**: `light` (cells + table candidates + print areas; no COM, shapes/charts empty), `standard` (texted shapes + arrows, charts, smartart, merged cell ranges, print areas), `verbose` (all shapes with width/height, charts with size, merged cell ranges, print areas). Verbose also emits cell hyperlinks, `colors_map`, and `formulas_map`. Size output is flag-controlled. - **Auto page-break export (COM only)**: capture Excel-computed auto page breaks and write per-area JSON/YAML/TOON when requested (CLI option appears only when COM is available). - **Formats**: JSON (compact by default, `--pretty` available), YAML, TOON (optional dependencies). - **Table detection tuning**: adjust heuristics at runtime via API. @@ -134,7 +134,7 @@ Use higher thresholds to reduce false positives; lower them if true tables are m - **light**: cells + table candidates (no COM needed). - **standard**: texted shapes + arrows, charts (COM if available), merged cell ranges, table candidates. Hyperlinks are off unless `include_cell_links=True`. -- **verbose**: all shapes (with width/height), charts, merged cell ranges, table candidates, cell hyperlinks, and `colors_map`. +- **verbose**: all shapes (with width/height), charts, merged cell ranges, table candidates, cell hyperlinks, `colors_map`, and `formulas_map`. 
## Error Handling / Fallbacks diff --git a/docs/README.ja.md b/docs/README.ja.md index 87f7c24..34137d3 100644 --- a/docs/README.ja.md +++ b/docs/README.ja.md @@ -9,7 +9,7 @@ ExStruct は Excel ワークブックを読み取り、構造化データ(セ ## 主な特徴 - **Excel → 構造化 JSON**: セル、図形、チャート、SmartArt、テーブル候補、セル結合範囲、印刷範囲/自動改ページ範囲(PrintArea/PrintAreaView)をシート単位・範囲単位で出力。 -- **出力モード**: `light`(セル+テーブル候補のみ)、`standard`(テキスト付き図形+矢印、チャート、SmartArt、セル結合範囲)、`verbose`(全図形を幅高さ付きで出力、セルのハイパーリンクも出力)。 +- **出力モード**: `light`(セル+テーブル候補のみ)、`standard`(テキスト付き図形+矢印、チャート、SmartArt、セル結合範囲)、`verbose`(全図形を幅高さ付きで出力、セルのハイパーリンク/`colors_map`/`formulas_map`も出力)。 - **フォーマット**: JSON(デフォルトはコンパクト、`--pretty` で整形)、YAML、TOON(任意依存)。 - **テーブル検出のチューニング**: API でヒューリスティックを動的に変更可能。 - **ハイパーリンク抽出**: `verbose` モード(または `include_cell_links=True` 指定)でセルのリンクを `links` に出力。 @@ -131,7 +131,7 @@ set_table_detection_params( - **light**: セル+テーブル候補のみ(COM 不要)。 - **standard**: テキスト付き図形+矢印、チャート(COM ありで取得)、テーブル候補。セルのハイパーリンクは `include_cell_links=True` を指定したときのみ出力。 -- **verbose**: all shapes, charts, table_candidates, hyperlinks, and `colors_map`. +- **verbose**: all shapes, charts, table_candidates, hyperlinks, `colors_map`, and `formulas_map`. 
## エラーハンドリング / フォールバック diff --git a/docs/agents/DATA_MODEL.md b/docs/agents/DATA_MODEL.md index 0d7d056..af411a8 100644 --- a/docs/agents/DATA_MODEL.md +++ b/docs/agents/DATA_MODEL.md @@ -1,6 +1,6 @@ # ExStruct データモデル仕様 -**Version**: 0.15 +**Version**: 0.16 **Status**: Authoritative 本ドキュメントは ExStruct が返す全モデルの唯一の正準ソースです。 @@ -175,6 +175,7 @@ SheetData { table_candidates: [str] print_areas: [PrintArea] auto_print_areas: [PrintArea] // 自動改ページ矩形 (COM 前提、デフォルト無効) + formulas_map: {[formula: str]: [[int, int]]} // (row=1-based, col=0-based) colors_map: {[colorHex: str]: [[int, int]]} // (row=1-based, col=0-based) merged_cells: MergedCells | null } @@ -251,3 +252,4 @@ WorkbookData { - 0.13: Shape を `Shape` / `Arrow` / `SmartArt` に分割し、`SmartArtNode` のネスト構造を追加 - 0.14: `MergedCell` / `SheetData.merged_cells` を追加 - 0.15: `MergedCells` を schema + items 形式に変更し圧縮形式を導入 +- 0.16: `SheetData.formulas_map` を追加 diff --git a/docs/agents/FEATURE_SPEC.md b/docs/agents/FEATURE_SPEC.md index ecd2332..21a82f9 100644 --- a/docs/agents/FEATURE_SPEC.md +++ b/docs/agents/FEATURE_SPEC.md @@ -20,6 +20,7 @@ - データ取得時はformulas_map が ON のときだけ data_only=False で再読込 - オプションは`StructOptions`にて`include_formulas_map: bool = False`で設定を受け付ける - `.xls`形式かつ数式取得ONの時は処理が遅くなるという警告を出しつつ、COMで取得処理をする。 +- cell.value が ArrayFormula の場合に value.text(実際の式文字列)を使う --- diff --git a/docs/agents/TASKS.md b/docs/agents/TASKS.md index 2984de8..afa5954 100644 --- a/docs/agents/TASKS.md +++ b/docs/agents/TASKS.md @@ -9,5 +9,6 @@ - [ ] openpyxlで`data_only=False`の読み取りパスを追加し、`formulas_map`用の走査処理を実装する - [ ] `.xls`かつ数式取得ONの場合はCOM経由で`formulas_map`を取得し、遅延警告を出す - [ ] `formulas_map`の仕様(=付きの式文字列、空文字除外、=のみ許可、共有/配列は未展開)に沿った抽出ロジックを追加 +- [ ] openpyxlの配列数式(`ArrayFormula`)は`value.text`から式文字列を取得する分岐を追加 - [ ] CLI/ドキュメント/READMEの出力モード説明に`formulas_map`の条件を追記する - [ ] テスト要件に`formulas_map`関連(ON/OFF、verbose既定、.xls COM分岐)を追加する diff --git a/sample/formula/formula.xlsx b/sample/formula/formula.xlsx new file mode 100644 index 
0000000000000000000000000000000000000000..eb2ebab69aaa88a6a7f101617cdfcd5cfcaf50d6 GIT binary patch literal 10400 zcmeHNWm_EE(jDAoa7%CkBxryD!6CRi1cDAWxVu}hA%g}F?(QxLt^tCD;O_45lAL?r zb573v2lw`e?q{CvRnxn7t*W)FwxSFiJRSfMfD8ZtC;@3NEN#Fr001@u0DucXhJ7n) zW9DZo9(Cm{i$z^hr&dsfu7>VQ;i|phmk={JC)jQKw=^9ls!A}gwTk1q;BC7M| z>=hax3~bz{frx_CI=Rtk=in-5POmL0zk9StR5HaEM~w;EMpXuj>o4(X&KBsHiDgff zFTd-p(S&Jx61iXR$mJjDX3072c#Ir3otVhKw|U}6<0~9LgymPF(HD3jE(3fYYg2N) z1|@%j+Aqtd35L81>~PH!Z91vzU^|GD#;x|Zq!jT4Vi+8zE~PUU3#Pm!+<3!Fud8N` z(%&a1icdd^gpCUJolc}Gc|%=&w=S@f?*AbV#K$V>(Mt2Z3%KP2r!X=AgvvrFyIWw$ z8h53tM;ZuVCf=fR5C_2BPV6s5clNr;F|peMQV2d33Lb2RMM@-#A*M#!7wqct`e;f@qH%)w2g# zIljk2L zq}}M9TOf=@_4%J<`WC3g$B)D+ut%6RNKkNd$bt#@Kl*8R%W3^Eyex&86j9kL2`;aD zm9ZT&lI$}TpSy)27|J8N`}HG!uY;lKOsQwDCFR)_iJFQTk9oOanmsR-o1U>{`-xa` z8}6MalYC0AG8G5TCHsg(4{gSok9IZ3(Lk!(=T|6QMZJf8{^8{LKOWQ7KS?q@qsHk3 z2LRYS<~6LxFXL*-;$mYDHngz;|4d@VYPvRAyx3lCU+=I&T~HBE3UZ5Ru;-=s3N5Pk zWU~7a@-z_!Idi4d_Iws9P|TYe8vC7JQE{E4hrE1gI=Gc!rAs`b>KO3~7dMqVS2#I^ zu-GjnaUmYr_`&p-GesyS2}$Bw)e&Wp>erE<$kV;J!TejxklU{ zUlCt`nJuUc7@qEx;>Y6YN);48pL0rSFGT?l!cYQx^bgOH);QMO^v1<8Dg4#NVDCze ztzKnLUa{G70u{W#Yt7a$%YJIIt!9ej`6sKl6-)kns@UfQSKaFE`0eC8to19&?T&|q zNo)Qto=ggW*{71^pCM$VpRgFFpRO-*(^#Ti52|8Z6pI~7sdqem?g}S5Y?DP99sf-l zw(o`}scmCD$utkFJ~k)Zh-f}nmaQIFD4RuM8tqQf+I%2-P zdgR=`6-QinD!XhICy=jG)D&KL_qrc9kgbFUALE1LFewV}_b+EnN#oNe(16ZJ@U_hd zd{8&6mvAvFl#dc?9{)qmSh$7ZaEzL8x`qZz0 zr*YdTRG~$iR>(dX+}|HkQ1uz1N)Z~$M7BO@?}UV{dtb&Pi7Dy=p0&JiiPIRx#TEUCC7WRUT01)C zwAeN4XnWT?KFNnfe9i^GoOi{Vc;3;5L`lZTqbJGVh5~-UsocCd0vs`6pH<>g98H@) z*o6r%uzM#uV}uf`tM^*uCA|giiBtTSUbL*~PD*>G2(O+nYQd^Qj#T-oUWYD@(b!#= zlv6v@@28d=jR@8g!0^TI>Q()fHY8L^2iO}3U)9LS2c+TxRwuIufYd#aaY*z1;qu$W z6WzW~JBK+O4J~;-ZoK6V(|z8V1|G88Ht|l(nUN)w^umM++RBogq#o3Ezxn}h z=Gt7^7;CSPSFH?}li9iHX6UTEqG&v1U8Yne?qYuYVZ#+}Oyg8?9E9BQV0jmw_Sb&~`H!%T!tfIE3kqDUBu#iw`S# z`bh$z`Gx_oUEghT&}S7xDDC)XxC>GaqK&q|Gg$(La`rK~SYJmtDFnyyLX`akW9ulw zK0$^bz)gh~9Q?2E#y+f}5wIVQb|Z-SwbH2{zJ2xqxC*)%N;)SS5sA6c>^~ZcC#vKm 
zZcPzH6#b0bQ-Ht9trggyhbwMWl#b5j?9WpZ&um|3L67dvxG>!_I8w!C^U%>-HN7GD zIQ9RG@|N30X+U@Y01piSAbNb_Us2w{3OUIc)K^?R#N@NI)Dp9TJ2hnu;a@H7o2 zkjQ}4oD1RVtSYJv+KXgOQd;cipVA)Fv$mlS9^mwL0VWU`rJFDt?b8UuSti`j-REk!Um32+1`16^toE)_89!v7<4I=n0Q!;Z3o5&EF4Vx z9p?MBO#%e!7onOmv|Z+bP|5IHkWujpILL`ytNV?kDA%nNhF7_%b<@5bT2}kWSH)tk9XEMOuEk99o`p{ppT`WX($MXTvPeSsm{1!n*=-Xy zs^=XXUMPURQE0Bt0~=veZOO}1sYpV? z$3+Q_X`$fI0&79Yy3x1tQcGNPb!ZEvStum)kcKq>;a7C{HNCNMatZ-L*uf8|9|H=4 z@#2rw;Gvaf%N^l0rlRzkxGEUYv-ch@nyt8sipQ+(oe;G+v zh8GfI@QE?%yLWr8IQ&^w*~aI675h+>8sPeCHN~mQgtelMje#Gug+HjVELB8LhV%`G zI<}H<4{DZ*8G0f0%k}6_+<^1lUUeK|jbaN6jf8shCUOTVFGYai`FCslus0Mcu*;~J zVy8eQ4*-rFxA8EHNCmR7l#tFCfS5Yt900#vhl;7?;HyXm@NlNBqNOM;7X~+m9~06Q z8ALTfGbNcb@U)%+!h`1Kcc|E#dp$FPC5Alo26o(MMO_f|T`*b0;A|1wqu}|}>y`BKI9?SQILOxxP6BlotRz4fTvFrURcXS<@ zAdLK>{xP-C#<(GMyormt`508T>S3GG4e?VgQKU)B?ecU1e)B@Hvkzu4>!t&BP47`r zzJ)x2Qq9#p5p#s#kiqL&?skj5v)0?&zZKSE=KHc0)BJusq~IcAP1WzO(A=0RKf)mJ@AgK|R1d z8d)w5=lW^GD9krCDC*j++^r5yX8lho_|Gt6Pot|3Mgjm3DSuWyeuWW7GmsUC<=6RF z1liS^4<~B?`e3d4VBZO(mF-_?@(HYudz_bAH5-L^ObnaluBp`_N>%MFv68*xu1^qJ z#5smW*LiS@VA@Ah!yos&n5Ib(q@r~isjkjy(7pM#WWUS2D-ZToV^oXtE@Ym++XQuv zG612+)wrq$s<2{TQPRIU{y<5Kl*m!{&FIPVu7B;DCzXh>%TZe~omGl02&%T}E+^Wbou`JZa8pYXv#H`r3y1!GYsg`42}G3w#Bb#!I$0B2yLO*5y@bD%tnYUYpai-5r=d8?#$v8xewY*z32)Qb2o`&M#5t-sXftw zXest6U2)nV)q*a)JJKWtx<*%szKr%9vU*<$r4=a@SD3I6mk&_lhv0TH??PDC*5Z0s z7k+XQKIPN5!Bt4s;FU6L)o{>uJ>+wLNPK;v_i%cb&^DCY8j-+tzn9+TdUN4(AMLJb zw-olO?Ph(iZlU?=uI%CLEaB;%t{=qeGLcR!Q(_gJe+}iusF5HK!2umx6q^%`EqWt= zVF;tMFGM}@%9ek*@vs5VeFSsBqQ_nP^owCZzUg|1-7)}+$CRJB8Phd?qmqyBU9Q8! zs-gKxnxW@*auY=5S$o$StXaaf>bcQ!;#J_1>JpFCdy(tc-Xfun-R5>THB z#2KZq&MhHnei5LCs2dut1S<1WlPFb#t8?xxq4ojp3Gh9*XYoW8b0MwQmjE3g&%AxtJ6?MDLyCDv-p! 
z8&Y;PbW68&5iFfX$0Z+de6w5*)=;!@K8<&HR)d9XaEY#fB0M5+t~t7b*0T{bhGnQ9 z=d~vHPMr@C zy)^ZwGUz5kMJJ#455>I~5lb%;x7>3`}2;{b;M z(K~`LI5{iz-{-)Z67D~g7ZWdnt^?bpDu=Y{)P}6oaBp{ul5{<;YU3M9K-OU5g}EuD z0#Mj70%LOKiqW-TU(KxG%l_#P+HCd|a!8V$b3=~s0l9LZ>-gkvi&Vi7x0Sw*Ap|aV zN!zpMh09q)EE#gDoRLEfgK8tR+O{b3)GsY(%T}S*;{n=qA)ZN2LPcuxdxPH@Dj2(; zkC~A1sxykdN_wMVll9|NY9kl7iA!jpH5~ZOxD@CF0S=u}h^>Z1us&}BKA(O1TwD8& zS9OQpS_dZADigMB0r}nRYhgT-RFK)AaDl+2U4PQx&gS;wce894o|qY8z}h)4T$p5C zP08zoJdK2$Sx%}wB7>Y{a9~_8TQlCqjCtL%4g0q}s_ZDYS^S-O#l$g7uTV{}NW*80 z7d0RKb2f`{Ikth{Kg%9w;tpnbpp(ZStVT`P=qC;>6`Ev<)h;cBXc34M!o=IH+lulM zlhUXH{fOjv{odWvk!o6Sf1Xw^fuX!Ljs(M%+)hLUWHbrWRS|!rf4%!H^xW|r89m)D z=;AH){k=X@Ujp`imaBVk)QmFUmuN$!s=_DSIXKL~!jG@=@T_~e#MD<8asn9gbZ^4? zR~5OUW~jQa-W$faA-;F10#l5Hyyf`Voj5KZNKwU1&?TDYo&a|EZS5V zSI8ybkFG0b#8m1jjA$IXsd-Uf+Op0osd~U*_ocui60U>2Byo^U?rpPNS5no9(o{9+ z3*;9H9L|?k4&tu-N3G8Udv%Aky@ZOvvN4zIN%O5&>pd*Ke2BIQ^~Takik-T(?TV+M zS24s^?F4QG$0R_3PdOQ>i{sK07&kTfdWF@Q1`Nd^amvrPzsbzg7ixqk)v#|)zU%17 zT+Su|Y1~rR59=4*W-dh7=$=2@Ax(abn-<&9FvvpMH3aghah?P{XfCWGg~Gyk&E`kW zo;jB(Sxhx$DRXbuNbh-PX^?PS4|SN0LbUhBiU6r3w$mjSUESqs7OFRQ%G3FqIn}c{ zCYnV~T_$y$=uKtEP{ihFC8>Qv+nS1_A3nc?;P2JW32sDm!$*lvhXVj${E?d-9NjEI z4nKAMgrl z4b!l>AiD=6d8w^dYd96~O2f9sWp>tr#6EYE^aPX!;@)siw3O ztUyPHGbwg{CU_YH?|co54&6KJP?fAQO29(`8g3W1D?#BP z&+S8dGSz0{L5Vqm#|^88jW6KAC(RSnfPaD?^I;w#qaUFwA>yGocN<@tnm#Y zlXYcg*cH#AKZusnagm3=OEEa|^W@D%@<7|Gga^wX%`-apIiFQxfsI^cUM=2!VwgIt zjCZX`B`ht4;7)m2yfs zJR$e{-FF!mmWiJ_=Mj&6D3ApyVKLx61y`v2>@{emke!nYjzX7JVUgfJvP_h%%3KPr zD2qrDL2&%nQz#3$bxO=i1KCmV*(%py8D}E{ziutk-{oB2b&>=jR_GB#s|#JAkp>7< z@OSGz;C^K^(W_UH>#n7F544uRp`L@%h+i^8V|PPz!q{?5!_EY==K_!YZr)f_sBsOg zzRX9p%~Ygqa}Tsv6i8ZS3phenVmBo=85Jpyrcp0p2&UxH&62dq-xF~wJas>GZF+~s zl;Bw6o{nfJPN5B}ZPPE42p*Y}ea5aL8v(UHlr{a_~7gl() z)|xX`^^P`W-_s|s#2tM?s`XJ6*Oe+)N#Z8^>SW_2-hkTGHr^i6uFhfhhOE8~zil-V z#O)hVbng6dk69D2)1=1LWmw2kx83PjC0TobGfGI3da6x6&jpm3m!Z1ZhD?z!cwb)b zup|#GFrK%6tHS?etE|x?tY_#Pe|L2=8RCM`$VpIt-P@`QqX^BkUcW9;oid+auUn*m 
zqR5Yh^`Gnw^p>W#eL>LzMaSe!pxJlir~_xtP8^&@&pC{BdN+8;qL!w#QE+AiQ{`KU zQ&Qyb z+MsG;V?C^S0662`v>!Amh>q;ZHkD4HxaTiyT{Ie)lDOYXu_UGhkC*%L^Bry<@TQpx zP|+VQDMHazItw+`fEL3d(fY0-KzV$Wy!-Z-_k{mxp&(M}5=Z^mcF90|tU3HE7?~N^ zgN#)i?ai%Cf5oKgxCMoAMZBH`xm)t28)g?wn(k^^X~ShwtZSr+l2N*yz1^0V*IWm3 z(Gm1wT9S&C5$}!ILC8Quf1g3!sUtntYolP=ENz^$q}sfNla>w7;R4>1P6mb%Imj&Y z*y!GSGG$q1+i)(e(b1S5ESvc^s31fJ6551B5_fZDj;w{|Ps;sxm+^`iS`l#v2C``J zTSShXOv&&+2Ak51SnJOSSeZO>m9pc_+4BNV8oqNlqbF13EUF_1G^se88mB1m9TaTAw0h93RHZ62N;%|hIADxP5WbbK>$kWuK= zWF^9nkGhE`v2F?F5^pfpfZx=}Oe8Bk=QdNVATl2yDxq&iV{gG+Gv0TLw`bB}F9ahk z>x>?V`6c2S(sKor6N>T$>($R@c9a)*c2g&ACv$|s<~m^o@T7oTCO6Sz(LPA0_@TA4 zJ#SOT>^E9Zy2C@SzBZYAu&QUTy?5Z+^LElh6#&zdcSPZ#HI~@u=i3iNXhij9$CdKI z3wwxQq)P3*vIC6xsqw*AIn%M3?j2X<_Az#A`!sue@tJgP&*y+0Zw|8I#BWV*beiR; zk6p5T>=1Me;bC9ZzS_Ae{?2{$NY!=hf^6D zSQ?3%8JJuDOmaF9;h)RqHy8dc$W?XbXkpUh%&2KXSlD03;Gt4d{+oLkYE+FWuey>S znwFMcG#Q0<$N1(J`cX0n5;vmMrzj-zmnYK5eRUUsQHWqY-BHWSR?CVH*5p#e%TP&z zq)6n9^=V!aGuV_0aDO}|vgR9xS;ONrw2!ZIYFS}M%Q(P8gYC{mFoC!9n;}DRT^b7U zk&0^2A+WzhnmKmvBiHEQIXd=Fpt^f3!u@k?-O73uVf#2Ti$~f3AIrSPHb#o}Hnt8d zMmF}KpTm3XS^lrO`#9~MPz9MTR-*0$|AkMhKXQC(*+i!N`sFd~JYPbp*0$Q`Ky<5dkGgt)XX-_G8tj3U84B(#kj3z8H8#-EMJ?nLMpPov`$YIjpc za7%$iYIyn?hUuF!RNB*49PMC}>YaP0V3#8}*>$y+M@Pak$?VNr)2oi?* zoxM+T(kLtC`UYu*5|zKl!79(_bxXw@op-XM`^J}>wi#Yyo>!;H#M-e{hyaUy63-+6lle-O8Nz? 
z7m}ZcEDwf%dj=7nd^kFTMZ2?eFt3&v1C)Zrf1d5#H5YN!snsnNk5njiU25a}ywB~` zk?XSi+;nkJVuTAqvb=q~-*_vBwMzNf_Rb9c01l%D z%(t9+681HkgxZxoF*`^~qTeyJ5kK)d#y8QcB`IDsQo^3cIW^*_k7-z3QGHQU$@L)@WQ_yVG_gYldPjfpb!;94Tvd78!!=+(hnI477f8NdY?-Olgbp+7B^q5uC&{G(U&yOrOI!+%;if6Ug8L;1Z_{JVkQSFnE?fIjxD z9u54tp8XyAyWad0O8(>z=vSE`tQm6Pb>hSOaTD=BbEOS k|9epVEBuP;FYtebSVb9xN8k9lrwI+v`luELX@CCuKjN)7uK)l5 literal 0 HcmV?d00001 diff --git a/src/exstruct/__init__.py b/src/exstruct/__init__.py index d0110d6..dee7580 100644 --- a/src/exstruct/__init__.py +++ b/src/exstruct/__init__.py @@ -115,11 +115,13 @@ def extract(file_path: str | Path, mode: ExtractionMode = "standard") -> Workboo """ include_links = True if mode == "verbose" else False include_colors_map = True if mode == "verbose" else None + include_formulas_map = True if mode == "verbose" else None engine = ExStructEngine( options=StructOptions( mode=mode, include_cell_links=include_links, include_colors_map=include_colors_map, + include_formulas_map=include_formulas_map, ) ) return engine.extract(file_path, mode=mode) diff --git a/src/exstruct/core/backends/base.py b/src/exstruct/core/backends/base.py index f678e54..0cf283c 100644 --- a/src/exstruct/core/backends/base.py +++ b/src/exstruct/core/backends/base.py @@ -4,7 +4,7 @@ from typing import Protocol from ...models import CellRow, PrintArea -from ..cells import MergedCellRange, WorkbookColorsMap +from ..cells import MergedCellRange, WorkbookColorsMap, WorkbookFormulasMap CellData = dict[str, list[CellRow]] PrintAreaData = dict[str, list[PrintArea]] @@ -40,3 +40,6 @@ def extract_colors_map( def extract_merged_cells(self) -> MergedCellData: """Extract merged cell ranges from the workbook.""" + + def extract_formulas_map(self) -> WorkbookFormulasMap | None: + """Extract formulas map from the workbook.""" diff --git a/src/exstruct/core/backends/com_backend.py b/src/exstruct/core/backends/com_backend.py index c6bdaf3..81ec58c 100644 --- a/src/exstruct/core/backends/com_backend.py +++ 
b/src/exstruct/core/backends/com_backend.py @@ -9,7 +9,12 @@ import xlwings as xw from ...models import PrintArea -from ..cells import WorkbookColorsMap, extract_sheet_colors_map_com +from ..cells import ( + WorkbookColorsMap, + WorkbookFormulasMap, + extract_sheet_colors_map_com, + extract_sheet_formulas_map_com, +) from ..ranges import parse_range_zero_based from .base import MergedCellData, PrintAreaData @@ -80,6 +85,21 @@ def extract_colors_map( ) return None + def extract_formulas_map(self) -> WorkbookFormulasMap | None: + """Extract formulas_map via COM; logs and skips on failure. + + Returns: + WorkbookFormulasMap or None when extraction fails. + """ + try: + return extract_sheet_formulas_map_com(self.workbook) + except Exception as exc: + logger.warning( + "COM formula map extraction failed; skipping formulas_map. (%r)", + exc, + ) + return None + def extract_auto_page_breaks(self) -> PrintAreaData: """Compute auto page-break rectangles per sheet using Excel COM. diff --git a/src/exstruct/core/backends/openpyxl_backend.py b/src/exstruct/core/backends/openpyxl_backend.py index f7593d3..d67ae59 100644 --- a/src/exstruct/core/backends/openpyxl_backend.py +++ b/src/exstruct/core/backends/openpyxl_backend.py @@ -9,10 +9,12 @@ from ...models import PrintArea from ..cells import ( WorkbookColorsMap, + WorkbookFormulasMap, detect_tables_openpyxl, extract_sheet_cells, extract_sheet_cells_with_links, extract_sheet_colors_map, + extract_sheet_formulas_map, extract_sheet_merged_cells, ) from ..ranges import parse_range_zero_based @@ -99,6 +101,20 @@ def extract_merged_cells(self) -> MergedCellData: except Exception: return {} + def extract_formulas_map(self) -> WorkbookFormulasMap | None: + """Extract formulas_map using openpyxl. + + Returns: + WorkbookFormulasMap or None when extraction fails. + """ + try: + return extract_sheet_formulas_map(self.file_path) + except Exception as exc: + logger.warning( + "Formula map extraction failed; skipping formulas_map. 
(%r)", exc + ) + return None + def detect_tables(self, sheet_name: str) -> list[str]: """Detect table candidates for a single sheet. diff --git a/src/exstruct/core/cells.py b/src/exstruct/core/cells.py index de3b352..1024888 100644 --- a/src/exstruct/core/cells.py +++ b/src/exstruct/core/cells.py @@ -67,6 +67,32 @@ def get_sheet(self, sheet_name: str) -> SheetColorsMap | None: return self.sheets.get(sheet_name) +@dataclass(frozen=True) +class SheetFormulasMap: + """Formula map for a single worksheet.""" + + sheet_name: str + formulas_map: dict[str, list[tuple[int, int]]] + + +@dataclass(frozen=True) +class WorkbookFormulasMap: + """Formula maps for all worksheets in a workbook.""" + + sheets: dict[str, SheetFormulasMap] + + def get_sheet(self, sheet_name: str) -> SheetFormulasMap | None: + """Return the formulas map for a sheet if available. + + Args: + sheet_name: Target worksheet name. + + Returns: + SheetFormulasMap for the sheet, or None if missing. + """ + return self.sheets.get(sheet_name) + + @dataclass(frozen=True) class MergedCellRange: """Merged cell range with normalized value.""" @@ -102,6 +128,61 @@ def extract_sheet_colors_map( return WorkbookColorsMap(sheets=sheets) +def extract_sheet_formulas_map(file_path: Path) -> WorkbookFormulasMap: + """Extract formula strings for each worksheet. + + Args: + file_path: Excel workbook path. + + Returns: + WorkbookFormulasMap containing per-sheet formula maps. + """ + sheets: dict[str, SheetFormulasMap] = {} + with openpyxl_workbook(file_path, data_only=False, read_only=False) as wb: + for ws in wb.worksheets: + sheet_map = _extract_sheet_formulas(ws) + sheets[ws.title] = sheet_map + return WorkbookFormulasMap(sheets=sheets) + + +def extract_sheet_formulas_map_com(workbook: xw.Book) -> WorkbookFormulasMap: + """Extract formula strings for each worksheet via COM. + + Args: + workbook: xlwings workbook instance. + + Returns: + WorkbookFormulasMap containing per-sheet formula maps. 
+ """ + sheets: dict[str, SheetFormulasMap] = {} + for sheet in workbook.sheets: + formulas_map: dict[str, list[tuple[int, int]]] = {} + used = sheet.used_range + start_row = int(getattr(used, "row", 1)) + start_col = int(getattr(used, "column", 1)) + max_row = used.last_cell.row + max_col = used.last_cell.column + if max_row <= 0 or max_col <= 0: + sheets[sheet.name] = SheetFormulasMap( + sheet_name=sheet.name, formulas_map=formulas_map + ) + continue + rng = sheet.range((start_row, start_col), (max_row, max_col)) + matrix = _normalize_matrix(rng.formula) + for r_offset, row in enumerate(matrix): + for c_offset, value in enumerate(row): + normalized = _normalize_formula_from_com(value) + if normalized is None: + continue + row_index = start_row + r_offset + col_index = start_col + c_offset - 1 + formulas_map.setdefault(normalized, []).append((row_index, col_index)) + sheets[sheet.name] = SheetFormulasMap( + sheet_name=sheet.name, formulas_map=formulas_map + ) + return WorkbookFormulasMap(sheets=sheets) + + def extract_sheet_colors_map_com( workbook: xw.Book, *, @@ -165,6 +246,75 @@ def _extract_sheet_colors( return SheetColorsMap(sheet_name=ws.title, colors_map=colors_map) +def _extract_sheet_formulas(ws: Worksheet) -> SheetFormulasMap: + """Extract formula strings for a single worksheet. + + Args: + ws: Target worksheet. + + Returns: + SheetFormulasMap for the worksheet. 
+ """ + min_row, min_col, max_row, max_col = _get_used_range_bounds(ws) + formulas_map: dict[str, list[tuple[int, int]]] = {} + if min_row > max_row or min_col > max_col: + return SheetFormulasMap(sheet_name=ws.title, formulas_map=formulas_map) + + for row in ws.iter_rows( + min_row=min_row, max_row=max_row, min_col=min_col, max_col=max_col + ): + for cell in row: + if getattr(cell, "data_type", None) != "f": + continue + normalized = _normalize_formula_value(getattr(cell, "value", None)) + if normalized is None: + continue + formulas_map.setdefault(normalized, []).append((cell.row, cell.col_idx - 1)) + return SheetFormulasMap(sheet_name=ws.title, formulas_map=formulas_map) + + +def _normalize_formula_value(value: object) -> str | None: + """Normalize a formula string for openpyxl cells. + + Args: + value: Raw cell value. + + Returns: + Formula string with leading "=", or None when empty. + """ + if value is None: + return None + array_text = getattr(value, "text", None) + if array_text is not None: + text = str(array_text) + else: + text = str(value) + if text == "": + return None + if not text.startswith("="): + return f"={text}" + return text + + +def _normalize_formula_from_com(value: object) -> str | None: + """Normalize a formula string returned by COM. + + Args: + value: Raw COM formula value. + + Returns: + Formula string with leading "=", or None when not a formula. 
+ """ + if value is None or not isinstance(value, str): + return None + text = value + if text == "": + return None + if not text.startswith("="): + return None + return text + + def _extract_sheet_colors_com( sheet: xw.Sheet, include_default_background: bool, ignore_colors: set[str] | None ) -> SheetColorsMap: diff --git a/src/exstruct/core/integrate.py b/src/exstruct/core/integrate.py index 03017ce..ccd8131 100644 --- a/src/exstruct/core/integrate.py +++ b/src/exstruct/core/integrate.py @@ -17,6 +17,7 @@ def extract_workbook( # noqa: C901 include_colors_map: bool | None = None, include_default_background: bool = False, ignore_colors: set[str] | None = None, + include_formulas_map: bool | None = None, include_merged_cells: bool | None = None, include_merged_values_in_rows: bool = True, ) -> WorkbookData: @@ -33,6 +34,7 @@ def extract_workbook( # noqa: C901 include_colors_map: Whether to include colors map; None uses mode defaults. include_default_background: Whether to include default background color. ignore_colors: Optional set of color keys to ignore. + include_formulas_map: Whether to include formulas map; None uses mode defaults. include_merged_cells: Whether to include merged cell ranges; None uses mode defaults. include_merged_values_in_rows: Whether to keep merged values in rows. @@ -51,6 +53,7 @@ def extract_workbook( # noqa: C901 include_colors_map=include_colors_map, include_default_background=include_default_background, ignore_colors=ignore_colors, + include_formulas_map=include_formulas_map, include_merged_cells=include_merged_cells, include_merged_values_in_rows=include_merged_values_in_rows, ) diff --git a/src/exstruct/core/modeling.py b/src/exstruct/core/modeling.py index f92f32a..7b801f9 100644 --- a/src/exstruct/core/modeling.py +++ b/src/exstruct/core/modeling.py @@ -27,6 +27,7 @@ class SheetRawData: table_candidates: Detected table ranges. print_areas: Extracted print areas. auto_print_areas: Extracted auto page-break areas. 
+ formulas_map: Mapping of formula strings to (row, column) positions. colors_map: Mapping of color keys to (row, column) positions. merged_cells: Extracted merged cell ranges. """ @@ -37,6 +38,7 @@ class SheetRawData: table_candidates: list[str] print_areas: list[PrintArea] auto_print_areas: list[PrintArea] + formulas_map: dict[str, list[tuple[int, int]]] colors_map: dict[str, list[tuple[int, int]]] merged_cells: list[MergedCellRange] @@ -70,6 +72,7 @@ def build_sheet_data(raw: SheetRawData) -> SheetData: table_candidates=raw.table_candidates, print_areas=raw.print_areas, auto_print_areas=raw.auto_print_areas, + formulas_map=raw.formulas_map, colors_map=raw.colors_map, merged_cells=_build_merged_cells(raw.merged_cells), ) diff --git a/src/exstruct/core/pipeline.py b/src/exstruct/core/pipeline.py index 363596a..98aa053 100644 --- a/src/exstruct/core/pipeline.py +++ b/src/exstruct/core/pipeline.py @@ -21,7 +21,13 @@ ) from .backends.com_backend import ComBackend from .backends.openpyxl_backend import OpenpyxlBackend -from .cells import MergedCellRange, WorkbookColorsMap, detect_tables +from .cells import ( + MergedCellRange, + WorkbookColorsMap, + WorkbookFormulasMap, + detect_tables, + warn_once, +) from .charts import get_charts from .logging_utils import log_fallback from .modeling import SheetRawData, WorkbookRawData, build_workbook_data @@ -51,6 +57,8 @@ class ExtractionInputs: include_colors_map: Whether to include background colors map. include_default_background: Whether to include default background color. ignore_colors: Optional set of color keys to ignore. + include_formulas_map: Whether to include formulas map. + use_com_for_formulas: Whether to use COM for formulas extraction. include_merged_cells: Whether to include merged cell ranges. include_merged_values_in_rows: Whether to keep merged values in rows. 
""" @@ -63,6 +71,8 @@ class ExtractionInputs: include_colors_map: bool include_default_background: bool ignore_colors: set[str] | None + include_formulas_map: bool + use_com_for_formulas: bool include_merged_cells: bool include_merged_values_in_rows: bool @@ -75,6 +85,7 @@ class ExtractionArtifacts: cell_data: Extracted cell rows per sheet. print_area_data: Extracted print areas per sheet. auto_page_break_data: Extracted auto page-break areas per sheet. + formulas_map_data: Extracted formulas map for workbook sheets. colors_map_data: Extracted colors map for workbook sheets. shape_data: Extracted shapes per sheet. chart_data: Extracted charts per sheet. @@ -84,6 +95,7 @@ class ExtractionArtifacts: cell_data: CellData = field(default_factory=dict) print_area_data: PrintAreaData = field(default_factory=dict) auto_page_break_data: PrintAreaData = field(default_factory=dict) + formulas_map_data: WorkbookFormulasMap | None = None colors_map_data: WorkbookColorsMap | None = None shape_data: ShapeData = field(default_factory=dict) chart_data: ChartData = field(default_factory=dict) @@ -179,6 +191,7 @@ def resolve_extraction_inputs( include_colors_map: bool | None, include_default_background: bool, ignore_colors: set[str] | None, + include_formulas_map: bool | None, include_merged_cells: bool | None, include_merged_values_in_rows: bool, ) -> ExtractionInputs: @@ -193,6 +206,7 @@ def resolve_extraction_inputs( include_colors_map: Whether to include background colors; None uses mode defaults. include_default_background: Include default background colors when colors_map is enabled. ignore_colors: Optional set of colors to ignore when colors_map is enabled. + include_formulas_map: Whether to include formulas map; None uses mode defaults. include_merged_cells: Whether to include merged cell ranges; None uses mode defaults. include_merged_values_in_rows: Whether to keep merged values in rows. 
@@ -222,6 +236,19 @@ def resolve_extraction_inputs( resolved_ignore_colors = ignore_colors if resolved_colors_map else None if resolved_colors_map and resolved_ignore_colors is None: resolved_ignore_colors = set() + resolved_formulas_map = ( + include_formulas_map if include_formulas_map is not None else mode == "verbose" + ) + file_suffix = normalized_file_path.suffix.lower() + use_com_for_formulas = resolved_formulas_map and file_suffix == ".xls" + if use_com_for_formulas: + warn_once( + f"xls-formulas-fallback::{normalized_file_path}", + ( + f"File '{normalized_file_path.name}' is .xls (BIFF); openpyxl cannot " + "read formulas. Falling back to COM-based extraction (slower)." + ), + ) resolved_merged_cells = ( include_merged_cells if include_merged_cells is not None else mode != "light" ) @@ -237,6 +264,8 @@ def resolve_extraction_inputs( include_colors_map=resolved_colors_map, include_default_background=resolved_default_background, ignore_colors=resolved_ignore_colors, + include_formulas_map=resolved_formulas_map, + use_com_for_formulas=use_com_for_formulas, include_merged_cells=resolved_merged_cells, include_merged_values_in_rows=include_merged_values_in_rows, ) @@ -254,7 +283,7 @@ def build_pipeline_plan(inputs: ExtractionInputs) -> PipelinePlan: return PipelinePlan( pre_com_steps=build_pre_com_pipeline(inputs), com_steps=build_com_pipeline(inputs), - use_com=inputs.mode != "light", + use_com=inputs.mode != "light" or inputs.use_com_for_formulas, ) @@ -279,6 +308,12 @@ def build_pre_com_pipeline(inputs: ExtractionInputs) -> list[ExtractionStep]: step=step_extract_print_areas_openpyxl, enabled=lambda _inputs: _inputs.include_print_areas, ), + StepConfig( + name="formulas_map_openpyxl", + step=step_extract_formulas_map_openpyxl, + enabled=lambda _inputs: _inputs.include_formulas_map + and not _inputs.use_com_for_formulas, + ), StepConfig( name="colors_map_openpyxl", step=step_extract_colors_map_openpyxl, @@ -301,6 +336,12 @@ def build_pre_com_pipeline(inputs: 
ExtractionInputs) -> list[ExtractionStep]: step=step_extract_print_areas_openpyxl, enabled=lambda _inputs: _inputs.include_print_areas, ), + StepConfig( + name="formulas_map_openpyxl", + step=step_extract_formulas_map_openpyxl, + enabled=lambda _inputs: _inputs.include_formulas_map + and not _inputs.use_com_for_formulas, + ), StepConfig( name="colors_map_openpyxl_if_skip_com", step=step_extract_colors_map_openpyxl, @@ -324,6 +365,12 @@ def build_pre_com_pipeline(inputs: ExtractionInputs) -> list[ExtractionStep]: step=step_extract_print_areas_openpyxl, enabled=lambda _inputs: _inputs.include_print_areas, ), + StepConfig( + name="formulas_map_openpyxl", + step=step_extract_formulas_map_openpyxl, + enabled=lambda _inputs: _inputs.include_formulas_map + and not _inputs.use_com_for_formulas, + ), StepConfig( name="colors_map_openpyxl_if_skip_com", step=step_extract_colors_map_openpyxl, @@ -353,18 +400,18 @@ def build_com_pipeline(inputs: ExtractionInputs) -> list[ComExtractionStep]: Returns: Ordered list of COM extraction steps. 
""" - if inputs.mode == "light": + if inputs.mode == "light" and not inputs.use_com_for_formulas: return [] step_table: Sequence[ComStepConfig] = ( ComStepConfig( name="shapes_com", step=step_extract_shapes_com, - enabled=lambda _inputs: True, + enabled=lambda _inputs: _inputs.mode != "light", ), ComStepConfig( name="charts_com", step=step_extract_charts_com, - enabled=lambda _inputs: True, + enabled=lambda _inputs: _inputs.mode != "light", ), ComStepConfig( name="print_areas_com", @@ -376,6 +423,12 @@ def build_com_pipeline(inputs: ExtractionInputs) -> list[ComExtractionStep]: step=step_extract_auto_page_breaks_com, enabled=lambda _inputs: _inputs.include_auto_page_breaks, ), + ComStepConfig( + name="formulas_map_com", + step=step_extract_formulas_map_com, + enabled=lambda _inputs: _inputs.include_formulas_map + and _inputs.use_com_for_formulas, + ), ComStepConfig( name="colors_map_com", step=step_extract_colors_map_com, @@ -457,6 +510,19 @@ def step_extract_print_areas_openpyxl( artifacts.print_area_data = backend.extract_print_areas() +def step_extract_formulas_map_openpyxl( + inputs: ExtractionInputs, artifacts: ExtractionArtifacts +) -> None: + """Extract formulas_map via openpyxl; logs and skips on failure. + + Args: + inputs: Pipeline inputs. + artifacts: Artifact container to update. + """ + backend = OpenpyxlBackend(inputs.file_path) + artifacts.formulas_map_data = backend.extract_formulas_map() + + def step_extract_colors_map_openpyxl( inputs: ExtractionInputs, artifacts: ExtractionArtifacts ) -> None: @@ -543,6 +609,19 @@ def step_extract_auto_page_breaks_com( artifacts.auto_page_break_data = ComBackend(workbook).extract_auto_page_breaks() +def step_extract_formulas_map_com( + inputs: ExtractionInputs, artifacts: ExtractionArtifacts, workbook: xw.Book +) -> None: + """Extract formulas_map via COM; logs and skips on failure. + + Args: + inputs: Pipeline inputs. + artifacts: Artifact container to update. + workbook: xlwings workbook instance. 
+ """ + artifacts.formulas_map_data = ComBackend(workbook).extract_formulas_map() + + def step_extract_colors_map_com( inputs: ExtractionInputs, artifacts: ExtractionArtifacts, workbook: xw.Book ) -> None: @@ -589,6 +668,26 @@ def _resolve_sheet_colors_map( return sheet_colors.colors_map +def _resolve_sheet_formulas_map( + formulas_map_data: WorkbookFormulasMap | None, sheet_name: str +) -> dict[str, list[tuple[int, int]]]: + """Resolve formulas_map for a single sheet. + + Args: + formulas_map_data: Optional workbook formulas map container. + sheet_name: Target sheet name. + + Returns: + formulas_map dictionary for the sheet, or empty dict if unavailable. + """ + if not formulas_map_data: + return {} + sheet_formulas = formulas_map_data.get_sheet(sheet_name) + if sheet_formulas is None: + return {} + return sheet_formulas.formulas_map + + def _filter_rows_excluding_merged_values( rows: list[CellRow], merged_cells: list[MergedCellRange], @@ -702,6 +801,7 @@ def collect_sheet_raw_data( include_merged_values_in_rows: bool, print_area_data: PrintAreaData | None = None, auto_page_break_data: PrintAreaData | None = None, + formulas_map_data: WorkbookFormulasMap | None = None, colors_map_data: WorkbookColorsMap | None = None, ) -> dict[str, SheetRawData]: """Collect per-sheet raw data from extraction artifacts. @@ -715,6 +815,7 @@ def collect_sheet_raw_data( mode: Extraction mode. print_area_data: Optional print area data per sheet. auto_page_break_data: Optional auto page-break data per sheet. + formulas_map_data: Optional formulas map data. colors_map_data: Optional colors map data. include_merged_values_in_rows: Whether to keep merged values in rows. 
@@ -739,6 +840,7 @@ def collect_sheet_raw_data( auto_print_areas=auto_page_break_data.get(sheet_name, []) if auto_page_break_data else [], + formulas_map=_resolve_sheet_formulas_map(formulas_map_data, sheet_name), colors_map=_resolve_sheet_colors_map(colors_map_data, sheet_name), merged_cells=merged_cells, ) @@ -797,6 +899,7 @@ def _fallback(message: str, reason: FallbackReason) -> PipelineResult: auto_page_break_data=artifacts.auto_page_break_data if inputs.include_auto_page_breaks else None, + formulas_map_data=artifacts.formulas_map_data, colors_map_data=artifacts.colors_map_data, ) raw_workbook = WorkbookRawData( @@ -844,11 +947,21 @@ def build_cells_tables_workbook( include_default_background=inputs.include_default_background, ignore_colors=inputs.ignore_colors, ) + formulas_map_data = artifacts.formulas_map_data + if ( + inputs.include_formulas_map + and formulas_map_data is None + and not inputs.use_com_for_formulas + ): + formulas_map_data = backend.extract_formulas_map() sheets: dict[str, SheetRawData] = {} for sheet_name, rows in artifacts.cell_data.items(): sheet_colors = ( colors_map_data.get_sheet(sheet_name) if colors_map_data else None ) + sheet_formulas = ( + formulas_map_data.get_sheet(sheet_name) if formulas_map_data else None + ) tables = backend.detect_tables(sheet_name) merged_cells = artifacts.merged_cell_data.get(sheet_name, []) filtered_rows = ( @@ -865,6 +978,7 @@ def build_cells_tables_workbook( if inputs.include_print_areas else [], auto_print_areas=[], + formulas_map=sheet_formulas.formulas_map if sheet_formulas else {}, colors_map=sheet_colors.colors_map if sheet_colors else {}, merged_cells=merged_cells, ) diff --git a/src/exstruct/engine.py b/src/exstruct/engine.py index 58a51e6..1d5b3b9 100644 --- a/src/exstruct/engine.py +++ b/src/exstruct/engine.py @@ -70,6 +70,7 @@ class StructOptions: before extraction. Use this to tweak table detection heuristics per engine instance without touching global state. 
include_colors_map: Whether to extract background color maps. + include_formulas_map: Whether to extract formulas map. include_merged_cells: Whether to extract merged cell ranges. include_merged_values_in_rows: Whether to keep merged values in rows. colors: Color extraction options. @@ -81,6 +82,7 @@ class StructOptions: ) include_cell_links: bool | None = None # None -> auto: verbose=True, others=False include_colors_map: bool | None = None # None -> auto: verbose=True, others=False + include_formulas_map: bool | None = None # None -> auto: verbose=True, others=False include_merged_cells: bool | None = None # None -> auto: light=False, others=True include_merged_values_in_rows: bool = True colors: ColorsOptions = field(default_factory=ColorsOptions) @@ -284,6 +286,7 @@ def _filter_sheet( if self.output.filters.include_tables else [], colors_map=sheet.colors_map, + formulas_map=sheet.formulas_map, print_areas=sheet.print_areas if include_print_areas else [], auto_print_areas=sheet.auto_print_areas if include_auto_print_areas else [], merged_cells=sheet.merged_cells @@ -358,6 +361,7 @@ def extract( include_colors_map=self.options.include_colors_map, include_default_background=self.options.colors.include_default_background, ignore_colors=self.options.colors.ignore_colors_set(), + include_formulas_map=self.options.include_formulas_map, include_merged_cells=self.options.include_merged_cells, include_merged_values_in_rows=self.options.include_merged_values_in_rows, ) diff --git a/src/exstruct/models/__init__.py b/src/exstruct/models/__init__.py index 4041aeb..b25e7a5 100644 --- a/src/exstruct/models/__init__.py +++ b/src/exstruct/models/__init__.py @@ -177,6 +177,13 @@ class SheetData(BaseModel): auto_print_areas: list[PrintArea] = Field( default_factory=list, description="COM-computed auto page-break areas." 
) + formulas_map: dict[str, list[tuple[int, int]]] = Field( + default_factory=dict, + description=( + "Mapping of formula strings to lists of (row, column) tuples " + "where row is 1-based and column is 0-based." + ), + ) colors_map: dict[str, list[tuple[int, int]]] = Field( default_factory=dict, description=( diff --git a/src/exstruct/render/__init__.py b/src/exstruct/render/__init__.py index 9c76481..4a16cc8 100644 --- a/src/exstruct/render/__init__.py +++ b/src/exstruct/render/__init__.py @@ -7,7 +7,7 @@ import shutil import tempfile from types import ModuleType -from typing import Any, cast +from typing import Protocol, cast import xlwings as xw @@ -84,51 +84,19 @@ def export_sheet_images( normalized_output_dir = Path(output_dir) normalized_output_dir.mkdir(parents=True, exist_ok=True) use_subprocess = _use_render_subprocess() - if not use_subprocess: - pdfium = cast(Any, _require_pdfium()) - else: - _require_pdfium() + pdfium = _ensure_pdfium(use_subprocess) try: with tempfile.TemporaryDirectory() as td: - written: list[Path] = [] - app: xw.App | None = None - wb: xw.Book | None = None - try: - app = _require_excel_app() - wb = app.books.open(str(normalized_excel_path)) - for sheet_index, sheet in enumerate(wb.sheets): - sheet_name = sheet.name - sheet_pdf = Path(td) / f"sheet_{sheet_index + 1:02d}.pdf" - sheet.api.ExportAsFixedFormat(0, str(sheet_pdf)) - safe_name = _sanitize_sheet_filename(sheet_name) - if use_subprocess: - written.extend( - _render_pdf_pages_subprocess( - sheet_pdf, - normalized_output_dir, - sheet_index, - safe_name, - dpi, - ) - ) - else: - written.extend( - _render_pdf_pages_in_process( - pdfium, - sheet_pdf, - normalized_output_dir, - sheet_index, - safe_name, - dpi, - ) - ) - return written - finally: - if wb is not None: - wb.close() - if app is not None: - app.quit() + temp_dir = Path(td) + return _export_sheet_images_with_app( + normalized_excel_path, + normalized_output_dir, + temp_dir, + dpi, + use_subprocess, + pdfium, + ) except 
RenderError: raise except Exception as exc: @@ -141,6 +109,287 @@ def _sanitize_sheet_filename(name: str) -> str: return "".join("_" if c in '\\/:*?"<>|' else c for c in name).strip() or "sheet" +class _PageSetupProtocol(Protocol): + PrintArea: object + + +class _SheetApiProtocol(Protocol): + PageSetup: _PageSetupProtocol + + def ExportAsFixedFormat( # noqa: N802 + self, file_format: int, output_path: str, *args: object, **kwargs: object + ) -> None: ... + + +def _iter_sheet_apis(wb: xw.Book) -> list[tuple[int, str, _SheetApiProtocol]]: + """Return sheet index, name, and COM api handle in order.""" + try: + ws_collection = getattr(getattr(wb, "api", None), "Worksheets", None) + if ws_collection is None: + raise AttributeError("Worksheets not available") + count = int(ws_collection.Count) + sheets: list[tuple[int, str, _SheetApiProtocol]] = [] + for i in range(1, count + 1): + ws_api = cast(_SheetApiProtocol, ws_collection.Item(i)) + name = str(getattr(ws_api, "Name", f"Sheet{i}")) + sheets.append((i - 1, name, ws_api)) + return sheets + except Exception: + return [ + ( + index, + sheet.name, + cast(_SheetApiProtocol, sheet.api), + ) + for index, sheet in enumerate(wb.sheets) + ] + + +def _build_sheet_export_plan( + wb: xw.Book, +) -> list[tuple[str, _SheetApiProtocol, str | None]]: + """Return export plan rows for sheets and their print areas. + + Each item is (sheet_name, sheet_api, print_area). 
+ """ + plan: list[tuple[str, _SheetApiProtocol, str | None]] = [] + for _, sheet_name, sheet_api in _iter_sheet_apis(wb): + areas = _extract_print_areas(sheet_api) + if not areas: + plan.append((sheet_name, sheet_api, None)) + continue + for area in areas: + plan.append((sheet_name, sheet_api, area)) + return plan + + +def _extract_print_areas(sheet_api: _SheetApiProtocol) -> list[str]: + """Return print areas for a sheet API, split into individual ranges.""" + try: + page_setup = getattr(sheet_api, "PageSetup", None) + if page_setup is None: + return [] + raw = str(getattr(page_setup, "PrintArea", "") or "") + except Exception: + return [] + if not raw: + return [] + return _split_csv_respecting_quotes(raw) + + +def _split_csv_respecting_quotes(raw: str) -> list[str]: + """Split a CSV-like string while keeping commas inside single quotes intact.""" + parts: list[str] = [] + buf: list[str] = [] + in_quote = False + i = 0 + while i < len(raw): + ch = raw[i] + if ch == "'": + if in_quote and i + 1 < len(raw) and raw[i + 1] == "'": + buf.append("''") + i += 2 + continue + in_quote = not in_quote + buf.append(ch) + i += 1 + continue + if ch == "," and not in_quote: + parts.append("".join(buf).strip()) + buf = [] + i += 1 + continue + buf.append(ch) + i += 1 + if buf: + parts.append("".join(buf).strip()) + return [p for p in parts if p] + + +def _rename_pages_for_print_area( + paths: list[Path], + output_dir: Path, + base_index: int, + safe_name: str, +) -> list[Path]: + """Rename multi-page outputs to unique prefixes for print areas.""" + renamed: list[Path] = [] + for path in paths: + page_index = _page_index_from_suffix(path.stem) + new_index = base_index + page_index + new_path = output_dir / f"{new_index + 1:02d}_{safe_name}.png" + if path != new_path: + path.replace(new_path) + renamed.append(new_path) + return renamed + + +def _page_index_from_suffix(stem: str) -> int: + """Extract zero-based page index from a _pNN suffix when present.""" + if "_p" not in stem: 
+ return 0 + base, suffix = stem.rsplit("_p", 1) + _ = base + if len(suffix) == 2 and suffix.isdigit(): + return int(suffix) - 1 + return 0 + + +def _export_sheet_pdf( + sheet_api: _SheetApiProtocol, + pdf_path: Path, + *, + ignore_print_areas: bool, + print_area: str | None = None, +) -> None: + """Export a sheet to PDF via Excel COM. + + Args: + sheet_api: Target worksheet COM api. + pdf_path: Output PDF path. + ignore_print_areas: Whether to ignore print areas. + print_area: Optional print area string to apply for this export. + """ + original_print_area: object | None = None + page_setup = None + if print_area is not None: + try: + page_setup = getattr(sheet_api, "PageSetup", None) + if page_setup is not None: + original_print_area = getattr(page_setup, "PrintArea", None) + page_setup.PrintArea = print_area + except Exception: + page_setup = None + try: + sheet_api.ExportAsFixedFormat( + 0, str(pdf_path), IgnorePrintAreas=ignore_print_areas + ) + except TypeError: + sheet_api.ExportAsFixedFormat(0, str(pdf_path)) + finally: + if page_setup is not None and print_area is not None: + try: + page_setup.PrintArea = original_print_area + except Exception: + return + + +def _ensure_pdfium(use_subprocess: bool) -> ModuleType | None: + """Return pdfium module when needed, or None for subprocess rendering.""" + if use_subprocess: + _require_pdfium() + return None + return _require_pdfium() + + +def _export_sheet_images_with_app( + excel_path: Path, + output_dir: Path, + temp_dir: Path, + dpi: int, + use_subprocess: bool, + pdfium: ModuleType | None, +) -> list[Path]: + """Export sheet images using Excel COM and PDF rendering.""" + written: list[Path] = [] + app: xw.App | None = None + wb: xw.Book | None = None + try: + app = _require_excel_app() + wb = app.books.open(str(excel_path)) + output_index = 0 + for sheet_name, sheet_api, print_area in _build_sheet_export_plan(wb): + sheet_pdf = temp_dir / f"sheet_{output_index + 1:02d}.pdf" + safe_name = 
_sanitize_sheet_filename(sheet_name) + _export_sheet_pdf( + sheet_api, + sheet_pdf, + ignore_print_areas=False, + print_area=print_area, + ) + sheet_paths = _render_sheet_images( + pdfium, + sheet_pdf, + output_dir, + output_index, + safe_name, + dpi, + use_subprocess, + ) + if not sheet_paths: + _export_sheet_pdf( + sheet_api, + sheet_pdf, + ignore_print_areas=True, + print_area=print_area, + ) + sheet_paths = _render_sheet_images( + pdfium, + sheet_pdf, + output_dir, + output_index, + safe_name, + dpi, + use_subprocess, + ) + sheet_paths = _normalize_multipage_paths( + sheet_paths, + output_dir, + output_index, + safe_name, + ) + written.extend(sheet_paths) + output_index += max(1, len(sheet_paths)) + return written + finally: + if wb is not None: + wb.close() + if app is not None: + app.quit() + + +def _render_sheet_images( + pdfium: ModuleType | None, + sheet_pdf: Path, + output_dir: Path, + output_index: int, + safe_name: str, + dpi: int, + use_subprocess: bool, +) -> list[Path]: + """Render sheet PDF to PNGs using the configured renderer.""" + if use_subprocess: + return _render_pdf_pages_subprocess( + sheet_pdf, + output_dir, + output_index, + safe_name, + dpi, + ) + if pdfium is None: + raise RenderError("pypdfium2 is required for in-process rendering.") + return _render_pdf_pages_in_process( + pdfium, + sheet_pdf, + output_dir, + output_index, + safe_name, + dpi, + ) + + +def _normalize_multipage_paths( + paths: list[Path], + output_dir: Path, + base_index: int, + safe_name: str, +) -> list[Path]: + """Normalize multi-page outputs to unique prefixes when needed.""" + if len(paths) <= 1: + return paths + return _rename_pages_for_print_area(paths, output_dir, base_index, safe_name) + + def _use_render_subprocess() -> bool: """Return True when PDF->PNG rendering should run in a subprocess.""" return os.getenv("EXSTRUCT_RENDER_SUBPROCESS", "1").lower() not in {"0", "false"} diff --git a/tests/backends/test_auto_page_breaks.py 
b/tests/backends/test_auto_page_breaks.py index 81e01d6..dd472d5 100644 --- a/tests/backends/test_auto_page_breaks.py +++ b/tests/backends/test_auto_page_breaks.py @@ -27,6 +27,7 @@ def fake_extract( include_colors_map: bool = False, include_default_background: bool = False, ignore_colors: set[str] | None = None, + include_formulas_map: bool | None = None, include_merged_cells: bool | None = None, include_merged_values_in_rows: bool = True, ) -> WorkbookData: diff --git a/tests/com/test_render_smoke.py b/tests/com/test_render_smoke.py index fb4a38f..d85ec52 100644 --- a/tests/com/test_render_smoke.py +++ b/tests/com/test_render_smoke.py @@ -55,15 +55,4 @@ def test_render_multiple_print_ranges_images(tmp_path: Path) -> None: images_dir = out_json.parent / f"{out_json.stem}_images" images = list(images_dir.glob("*.png")) assert images_dir.exists() - prefixes = {_strip_page_suffix(image.stem) for image in images} - assert len(prefixes) == 4 - - -def _strip_page_suffix(stem: str) -> str: - """Return the image stem without the _pNN page suffix.""" - if "_p" not in stem: - return stem - base, suffix = stem.rsplit("_p", 1) - if len(suffix) == 2 and suffix.isdigit(): - return base - return stem + assert len(images) == 4 diff --git a/tests/core/test_cells_utils.py b/tests/core/test_cells_utils.py index 64f90dc..7e0a79a 100644 --- a/tests/core/test_cells_utils.py +++ b/tests/core/test_cells_utils.py @@ -5,7 +5,11 @@ from openpyxl.worksheet.table import Table, TableStyleInfo from exstruct.core import cells -from exstruct.core.cells import _coerce_numeric_preserve_format, detect_tables_openpyxl +from exstruct.core.cells import ( + _coerce_numeric_preserve_format, + _normalize_formula_value, + detect_tables_openpyxl, +) def test_coerce_numeric_preserve_format() -> None: @@ -61,3 +65,11 @@ def test_detect_tables_openpyxl_respects_table_params( ) tables = detect_tables_openpyxl(path, "Sheet1") assert "A1:B2" in tables + + +def test_normalize_formula_value_prefers_array_text() -> 
None: + class _ArrayFormulaLike: + text = "SUM(A1:A3)" + + assert _normalize_formula_value(_ArrayFormulaLike()) == "=SUM(A1:A3)" + assert _normalize_formula_value("") is None diff --git a/tests/core/test_pipeline.py b/tests/core/test_pipeline.py index 80f45c7..dfd4a93 100644 --- a/tests/core/test_pipeline.py +++ b/tests/core/test_pipeline.py @@ -27,6 +27,8 @@ def test_build_pre_com_pipeline_respects_flags( include_colors_map=False, include_default_background=False, ignore_colors=None, + include_formulas_map=False, + use_com_for_formulas=False, include_merged_cells=False, include_merged_values_in_rows=True, ) @@ -47,6 +49,8 @@ def test_build_pre_com_pipeline_includes_colors_map_for_light( include_colors_map=True, include_default_background=False, ignore_colors=None, + include_formulas_map=False, + use_com_for_formulas=False, include_merged_cells=True, include_merged_values_in_rows=True, ) @@ -72,6 +76,8 @@ def test_build_pre_com_pipeline_skips_merged_cells_when_disabled( include_colors_map=True, include_default_background=False, ignore_colors=None, + include_formulas_map=False, + use_com_for_formulas=False, include_merged_cells=False, include_merged_values_in_rows=True, ) @@ -90,6 +96,8 @@ def test_build_com_pipeline_respects_flags(tmp_path: Path) -> None: include_colors_map=False, include_default_background=False, ignore_colors=None, + include_formulas_map=False, + use_com_for_formulas=False, include_merged_cells=False, include_merged_values_in_rows=True, ) @@ -114,6 +122,8 @@ def test_build_com_pipeline_excludes_auto_page_breaks_when_disabled( include_colors_map=False, include_default_background=False, ignore_colors=None, + include_formulas_map=False, + use_com_for_formulas=False, include_merged_cells=False, include_merged_values_in_rows=True, ) @@ -132,6 +142,8 @@ def test_build_com_pipeline_empty_for_light(tmp_path: Path) -> None: include_colors_map=True, include_default_background=False, ignore_colors=None, + include_formulas_map=False, + 
use_com_for_formulas=False, include_merged_cells=False, include_merged_values_in_rows=True, ) @@ -149,12 +161,14 @@ def test_resolve_extraction_inputs_defaults(tmp_path: Path) -> None: include_colors_map=None, include_default_background=True, ignore_colors=None, + include_formulas_map=None, include_merged_cells=None, include_merged_values_in_rows=True, ) assert inputs.include_cell_links is False assert inputs.include_print_areas is True assert inputs.include_colors_map is False + assert inputs.include_formulas_map is False assert inputs.include_default_background is False assert inputs.include_merged_cells is True @@ -171,6 +185,7 @@ def test_resolve_extraction_inputs_forces_merged_cells_when_excluding_values( include_colors_map=None, include_default_background=False, ignore_colors=None, + include_formulas_map=None, include_merged_cells=False, include_merged_values_in_rows=False, ) @@ -197,6 +212,8 @@ def fake_detect_tables(_: Path, __: str) -> list[str]: include_colors_map=False, include_default_background=False, ignore_colors=None, + include_formulas_map=False, + use_com_for_formulas=False, include_merged_cells=True, include_merged_values_in_rows=True, ) @@ -228,6 +245,8 @@ def test_build_cells_tables_workbook_excludes_merged_values_in_rows( include_colors_map=False, include_default_background=False, ignore_colors=None, + include_formulas_map=False, + use_com_for_formulas=False, include_merged_cells=True, include_merged_values_in_rows=False, ) diff --git a/tests/core/test_pipeline_fallbacks.py b/tests/core/test_pipeline_fallbacks.py index 393be5e..5099c87 100644 --- a/tests/core/test_pipeline_fallbacks.py +++ b/tests/core/test_pipeline_fallbacks.py @@ -34,6 +34,7 @@ def test_pipeline_fallback_skip_com_tests( include_colors_map=False, include_default_background=False, ignore_colors=None, + include_formulas_map=None, include_merged_cells=None, include_merged_values_in_rows=True, ) @@ -68,6 +69,7 @@ def _raise(*_args: object, **_kwargs: object) -> None: 
include_colors_map=False, include_default_background=False, ignore_colors=None, + include_formulas_map=None, include_merged_cells=None, include_merged_values_in_rows=True, ) @@ -110,6 +112,7 @@ def _raise( include_colors_map=False, include_default_background=False, ignore_colors=None, + include_formulas_map=None, include_merged_cells=None, include_merged_values_in_rows=True, ) diff --git a/tests/engine/test_engine.py b/tests/engine/test_engine.py index 9ff36ec..084ee12 100644 --- a/tests/engine/test_engine.py +++ b/tests/engine/test_engine.py @@ -34,6 +34,7 @@ def fake_extract( include_colors_map: bool = False, include_default_background: bool = False, ignore_colors: set[str] | None = None, + include_formulas_map: bool | None = None, include_merged_cells: bool | None = None, include_merged_values_in_rows: bool = True, ) -> WorkbookData: diff --git a/tests/integration/test_integrate_raw_data.py b/tests/integration/test_integrate_raw_data.py index d94507c..fcd020d 100644 --- a/tests/integration/test_integrate_raw_data.py +++ b/tests/integration/test_integrate_raw_data.py @@ -4,7 +4,12 @@ from _pytest.monkeypatch import MonkeyPatch -from exstruct.core.cells import SheetColorsMap, WorkbookColorsMap +from exstruct.core.cells import ( + SheetColorsMap, + SheetFormulasMap, + WorkbookColorsMap, + WorkbookFormulasMap, +) from exstruct.core.pipeline import collect_sheet_raw_data from exstruct.models import CellRow, Chart, ChartSeries, PrintArea, Shape @@ -52,6 +57,13 @@ def test_collect_sheet_raw_data_includes_extracted_fields( ) } ) + formulas_map = WorkbookFormulasMap( + sheets={ + "Sheet1": SheetFormulasMap( + sheet_name="Sheet1", formulas_map={"=A1": [(1, 0)]} + ) + } + ) result = collect_sheet_raw_data( cell_data={"Sheet1": [CellRow(r=1, c={"0": "A"}, links=None)]}, shape_data={"Sheet1": [Shape(text="S", l=0, t=0)]}, @@ -62,6 +74,7 @@ def test_collect_sheet_raw_data_includes_extracted_fields( include_merged_values_in_rows=True, print_area_data={"Sheet1": 
[PrintArea(r1=1, c1=0, r2=1, c2=0)]}, auto_page_break_data={"Sheet1": [PrintArea(r1=1, c1=0, r2=1, c2=0)]}, + formulas_map_data=formulas_map, colors_map_data=colors_map, ) @@ -72,6 +85,7 @@ def test_collect_sheet_raw_data_includes_extracted_fields( assert raw.table_candidates == ["A1:B2"] assert raw.print_areas assert raw.auto_print_areas + assert raw.formulas_map == {"=A1": [(1, 0)]} assert raw.colors_map == {"#FFFFFF": [(1, 0)]} @@ -98,6 +112,7 @@ def test_collect_sheet_raw_data_skips_charts_in_light_mode( include_merged_values_in_rows=True, print_area_data=None, auto_page_break_data=None, + formulas_map_data=None, colors_map_data=None, ) diff --git a/tests/models/test_modeling.py b/tests/models/test_modeling.py index c367c6a..4570b19 100644 --- a/tests/models/test_modeling.py +++ b/tests/models/test_modeling.py @@ -25,6 +25,7 @@ def test_build_workbook_data_from_raw() -> None: table_candidates=["A1:A1"], print_areas=[PrintArea(r1=1, c1=0, r2=1, c2=0)], auto_print_areas=[], + formulas_map={"=A1": [(1, 0)]}, colors_map={"#FFFFFF": [(1, 0)]}, merged_cells=[MergedCellRange(r1=1, c1=0, r2=1, c2=0, v=" ")], ) diff --git a/tests/render/test_render_init.py b/tests/render/test_render_init.py index d9b7134..9012050 100644 --- a/tests/render/test_render_init.py +++ b/tests/render/test_render_init.py @@ -292,8 +292,8 @@ def test_export_sheet_images_success( written = render.export_sheet_images(xlsx, out_dir, dpi=144) assert written[0].name == "01_Sheet_1.png" - assert written[1].name == "01_Sheet_1_p02.png" - assert written[2].name == "02_sheet.png" + assert written[1].name == "02_Sheet_1.png" + assert written[2].name == "03_sheet.png" assert all(path.exists() for path in written) From f2a9e18926ea35847b591ec0ec8e55d2cc32fe71 Mon Sep 17 00:00:00 2001 From: harumiWeb Date: Thu, 22 Jan 2026 23:26:56 +0900 Subject: [PATCH 03/12] Add formula data and interpretation for Excel JSON structure --- sample/formula/formula.json | 180 +++++++++++++++++++++++++ 
sample/formula/formula_json_for_llm.md | 85 ++++++++++++ 2 files changed, 265 insertions(+) create mode 100644 sample/formula/formula.json create mode 100644 sample/formula/formula_json_for_llm.md diff --git a/sample/formula/formula.json b/sample/formula/formula.json new file mode 100644 index 0000000..a1f3ba0 --- /dev/null +++ b/sample/formula/formula.json @@ -0,0 +1,180 @@ +{ + "book_name": "formula.xlsx", + "sheets": { + "Sheet1": { + "rows": [ + { + "r": 1, + "c": { + "0": "商品名", + "1": "定価", + "2": "数量" + } + }, + { + "r": 2, + "c": { + "0": "商品A", + "1": 800, + "2": 10 + } + }, + { + "r": 3, + "c": { + "0": "商品B", + "1": 1000, + "2": 2 + } + }, + { + "r": 4, + "c": { + "0": "商品C", + "1": 1200, + "2": 5 + } + }, + { + "r": 5, + "c": { + "0": "売上合計", + "2": 16000 + } + }, + { + "r": 8, + "c": { + "0": "学生名", + "1": "点数", + "2": "評価" + } + }, + { + "r": 9, + "c": { + "0": "山田 早苗", + "1": 86, + "2": "A" + } + }, + { + "r": 10, + "c": { + "0": "田中 太郎", + "1": 60, + "2": "C" + } + }, + { + "r": 11, + "c": { + "0": "坂本 直美", + "1": 72, + "2": "B" + } + }, + { + "r": 12, + "c": { + "0": "多田 友梨奈", + "1": 50, + "2": "D" + } + } + ], + "table_candidates": [ + "A1:C5", + "A8:C12" + ], + "formulas_map": { + "=SUM(B2:B4*C2:C4)": [ + [ + 5, + 2 + ] + ], + "=_xlfn.IFS(B9>=85,\"A\",B9>=70,\"B\",B9>=60,\"C\",TRUE,\"D\")": [ + [ + 9, + 2 + ] + ], + "=_xlfn.IFS(B10>=85,\"A\",B10>=70,\"B\",B10>=60,\"C\",TRUE,\"D\")": [ + [ + 10, + 2 + ] + ], + "=_xlfn.IFS(B11>=85,\"A\",B11>=70,\"B\",B11>=60,\"C\",TRUE,\"D\")": [ + [ + 11, + 2 + ] + ], + "=_xlfn.IFS(B12>=85,\"A\",B12>=70,\"B\",B12>=60,\"C\",TRUE,\"D\")": [ + [ + 12, + 2 + ] + ] + }, + "colors_map": { + "BDD7EE": [ + [ + 1, + 0 + ], + [ + 1, + 1 + ], + [ + 1, + 2 + ], + [ + 5, + 0 + ], + [ + 5, + 1 + ] + ], + "F8CBAD": [ + [ + 8, + 0 + ], + [ + 8, + 1 + ], + [ + 8, + 2 + ] + ] + }, + "merged_cells": { + "schema": [ + "r1", + "c1", + "r2", + "c2", + "v" + ], + "items": [ + [ + 5, + 0, + 5, + 1, + "売上合計" + ] + ] + } + } + } +} \ No 
newline at end of file diff --git a/sample/formula/formula_json_for_llm.md b/sample/formula/formula_json_for_llm.md new file mode 100644 index 0000000..5481659 --- /dev/null +++ b/sample/formula/formula_json_for_llm.md @@ -0,0 +1,85 @@ +# 📘 Overall Interpretation of the Excel JSON + +The JSON describes an Excel file named **formula.xlsx** containing a single sheet (**Sheet1**) with **two separate tables** placed in different row regions. The sheet includes data, formulas, merged cells, and color formatting. + +--- + +# 🛒 Table 1: Product Sales (Range A1:C5) + +## **Content** +| Product | Price | Quantity | +|--------|-------|----------| +| Product A | 800 | 10 | +| Product B | 1000 | 2 | +| Product C | 1200 | 5 | +| **Sales Total** (merged A5:B5) | | **16000** | + +### **Formula** +The formula: + +``` +=SUM(B2:B4*C2:C4) +``` + +is applied to cell **C5**. + +This calculates: + +- 800 × 10 = 8000 +- 1000 × 2 = 2000 +- 1200 × 5 = 6000 +- Total = **16000** + +The JSON value matches this computed result. + +### **Formatting** +- Color `BDD7EE` (light blue) is applied to: + - Header row (A1:C1) + - The “Sales Total” label area (A5:B5) +- Cells A5 and B5 are merged. + +This suggests the table is formatted as a typical summary sales table. + +--- + +# 🎓 Table 2: Student Grades (Range A8:C12) + +## **Content** +| Student | Score | Grade | +|---------|--------|--------| +| Sanae Yamada | 86 | A | +| Taro Tanaka | 60 | C | +| Naomi Sakamoto | 72 | B | +| Yurina Tada | 50 | D | + +### **Formula** +Each grade cell (column C) uses an **IFS** function: + +``` +IFS( + B>=85, "A", + B>=70, "B", + B>=60, "C", + TRUE, "D" +) +``` + +This automatically assigns a grade based on the score. + +### **Formatting** +- Color `F8CBAD` (light red) is applied to the header row (A8:C8). 
+ +--- + +# 🧩 What This Excel Sheet Appears to Be + +Based on the structure, formulas, and formatting, this sheet looks like a **practice or demonstration file** for learning Excel basics: + +- Using **SUM** with array multiplication +- Using **IFS** for conditional grading +- Creating **two independent tables** on one sheet +- Applying **header colors** +- Using **merged cells** for labels +- Demonstrating how formulas map to cell positions + +It resembles a training or sample dataset for Excel exercises. \ No newline at end of file From ca34275adcc0b3c6e1050f81a7b5ffcea9a1817c Mon Sep 17 00:00:00 2001 From: harumiWeb Date: Fri, 23 Jan 2026 08:37:09 +0900 Subject: [PATCH 04/12] Add codacy_issues.py script for fetching issues from Codacy API --- docs/README.ja.md | 2 +- docs/agents/TASKS.md | 29 +- pyproject.toml | 34 +- scripts/codacy_issues.py | 341 +++++++++++++++++++ src/exstruct/core/pipeline.py | 16 +- src/exstruct/core/workbook.py | 2 + src/exstruct/render/__init__.py | 11 +- tests/backends/test_backends.py | 33 ++ tests/core/test_cells_colors_and_tables.py | 2 +- tests/core/test_cells_utils.py | 26 ++ tests/core/test_error_handling_exceptions.py | 4 +- tests/core/test_pipeline.py | 66 ++++ tests/core/test_workbook_utils.py | 3 +- tests/models/test_models_export.py | 7 +- tests/render/test_render_init.py | 47 +++ tests/utils.py | 21 +- 16 files changed, 601 insertions(+), 43 deletions(-) create mode 100644 scripts/codacy_issues.py diff --git a/docs/README.ja.md b/docs/README.ja.md index 34137d3..e051e04 100644 --- a/docs/README.ja.md +++ b/docs/README.ja.md @@ -131,7 +131,7 @@ set_table_detection_params( - **light**: セル+テーブル候補のみ(COM 不要)。 - **standard**: テキスト付き図形+矢印、チャート(COM ありで取得)、テーブル候補。セルのハイパーリンクは `include_cell_links=True` を指定したときのみ出力。 -- **verbose**: all shapes, charts, table_candidates, hyperlinks, `colors_map`, and `formulas_map`. 
+- **verbose**: 全図形(幅高さ付き)、チャート、`table_candidates`、ハイパーリンク、`colors_map`、`formulas_map` を出力。 ## エラーハンドリング / フォールバック diff --git a/docs/agents/TASKS.md b/docs/agents/TASKS.md index afa5954..2f2105f 100644 --- a/docs/agents/TASKS.md +++ b/docs/agents/TASKS.md @@ -4,11 +4,24 @@ ## 数式取得機能追加 -- [ ] `SheetData`に`formulas_map`フィールドを追加し、シリアライズ対象に含める -- [ ] `StructOptions`に`include_formulas_map: bool = False`を追加し、verbose時の既定挙動と整合させる -- [ ] openpyxlで`data_only=False`の読み取りパスを追加し、`formulas_map`用の走査処理を実装する -- [ ] `.xls`かつ数式取得ONの場合はCOM経由で`formulas_map`を取得し、遅延警告を出す -- [ ] `formulas_map`の仕様(=付きの式文字列、空文字除外、=のみ許可、共有/配列は未展開)に沿った抽出ロジックを追加 -- [ ] openpyxlの配列数式(`ArrayFormula`)は`value.text`から式文字列を取得する分岐を追加 -- [ ] CLI/ドキュメント/READMEの出力モード説明に`formulas_map`の条件を追記する -- [ ] テスト要件に`formulas_map`関連(ON/OFF、verbose既定、.xls COM分岐)を追加する +- [x] `SheetData`に`formulas_map`フィールドを追加し、シリアライズ対象に含める +- [x] `StructOptions`に`include_formulas_map: bool = False`を追加し、verbose時の既定挙動と整合させる +- [x] openpyxlで`data_only=False`の読み取りパスを追加し、`formulas_map`用の走査処理を実装する +- [x] `.xls`かつ数式取得ONの場合はCOM経由で`formulas_map`を取得し、遅延警告を出す +- [x] `formulas_map`の仕様(=付きの式文字列、空文字除外、=のみ許可、共有/配列は未展開)に沿った抽出ロジックを追加 +- [x] openpyxlの配列数式(`ArrayFormula`)は`value.text`から式文字列を取得する分岐を追加 +- [x] CLI/ドキュメント/READMEの出力モード説明に`formulas_map`の条件を追記する +- [x] テスト要件に`formulas_map`関連(ON/OFF、verbose既定、.xls COM分岐)を追加する + +## PR #44 指摘対応 + +- [ ] `src/exstruct/render/__init__.py` の `_page_index_from_suffix` を2桁固定ではなく可変桁の数値サフィックスに対応させ、`_rename_pages_for_print_area` の上書きリスクを解消する +- [ ] `src/exstruct/render/__init__.py` の `_export_sheet_pdf` の `finally` 内 `return` を削除し、PrintArea 復元失敗はログに残して例外を握りつぶさない +- [ ] `src/exstruct/core/pipeline.py` の `step_extract_formulas_map_*` の挙動を docstring に合わせる(失敗時にログしてスキップ)か、docstring を実装に合わせて修正する +- [ ] `docs/README.ja.md` の `**verbose**` 説明行を日本語に統一する + +## PR #44 コメント/Codecov 対応 + +- [ ] Codecov パッチカバレッジ低下(60.53%)の指摘に対応し、対象ファイルの不足分テストを追加する(`src/exstruct/render/__init__.py`, `src/exstruct/core/cells.py`, 
`src/exstruct/core/backends/com_backend.py`, `src/exstruct/core/pipeline.py`, `src/exstruct/core/backends/openpyxl_backend.py`) +- [ ] Codecov の「Files with missing lines」で具体的な未カバー行を確認し、テスト観点を整理する +- [ ] Codacy 警告対応: `src/exstruct/render/__init__.py:274` の finally 内 return により例外が握りつぶされる可能性(`PyLintPython3_W0150`)を解消する diff --git a/pyproject.toml b/pyproject.toml index 7b6fe5c..4093734 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,7 +60,10 @@ omit = [ [tool.ruff] target-version = "py311" src = ["exstruct"] +fix = true +# 静的解析ルール +[tool.ruff.lint] select = [ "E", # pycodestyle errors "W", # pycodestyle warnings @@ -75,43 +78,36 @@ select = [ ] ignore = [ - "E501", # 行長は許容(Excel JSON は長くなりがち) - "B008", # Pydantic の default_factory を誤検知するため - "ANN101", # self に型を要求されてしまうため - "ANN102", # cls も同様 + "E501", # 長い行は許容(Excel JSON は長くなりがち) + "B008", # Pydantic の default_factory を使用するため + "ANN101", # self の型注釈は省略可能 + "ANN102", # cls の型注釈は省略可能 ] -fix = true - -# 型ヒントのスタイル -[tool.ruff.lint] -extend-select = ["ANN"] - -# import の並び替え設定 -[tool.ruff.isort] +# import の並び順 +[tool.ruff.lint.isort] combine-as-imports = true known-first-party = ["exstruct"] force-sort-within-sections = true -# 複雑度チェック(関数の最大複雑度) -[tool.ruff.mccabe] +# 複雑度の最大値 +[tool.ruff.lint.mccabe] max-complexity = 12 -[tool.ruff.per-file-ignores] +[tool.ruff.lint.per-file-ignores] "tests/**/*.py" = ["N802", "N803", "N806"] - [tool.mypy] packages = ["exstruct"] python_version = "3.11" -# 外部ライブラリは一切チェックしない +# 外部ライブラリの型情報がない場合は無視 ignore_missing_imports = true -# 自作コードは厳密にチェックする +# 厳格モードを有効化 strict = true -# Pydantic v2 向け +# Pydantic v2 対応 plugins = ["pydantic.mypy"] [tool.pytest.ini_options] diff --git a/scripts/codacy_issues.py b/scripts/codacy_issues.py new file mode 100644 index 0000000..06db656 --- /dev/null +++ b/scripts/codacy_issues.py @@ -0,0 +1,341 @@ +#!/usr/bin/env python3 + +from __future__ import annotations + +import argparse +from dataclasses import dataclass +import json +import os +import re 
+import subprocess +import sys +from typing import Any, cast +import urllib.parse +import urllib.request + +# ================================ +# Config +# ================================ +BASE = "https://api.codacy.com/api/v3" +BASE_URL = urllib.parse.urlparse(BASE) +BASE_PATH = BASE_URL.path.rstrip("/") # "/api/v3" +TOKEN = os.environ.get("CODACY_API_TOKEN") + +if TOKEN is None: + print("CODACY_API_TOKEN is not set", file=sys.stderr) + sys.exit(1) +TOKEN_STR = TOKEN + + +# ================================ +# Utilities +# ================================ +LEVELS = ["Error", "High", "Warning", "Info"] + + +def get_level_priority(level: str | None) -> int | None: + if level == "Error": + return 4 + if level == "High": + return 3 + if level == "Warning": + return 2 + if level == "Info": + return 1 + return None + + +def normalize_provider(value: str) -> str | None: + return value if value in ("gh", "gl", "bb") else None + + +def assert_valid_segment(name: str, value: str, pattern: re.Pattern[str]) -> str: + if (not value) or (pattern.match(value) is None): + print(f"Invalid {name}: {value}", file=sys.stderr) + sys.exit(1) + return value + + +def assert_valid_choice(name: str, value: str, choices: list[str]) -> str: + if value not in choices: + print( + f"Invalid {name}: {value}. 
Valid values: {', '.join(choices)}", + file=sys.stderr, + ) + sys.exit(1) + return value + + +def encode_segment(value: str) -> str: + return urllib.parse.quote(value, safe="") + + +def build_codacy_url(pathname: str, query: dict[str, str] | None = None) -> str: + # Ensure we keep origin and base path + url = f"{BASE_URL.scheme}://{BASE_URL.netloc}{BASE_PATH}{pathname}" + if query: + url = f"{url}?{urllib.parse.urlencode(query)}" + return url + + +def assert_codacy_url(url: str) -> str: + # Basic safety: must be same origin and start with /api/v3/analysis/ + parsed = urllib.parse.urlparse(url) + expected_origin = f"{BASE_URL.scheme}://{BASE_URL.netloc}" + origin = f"{parsed.scheme}://{parsed.netloc}" + expected_prefix = f"{BASE_PATH}/analysis/" + if origin != expected_origin or not parsed.path.startswith(expected_prefix): + print(f"Invalid URL: {url}", file=sys.stderr) + sys.exit(1) + return url + + +def build_repo_issues_url(provider: str, org: str, repo: str, limit: int) -> str: + return build_codacy_url( + f"/analysis/organizations/{encode_segment(provider)}/{encode_segment(org)}" + f"/repositories/{encode_segment(repo)}/issues/search", + query={"limit": str(limit)}, + ) + + +def build_pr_issues_url( + provider: str, org: str, repo: str, pr: str, limit: int, status: str +) -> str: + return build_codacy_url( + f"/analysis/organizations/{encode_segment(provider)}/{encode_segment(org)}" + f"/repositories/{encode_segment(repo)}/pull-requests/{encode_segment(pr)}/issues", + query={"status": status, "limit": str(limit)}, + ) + + +def run_git(cmd: list[str]) -> str | None: + try: + out = subprocess.check_output(cmd, stderr=subprocess.DEVNULL) + return out.decode("utf-8", errors="replace").strip() + except Exception: + return None + + +def get_git_origin_url() -> str | None: + # git repo check + ok = run_git(["git", "rev-parse", "--is-inside-work-tree"]) + if not ok: + return None + return run_git(["git", "remote", "get-url", "origin"]) + + +@dataclass +class 
GitRemoteInfo: + provider: str + org: str + repo: str + + +def parse_git_remote(url: str) -> GitRemoteInfo | None: + # HTTPS + m = re.match(r"^https?://([^/]+)/([^/]+)/([^/]+?)(?:\.git)?$", url) + # SSH + if not m: + m = re.match(r"^git@([^:]+):([^/]+)/([^/]+?)(?:\.git)?$", url) + + if not m: + return None + + host, org, repo = m.group(1), m.group(2), m.group(3) + + def is_same_or_subdomain(hostname: str, base_domain: str) -> bool: + return hostname == base_domain or hostname.endswith("." + base_domain) + + if is_same_or_subdomain(host, "github.com"): + provider = "gh" + elif is_same_or_subdomain(host, "gitlab.com"): + provider = "gl" + elif is_same_or_subdomain(host, "bitbucket.org"): + provider = "bb" + else: + provider = "unknown" + + return GitRemoteInfo(provider=provider, org=org, repo=repo) + + +def fetch_json( + url: str, method: str = "GET", body: dict[str, Any] | None = None +) -> dict[str, Any]: + safe_url = assert_codacy_url(url) + + headers = { + "Accept": "application/json", + "api-token": TOKEN_STR, + } + + data: bytes | None = None + if body is not None and method.upper() != "GET": + payload = json.dumps(body).encode("utf-8") + headers["Content-Type"] = "application/json" + headers["Content-Length"] = str(len(payload)) + data = payload + + req = urllib.request.Request( + safe_url, method=method.upper(), headers=headers, data=data + ) + + try: + with urllib.request.urlopen(req, timeout=60) as res: + raw = res.read().decode("utf-8", errors="replace") + status = getattr(res, "status", 0) or 0 + if status < 200 or status >= 300: + raise RuntimeError(f"HTTP {status}: {raw}") + try: + parsed = json.loads(raw) + except json.JSONDecodeError as exc: + raise RuntimeError("Invalid JSON response") from exc + if not isinstance(parsed, dict): + raise RuntimeError("Invalid JSON response") + return cast(dict[str, Any], parsed) + except urllib.error.HTTPError as e: + # include response body if possible + try: + body_text = e.read().decode("utf-8", errors="replace") + 
except Exception: + body_text = "" + raise RuntimeError(f"HTTP {e.code}: {body_text or str(e)}") from None + except urllib.error.URLError as e: + raise RuntimeError(str(e)) from None + + +# ================================ +# API +# ================================ +def fetch_repo_issues(provider: str, org: str, repo: str, limit: int) -> dict[str, Any]: + url = build_repo_issues_url(provider, org, repo, limit) + return fetch_json(url, method="POST", body={}) + + +def fetch_pr_issues( + provider: str, org: str, repo: str, pr: str, limit: int, status: str = "all" +) -> dict[str, Any]: + url = build_pr_issues_url(provider, org, repo, pr, limit, status) + return fetch_json(url, method="GET") + + +# ================================ +# AI Output Formatter +# ================================ +def format_for_ai(raw_issues: list[dict[str, Any]], min_level: str) -> list[str]: + min_priority = get_level_priority(min_level) + if min_priority is None: + print( + f"Invalid --min-level: {min_level}. Valid values: {', '.join(LEVELS)}", + file=sys.stderr, + ) + sys.exit(1) + + out: list[str] = [] + + for item in raw_issues: + issue = item.get("commitIssue") or item + + pattern_info = issue.get("patternInfo") or {} + level = pattern_info.get("level") + prio = get_level_priority(level) + if prio is None or prio < min_priority: + continue + + file_path = issue.get("filePath") + line_no = issue.get("lineNumber") + rule = pattern_info.get("id") + category = pattern_info.get("category") + message = issue.get("message") + + out.append(f"{level} | {file_path}:{line_no} | {rule} | {category} | {message}") + + return out + + +# ================================ +# CLI +# ================================ +def parse_args(argv: list[str]) -> argparse.Namespace: + p = argparse.ArgumentParser(add_help=False) + p.add_argument("org", nargs="?", default=None) + p.add_argument("repo", nargs="?", default=None) + p.add_argument("--pr", dest="pr", default=None) + p.add_argument("--min-level", 
dest="min_level", default="Info", choices=LEVELS) + p.add_argument("--provider", dest="provider", default=None) + p.add_argument("--help", action="help", help="Show this help message and exit") + return p.parse_args(argv) + + +def main() -> int: + args = parse_args(sys.argv[1:]) + + # --- Git auto-detect --- + if not args.org or not args.repo: + origin_url = get_git_origin_url() + if origin_url: + parsed = parse_git_remote(origin_url) + if parsed: + if args.provider is None: + args.provider = parsed.provider + if args.org is None: + args.org = parsed.org + if args.repo is None: + args.repo = parsed.repo + + if args.provider is None: + args.provider = "gh" + + provider = normalize_provider(args.provider) + if not provider: + print("Invalid --provider: use gh, gl, or bb", file=sys.stderr) + return 1 + + if not args.org or not args.repo: + print( + "Usage:\n" + " python codacy_issues.py ORG REPO [--pr NUMBER] [--min-level Error|High|Warning|Info] [--provider gh|gl|bb]", + file=sys.stderr, + ) + return 1 + + segment_pattern = re.compile(r"^[A-Za-z0-9_.-]+$") + org = assert_valid_segment("org", args.org, segment_pattern) + repo = assert_valid_segment("repo", args.repo, segment_pattern) + pr = args.pr + if pr is not None: + pr = assert_valid_segment("pr", pr, re.compile(r"^[0-9]+$")) + + status = assert_valid_choice("status", "all", ["all", "open", "closed"]) + limit = 100 + + result = ( + fetch_pr_issues( + provider=provider, org=org, repo=repo, pr=pr, limit=limit, status=status + ) + if pr + else fetch_repo_issues(provider=provider, org=org, repo=repo, limit=limit) + ) + + issues = result.get("data") or [] + formatted = format_for_ai(issues, args.min_level) + + payload = { + "scope": "pull_request" if pr else "repository", + "organization": org, + "repository": repo, + "pullRequest": pr if pr else None, + "minLevel": args.min_level, + "total": len(formatted), + "issues": formatted, + } + + sys.stdout.write(json.dumps(payload, ensure_ascii=False, indent=2) + "\n") + 
return 0 + + +if __name__ == "__main__": + try: + raise SystemExit(main()) + except Exception as e: + print(str(e), file=sys.stderr) + raise SystemExit(1) from e diff --git a/src/exstruct/core/pipeline.py b/src/exstruct/core/pipeline.py index 98aa053..ff46dbe 100644 --- a/src/exstruct/core/pipeline.py +++ b/src/exstruct/core/pipeline.py @@ -520,7 +520,13 @@ def step_extract_formulas_map_openpyxl( artifacts: Artifact container to update. """ backend = OpenpyxlBackend(inputs.file_path) - artifacts.formulas_map_data = backend.extract_formulas_map() + try: + artifacts.formulas_map_data = backend.extract_formulas_map() + except Exception as exc: + logger.warning( + "Failed to extract formulas_map via openpyxl. (%r)", + exc, + ) def step_extract_colors_map_openpyxl( @@ -619,7 +625,13 @@ def step_extract_formulas_map_com( artifacts: Artifact container to update. workbook: xlwings workbook instance. """ - artifacts.formulas_map_data = ComBackend(workbook).extract_formulas_map() + try: + artifacts.formulas_map_data = ComBackend(workbook).extract_formulas_map() + except Exception as exc: + logger.warning( + "Failed to extract formulas_map via COM. 
(%r)", + exc, + ) def step_extract_colors_map_com( diff --git a/src/exstruct/core/workbook.py b/src/exstruct/core/workbook.py index 4a7568b..3f33822 100644 --- a/src/exstruct/core/workbook.py +++ b/src/exstruct/core/workbook.py @@ -12,6 +12,8 @@ logger = logging.getLogger(__name__) +__all__ = ["openpyxl_workbook", "xlwings_workbook", "_find_open_workbook", "xw"] + @contextmanager def openpyxl_workbook( diff --git a/src/exstruct/render/__init__.py b/src/exstruct/render/__init__.py index 4a16cc8..f004e2f 100644 --- a/src/exstruct/render/__init__.py +++ b/src/exstruct/render/__init__.py @@ -230,8 +230,11 @@ def _page_index_from_suffix(stem: str) -> int: return 0 base, suffix = stem.rsplit("_p", 1) _ = base - if len(suffix) == 2 and suffix.isdigit(): - return int(suffix) - 1 + if suffix.isdigit(): + page_number = int(suffix) + if page_number <= 0: + return 0 + return page_number - 1 return 0 @@ -270,8 +273,8 @@ def _export_sheet_pdf( if page_setup is not None and print_area is not None: try: page_setup.PrintArea = original_print_area - except Exception: - return + except Exception as exc: + logger.debug("Failed to restore PrintArea. 
(%r)", exc) def _ensure_pdfium(use_subprocess: bool) -> ModuleType | None: diff --git a/tests/backends/test_backends.py b/tests/backends/test_backends.py index 7326fc3..cfbd71b 100644 --- a/tests/backends/test_backends.py +++ b/tests/backends/test_backends.py @@ -75,6 +75,21 @@ def fake_colors_map( ) +def test_openpyxl_backend_extract_formulas_map_returns_none_on_failure( + monkeypatch: MonkeyPatch, tmp_path: Path +) -> None: + def fake_formulas_map(file_path: Path) -> object: + raise RuntimeError("boom") + + monkeypatch.setattr( + "exstruct.core.backends.openpyxl_backend.extract_sheet_formulas_map", + fake_formulas_map, + ) + + backend = OpenpyxlBackend(tmp_path / "book.xlsx") + assert backend.extract_formulas_map() is None + + def test_com_backend_extract_colors_map_returns_none_on_failure( monkeypatch: MonkeyPatch, ) -> None: @@ -101,6 +116,24 @@ class DummyWorkbook: ) +def test_com_backend_extract_formulas_map_returns_none_on_failure( + monkeypatch: MonkeyPatch, +) -> None: + def fake_formulas_map(workbook: object) -> object: + raise RuntimeError("boom") + + monkeypatch.setattr( + "exstruct.core.backends.com_backend.extract_sheet_formulas_map_com", + fake_formulas_map, + ) + + class DummyWorkbook: + pass + + backend = ComBackend(DummyWorkbook()) + assert backend.extract_formulas_map() is None + + def test_com_backend_extract_print_areas_handles_sheet_error( monkeypatch: MonkeyPatch, ) -> None: diff --git a/tests/core/test_cells_colors_and_tables.py b/tests/core/test_cells_colors_and_tables.py index edf5d24..8469c4f 100644 --- a/tests/core/test_cells_colors_and_tables.py +++ b/tests/core/test_cells_colors_and_tables.py @@ -138,7 +138,7 @@ def test_table_signal_score_prefers_header_and_coverage() -> None: def test_count_nonempty_cells() -> None: """非空セル数のカウントを確認する。""" - values = [["", None, "x"], ["y", " ", 0]] + values: list[list[object]] = [["", None, "x"], ["y", " ", 0]] assert _count_nonempty_cells(values) == 3 diff --git a/tests/core/test_cells_utils.py 
b/tests/core/test_cells_utils.py index 7e0a79a..fa649f3 100644 --- a/tests/core/test_cells_utils.py +++ b/tests/core/test_cells_utils.py @@ -7,8 +7,10 @@ from exstruct.core import cells from exstruct.core.cells import ( _coerce_numeric_preserve_format, + _normalize_formula_from_com, _normalize_formula_value, detect_tables_openpyxl, + extract_sheet_formulas_map, ) @@ -73,3 +75,27 @@ class _ArrayFormulaLike: assert _normalize_formula_value(_ArrayFormulaLike()) == "=SUM(A1:A3)" assert _normalize_formula_value("") is None + + +def test_extract_sheet_formulas_map_collects_formulas(tmp_path: Path) -> None: + path = tmp_path / "formulas.xlsx" + wb = Workbook() + ws = wb.active + ws.title = "Sheet1" + ws["A1"] = 1 + ws["A2"] = 2 + ws["B1"] = "=SUM(A1:A2)" + wb.save(path) + wb.close() + + result = extract_sheet_formulas_map(path) + sheet = result.get_sheet("Sheet1") + assert sheet is not None + assert sheet.formulas_map == {"=SUM(A1:A2)": [(1, 1)]} + + +def test_normalize_formula_from_com() -> None: + assert _normalize_formula_from_com("=A1") == "=A1" + assert _normalize_formula_from_com("A1") is None + assert _normalize_formula_from_com("") is None + assert _normalize_formula_from_com(None) is None diff --git a/tests/core/test_error_handling_exceptions.py b/tests/core/test_error_handling_exceptions.py index 95bad4a..542284f 100644 --- a/tests/core/test_error_handling_exceptions.py +++ b/tests/core/test_error_handling_exceptions.py @@ -2,6 +2,7 @@ import importlib from pathlib import Path +from typing import Literal, cast import pytest @@ -24,8 +25,9 @@ def _minimal_workbook() -> WorkbookData: def test_serialize_workbook_unsupported_format() -> None: """Unsupported formats should raise SerializationError.""" workbook = _minimal_workbook() + invalid_format = cast(Literal["json", "yaml", "yml", "toon"], "invalid") with pytest.raises(SerializationError): - serialize_workbook(workbook, fmt="invalid") + serialize_workbook(workbook, fmt=invalid_format) def 
test_save_as_yaml_missing_dependency( diff --git a/tests/core/test_pipeline.py b/tests/core/test_pipeline.py index dfd4a93..9b4a917 100644 --- a/tests/core/test_pipeline.py +++ b/tests/core/test_pipeline.py @@ -1,7 +1,11 @@ +import logging from pathlib import Path from _pytest.monkeypatch import MonkeyPatch +import pytest +from exstruct.core.backends.com_backend import ComBackend +from exstruct.core.backends.openpyxl_backend import OpenpyxlBackend from exstruct.core.cells import MergedCellRange from exstruct.core.pipeline import ( ExtractionArtifacts, @@ -11,6 +15,8 @@ build_com_pipeline, build_pre_com_pipeline, resolve_extraction_inputs, + step_extract_formulas_map_com, + step_extract_formulas_map_openpyxl, ) from exstruct.models import CellRow, PrintArea @@ -278,3 +284,63 @@ def test_filter_rows_excluding_merged_values_drops_empty_rows() -> None: merged_cells = [MergedCellRange(r1=1, c1=0, r2=1, c2=0, v="A")] filtered = _filter_rows_excluding_merged_values(rows, merged_cells) assert filtered == [] + + +def test_step_extract_formulas_map_openpyxl_skips_on_failure( + tmp_path: Path, monkeypatch: MonkeyPatch, caplog: "pytest.LogCaptureFixture" +) -> None: + def _raise(_: OpenpyxlBackend) -> object: + raise RuntimeError("boom") + + monkeypatch.setattr(OpenpyxlBackend, "extract_formulas_map", _raise) + inputs = ExtractionInputs( + file_path=tmp_path / "book.xlsx", + mode="standard", + include_cell_links=False, + include_print_areas=False, + include_auto_page_breaks=False, + include_colors_map=False, + include_default_background=False, + ignore_colors=None, + include_formulas_map=True, + use_com_for_formulas=False, + include_merged_cells=False, + include_merged_values_in_rows=True, + ) + artifacts = ExtractionArtifacts() + + with caplog.at_level(logging.WARNING): + step_extract_formulas_map_openpyxl(inputs, artifacts) + + assert artifacts.formulas_map_data is None + assert "Failed to extract formulas_map via openpyxl" in caplog.text + + +def 
test_step_extract_formulas_map_com_skips_on_failure( + tmp_path: Path, monkeypatch: MonkeyPatch, caplog: "pytest.LogCaptureFixture" +) -> None: + def _raise(_: ComBackend) -> object: + raise RuntimeError("boom") + + monkeypatch.setattr(ComBackend, "extract_formulas_map", _raise) + inputs = ExtractionInputs( + file_path=tmp_path / "book.xlsx", + mode="standard", + include_cell_links=False, + include_print_areas=False, + include_auto_page_breaks=False, + include_colors_map=False, + include_default_background=False, + ignore_colors=None, + include_formulas_map=True, + use_com_for_formulas=True, + include_merged_cells=False, + include_merged_values_in_rows=True, + ) + artifacts = ExtractionArtifacts() + + with caplog.at_level(logging.WARNING): + step_extract_formulas_map_com(inputs, artifacts, object()) + + assert artifacts.formulas_map_data is None + assert "Failed to extract formulas_map via COM" in caplog.text diff --git a/tests/core/test_workbook_utils.py b/tests/core/test_workbook_utils.py index 002076d..b43a9a4 100644 --- a/tests/core/test_workbook_utils.py +++ b/tests/core/test_workbook_utils.py @@ -2,6 +2,7 @@ from collections.abc import Iterator from pathlib import Path +from typing import cast import pytest @@ -87,7 +88,7 @@ class _DummyApp: monkeypatch.setattr(workbook.xw, "apps", [_DummyApp()]) file_path = _DummyPath("good") - assert workbook._find_open_workbook(file_path) is None + assert workbook._find_open_workbook(cast(Path, file_path)) is None def test_find_open_workbook_returns_none_on_iter_error( diff --git a/tests/models/test_models_export.py b/tests/models/test_models_export.py index b318d7f..57d72b7 100644 --- a/tests/models/test_models_export.py +++ b/tests/models/test_models_export.py @@ -1,6 +1,8 @@ +from collections.abc import Callable from importlib import util import json from pathlib import Path +from typing import Any, cast import pytest @@ -16,6 +18,7 @@ HAS_PYYAML = util.find_spec("yaml") is not None HAS_TOON = util.find_spec("toon") is 
not None +_SkipIf = Callable[[Callable[..., Any]], Callable[..., Any]] def _sheet() -> SheetData: @@ -68,7 +71,7 @@ def test_save_unsupported_format_raises(tmp_path: Path) -> None: # pytest.skipif is typed; no ignore needed -@pytest.mark.skipif(not HAS_PYYAML, reason="pyyaml not installed") # type: ignore[misc] +@cast(_SkipIf, pytest.mark.skipif(not HAS_PYYAML, reason="pyyaml not installed")) def test_sheet_to_yaml_roundtrip() -> None: sheet = _sheet() text = sheet.to_yaml() @@ -76,7 +79,7 @@ def test_sheet_to_yaml_roundtrip() -> None: assert "SheetData" not in text # not a repr -@pytest.mark.skipif(not HAS_PYYAML, reason="pyyaml not installed") # type: ignore[misc] +@cast(_SkipIf, pytest.mark.skipif(not HAS_PYYAML, reason="pyyaml not installed")) def test_workbook_to_yaml() -> None: wb = _workbook() text = wb.to_yaml() diff --git a/tests/render/test_render_init.py b/tests/render/test_render_init.py index 9012050..eb2708d 100644 --- a/tests/render/test_render_init.py +++ b/tests/render/test_render_init.py @@ -541,3 +541,50 @@ def test_sanitize_sheet_filename() -> None: """_sanitize_sheet_filename replaces invalid characters and defaults.""" assert render._sanitize_sheet_filename("Sheet/1") == "Sheet_1" assert render._sanitize_sheet_filename(" ") == "sheet" + + +def test_page_index_from_suffix_handles_multi_digits() -> None: + assert render._page_index_from_suffix("sheet_01") == 0 + assert render._page_index_from_suffix("sheet_01_p01") == 0 + assert render._page_index_from_suffix("sheet_01_p10") == 9 + assert render._page_index_from_suffix("sheet_01_p100") == 99 + assert render._page_index_from_suffix("sheet_01_p0") == 0 + + +def test_export_sheet_pdf_does_not_swallow_export_errors(tmp_path: Path) -> None: + class _FlakyPageSetup(render._PageSetupProtocol): + def __init__(self) -> None: + self._print_area: object = "A1" + self._set_calls = 0 + + @property + def PrintArea(self) -> object: + return self._print_area + + @PrintArea.setter + def PrintArea(self, value: 
object) -> None: + if self._set_calls >= 1: + raise RuntimeError("restore failed") + self._print_area = value + self._set_calls += 1 + + class _ExplodingSheetApi: + PageSetup: render._PageSetupProtocol = _FlakyPageSetup() + + def ExportAsFixedFormat( + self, file_format: int, output_path: str, *args: object, **kwargs: object + ) -> None: + _ = file_format + _ = output_path + _ = args + _ = kwargs + raise RuntimeError("export failed") + + pdf_path = tmp_path / "out.pdf" + with pytest.raises(RuntimeError, match="export failed"): + render._export_sheet_pdf( + _ExplodingSheetApi(), + pdf_path, + ignore_print_areas=False, + print_area="A1:B2", + ) diff --git a/tests/utils.py b/tests/utils.py index 965eb79..da3e063 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1,5 +1,5 @@ -from collections.abc import Callable -from typing import TypeVar, cast +from collections.abc import Callable, Iterable, Sequence +from typing import Literal, TypeVar, cast import pytest from typing_extensions import ParamSpec @@ -9,10 +9,23 @@ def parametrize( - *args: object, **kwargs: object + argnames: str | Sequence[str], + argvalues: Iterable[object], + *, + indirect: bool | Sequence[str] = False, + ids: Iterable[str | float | int | bool | None] + | Callable[[object], object | None] + | None = None, + scope: Literal["session", "package", "module", "class", "function"] | None = None, ) -> Callable[[Callable[P, R]], Callable[P, R]]: """Type-safe wrapper around pytest.mark.parametrize.""" return cast( Callable[[Callable[P, R]], Callable[P, R]], - pytest.mark.parametrize(*args, **kwargs), + pytest.mark.parametrize( + argnames, + argvalues, + indirect=indirect, + ids=ids, + scope=scope, + ), ) From 493421d7070a11f69c9315cb60dbbefa5ce449d0 Mon Sep 17 00:00:00 2001 From: harumiWeb Date: Fri, 23 Jan 2026 08:38:11 +0900 Subject: [PATCH 05/12] check --- docs/agents/TASKS.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/agents/TASKS.md b/docs/agents/TASKS.md index 
2f2105f..aa24090 100644 --- a/docs/agents/TASKS.md +++ b/docs/agents/TASKS.md @@ -15,13 +15,13 @@ ## PR #44 指摘対応 -- [ ] `src/exstruct/render/__init__.py` の `_page_index_from_suffix` を2桁固定ではなく可変桁の数値サフィックスに対応させ、`_rename_pages_for_print_area` の上書きリスクを解消する -- [ ] `src/exstruct/render/__init__.py` の `_export_sheet_pdf` の `finally` 内 `return` を削除し、PrintArea 復元失敗はログに残して例外を握りつぶさない -- [ ] `src/exstruct/core/pipeline.py` の `step_extract_formulas_map_*` の挙動を docstring に合わせる(失敗時にログしてスキップ)か、docstring を実装に合わせて修正する -- [ ] `docs/README.ja.md` の `**verbose**` 説明行を日本語に統一する +- [x] `src/exstruct/render/__init__.py` の `_page_index_from_suffix` を2桁固定ではなく可変桁の数値サフィックスに対応させ、`_rename_pages_for_print_area` の上書きリスクを解消する +- [x] `src/exstruct/render/__init__.py` の `_export_sheet_pdf` の `finally` 内 `return` を削除し、PrintArea 復元失敗はログに残して例外を握りつぶさない +- [x] `src/exstruct/core/pipeline.py` の `step_extract_formulas_map_*` の挙動を docstring に合わせる(失敗時にログしてスキップ)か、docstring を実装に合わせて修正する +- [x] `docs/README.ja.md` の `**verbose**` 説明行を日本語に統一する ## PR #44 コメント/Codecov 対応 -- [ ] Codecov パッチカバレッジ低下(60.53%)の指摘に対応し、対象ファイルの不足分テストを追加する(`src/exstruct/render/__init__.py`, `src/exstruct/core/cells.py`, `src/exstruct/core/backends/com_backend.py`, `src/exstruct/core/pipeline.py`, `src/exstruct/core/backends/openpyxl_backend.py`) +- [x] Codecov パッチカバレッジ低下(60.53%)の指摘に対応し、対象ファイルの不足分テストを追加する(`src/exstruct/render/__init__.py`, `src/exstruct/core/cells.py`, `src/exstruct/core/backends/com_backend.py`, `src/exstruct/core/pipeline.py`, `src/exstruct/core/backends/openpyxl_backend.py`) - [ ] Codecov の「Files with missing lines」で具体的な未カバー行を確認し、テスト観点を整理する -- [ ] Codacy 警告対応: `src/exstruct/render/__init__.py:274` の finally 内 return により例外が握りつぶされる可能性(`PyLintPython3_W0150`)を解消する +- [x] Codacy 警告対応: `src/exstruct/render/__init__.py:274` の finally 内 return により例外が握りつぶされる可能性(`PyLintPython3_W0150`)を解消する From 8aa23cb770f8d9a6d20e51b1bb30af7e59a2ecfc Mon Sep 17 00:00:00 2001 From: harumiWeb Date: Fri, 23 Jan 2026 09:09:15 +0900 Subject: [PATCH 
06/12] =?UTF-8?q?=E6=95=B0=E5=BC=8F=E5=8F=96=E5=BE=97?= =?UTF-8?q?=E6=A9=9F=E8=83=BD=E3=81=AE=E3=83=86=E3=82=B9=E3=83=88=E3=82=92?= =?UTF-8?q?=E8=BF=BD=E5=8A=A0=E3=81=97=E3=80=81Codecov=E3=81=AE=E8=AD=A6?= =?UTF-8?q?=E5=91=8A=E3=81=AB=E5=AF=BE=E5=BF=9C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/agents/TASKS.md | 2 +- tests/backends/test_backends.py | 156 +++++- tests/backends/test_print_areas_openpyxl.py | 59 ++- tests/core/test_cells_utils.py | 56 ++ tests/core/test_mode_output.py | 11 +- tests/core/test_pipeline.py | 539 +++++++++++++++++++- tests/render/test_render_init.py | 199 +++++++- 7 files changed, 1012 insertions(+), 10 deletions(-) diff --git a/docs/agents/TASKS.md b/docs/agents/TASKS.md index aa24090..1c1a3ae 100644 --- a/docs/agents/TASKS.md +++ b/docs/agents/TASKS.md @@ -23,5 +23,5 @@ ## PR #44 コメント/Codecov 対応 - [x] Codecov パッチカバレッジ低下(60.53%)の指摘に対応し、対象ファイルの不足分テストを追加する(`src/exstruct/render/__init__.py`, `src/exstruct/core/cells.py`, `src/exstruct/core/backends/com_backend.py`, `src/exstruct/core/pipeline.py`, `src/exstruct/core/backends/openpyxl_backend.py`) -- [ ] Codecov の「Files with missing lines」で具体的な未カバー行を確認し、テスト観点を整理する +- [x] Codecov の「Files with missing lines」で具体的な未カバー行を確認し、テスト観点を整理する - [x] Codacy 警告対応: `src/exstruct/render/__init__.py:274` の finally 内 return により例外が握りつぶされる可能性(`PyLintPython3_W0150`)を解消する diff --git a/tests/backends/test_backends.py b/tests/backends/test_backends.py index cfbd71b..8de6f1a 100644 --- a/tests/backends/test_backends.py +++ b/tests/backends/test_backends.py @@ -3,7 +3,7 @@ from _pytest.monkeypatch import MonkeyPatch from openpyxl import Workbook -from exstruct.core.backends.com_backend import ComBackend +from exstruct.core.backends.com_backend import ComBackend, _parse_print_area_range from exstruct.core.backends.openpyxl_backend import OpenpyxlBackend from exstruct.core.ranges import parse_range_zero_based @@ -175,6 +175,19 @@ def 
test_openpyxl_backend_extract_print_areas(tmp_path: Path) -> None: assert areas["Sheet1"][0].c1 == 0 +def test_openpyxl_backend_extract_print_areas_returns_empty_on_error( + monkeypatch: MonkeyPatch, tmp_path: Path +) -> None: + def _raise(*_args: object, **_kwargs: object) -> None: + raise RuntimeError("boom") + + monkeypatch.setattr( + "exstruct.core.backends.openpyxl_backend.openpyxl_workbook", _raise + ) + backend = OpenpyxlBackend(tmp_path / "book.xlsx") + assert backend.extract_print_areas() == {} + + def test_parse_range_zero_based_parses_sheet_prefix() -> None: bounds = parse_range_zero_based("Sheet1!A1:B2") assert bounds is not None @@ -182,3 +195,144 @@ def test_parse_range_zero_based_parses_sheet_prefix() -> None: assert bounds.c1 == 0 assert bounds.r2 == 1 assert bounds.c2 == 1 + + +def test_com_backend_extract_print_areas_success() -> None: + class _PageSetup: + PrintArea = "A1:B2,INVALID" + + class _SheetApi: + PageSetup = _PageSetup() + + class _Sheet: + name = "Sheet1" + api = _SheetApi() + + class _DummyWorkbook: + sheets = [_Sheet()] + + backend = ComBackend(_DummyWorkbook()) + areas = backend.extract_print_areas() + assert "Sheet1" in areas + assert areas["Sheet1"][0].r1 == 1 + assert areas["Sheet1"][0].c1 == 0 + assert areas["Sheet1"][0].r2 == 2 + assert areas["Sheet1"][0].c2 == 1 + + +def test_com_backend_parse_print_area_range_invalid() -> None: + assert _parse_print_area_range("INVALID") is None + + +class _Location: + def __init__(self, row: int | None = None, col: int | None = None) -> None: + self.Row = row + self.Column = col + + +class _BreakItem: + def __init__(self, row: int | None = None, col: int | None = None) -> None: + self.Location = _Location(row=row, col=col) + + +class _Breaks: + def __init__(self, items: list[_BreakItem]) -> None: + self._items = items + self.Count = len(items) + + def Item(self, index: int) -> _BreakItem: + return self._items[index - 1] + + +class _RangeRows: + def __init__(self, count: int) -> None: + 
self.Count = count + + +class _RangeCols: + def __init__(self, count: int) -> None: + self.Count = count + + +class _Range: + Row = 1 + Column = 1 + Rows = _RangeRows(2) + Columns = _RangeCols(2) + + +class _UsedRange: + Address = "A1:B2" + + +class _PageSetup: + PrintArea = "A1:B2" + + +class _SheetApi: + def __init__(self) -> None: + self.DisplayPageBreaks = False + self.PageSetup = _PageSetup() + self.UsedRange = _UsedRange() + self.HPageBreaks = _Breaks([_BreakItem(row=2)]) + self.VPageBreaks = _Breaks([_BreakItem(col=2)]) + + def Range(self, _addr: str) -> _Range: + return _Range() + + +class _Sheet: + name = "Sheet1" + + def __init__(self) -> None: + self.api = _SheetApi() + + +class _DummyWorkbook: + sheets = [_Sheet()] + + +def test_com_backend_extract_auto_page_breaks_success() -> None: + backend = ComBackend(_DummyWorkbook()) + areas = backend.extract_auto_page_breaks() + assert "Sheet1" in areas + assert areas["Sheet1"] + + +class _RestoreErrorSheetApi: + def __init__(self) -> None: + self._display = False + self.PageSetup = _PageSetup() + self.UsedRange = _UsedRange() + self.HPageBreaks = _Breaks([]) + self.VPageBreaks = _Breaks([]) + + @property + def DisplayPageBreaks(self) -> bool: + return self._display + + @DisplayPageBreaks.setter + def DisplayPageBreaks(self, value: bool) -> None: + if value is False: + raise RuntimeError("restore failed") + self._display = value + + def Range(self, _addr: str) -> _Range: + return _Range() + + +class _RestoreErrorSheet: + name = "Sheet1" + + def __init__(self) -> None: + self.api = _RestoreErrorSheetApi() + + +class _RestoreErrorWorkbook: + sheets = [_RestoreErrorSheet()] + + +def test_com_backend_extract_auto_page_breaks_restore_error() -> None: + backend = ComBackend(_RestoreErrorWorkbook()) + areas = backend.extract_auto_page_breaks() + assert "Sheet1" in areas diff --git a/tests/backends/test_print_areas_openpyxl.py b/tests/backends/test_print_areas_openpyxl.py index 99a4970..38d110d 100644 --- 
a/tests/backends/test_print_areas_openpyxl.py +++ b/tests/backends/test_print_areas_openpyxl.py @@ -3,7 +3,13 @@ from openpyxl import Workbook from exstruct import extract -from exstruct.core.backends.openpyxl_backend import OpenpyxlBackend +from exstruct.core.backends.openpyxl_backend import ( + OpenpyxlBackend, + _append_print_areas, + _extract_print_areas_from_defined_names, + _extract_print_areas_from_sheet_props, + _parse_print_area_range, +) def _make_book_with_print_area(path: Path) -> None: @@ -44,3 +50,54 @@ def test_openpyxl_backend_multiple_print_areas(tmp_path: Path) -> None: assert "Sheet1" in areas ranges = [(a.r1, a.c1, a.r2, a.c2) for a in areas["Sheet1"]] assert ranges == [(1, 0, 2, 1), (3, 3, 4, 4)] + + +def test_extract_print_areas_from_defined_names_filters_unknown_sheets() -> None: + class _DefinedArea: + destinations = [("Sheet1", "A1:B2"), ("Unknown", "C1:D2")] + + class _DefinedNames: + def get(self, _name: str) -> _DefinedArea: + return _DefinedArea() + + class _DummyWorkbook: + defined_names = _DefinedNames() + sheetnames = ["Sheet1"] + + areas = _extract_print_areas_from_defined_names(_DummyWorkbook()) + assert "Sheet1" in areas + assert "Unknown" not in areas + + +def test_extract_print_areas_from_defined_names_without_defined_names() -> None: + class _DummyWorkbook: + defined_names = None + + assert _extract_print_areas_from_defined_names(_DummyWorkbook()) == {} + + +def test_extract_print_areas_from_sheet_props_skips_empty() -> None: + class _SheetEmpty: + title = "Sheet1" + _print_area = None + + class _SheetWithArea: + title = "Sheet2" + _print_area = "A1:B2" + + class _DummyWorkbook: + worksheets = [_SheetEmpty(), _SheetWithArea()] + + areas = _extract_print_areas_from_sheet_props(_DummyWorkbook()) + assert "Sheet2" in areas + + +def test_parse_print_area_range_invalid() -> None: + assert _parse_print_area_range("INVALID") is None + + +def test_append_print_areas_skips_invalid_ranges() -> None: + areas: dict[str, list[object]] = {} 
+ _append_print_areas(areas, "Sheet1", "A1:B2,INVALID") + assert "Sheet1" in areas + assert len(areas["Sheet1"]) == 1 diff --git a/tests/core/test_cells_utils.py b/tests/core/test_cells_utils.py index fa649f3..69bf9da 100644 --- a/tests/core/test_cells_utils.py +++ b/tests/core/test_cells_utils.py @@ -11,6 +11,7 @@ _normalize_formula_value, detect_tables_openpyxl, extract_sheet_formulas_map, + extract_sheet_formulas_map_com, ) @@ -99,3 +100,58 @@ def test_normalize_formula_from_com() -> None: assert _normalize_formula_from_com("A1") is None assert _normalize_formula_from_com("") is None assert _normalize_formula_from_com(None) is None + + +def test_extract_sheet_formulas_map_com_empty_range() -> None: + class _DummyLastCell: + row = 0 + column = 0 + + class _DummyUsedRange: + row = 1 + column = 1 + last_cell = _DummyLastCell() + + class _DummySheet: + name = "Sheet1" + used_range = _DummyUsedRange() + + class _DummyWorkbook: + sheets = [_DummySheet()] + + result = extract_sheet_formulas_map_com(_DummyWorkbook()) + sheet = result.get_sheet("Sheet1") + assert sheet is not None + assert sheet.formulas_map == {} + + +def test_extract_sheet_formulas_map_com_collects_formulas() -> None: + class _DummyLastCell: + row = 2 + column = 2 + + class _DummyUsedRange: + row = 1 + column = 1 + last_cell = _DummyLastCell() + + class _DummyRange: + formula = [["=A1", "B1"], ["=SUM(A1)", ""]] + + class _DummySheet: + name = "Sheet1" + used_range = _DummyUsedRange() + + def range(self, _start: object, _end: object) -> _DummyRange: + return _DummyRange() + + class _DummyWorkbook: + sheets = [_DummySheet()] + + result = extract_sheet_formulas_map_com(_DummyWorkbook()) + sheet = result.get_sheet("Sheet1") + assert sheet is not None + assert sheet.formulas_map == { + "=A1": [(1, 0)], + "=SUM(A1)": [(2, 0)], + } diff --git a/tests/core/test_mode_output.py b/tests/core/test_mode_output.py index f90bb5d..fbcfdaa 100644 --- a/tests/core/test_mode_output.py +++ b/tests/core/test_mode_output.py 
@@ -1,7 +1,8 @@ +import os from pathlib import Path import subprocess import sys -from typing import Never +from typing import Never, cast from _pytest.capture import CaptureFixture from _pytest.monkeypatch import MonkeyPatch @@ -9,7 +10,7 @@ import pytest import xlwings as xw -from exstruct import extract, process_excel +from exstruct import ExtractionMode, extract, process_excel from exstruct.models import Arrow @@ -29,6 +30,8 @@ def _make_basic_book(path: Path) -> None: def _ensure_excel() -> None: + if os.getenv("SKIP_COM_TESTS"): + pytest.skip("SKIP_COM_TESTS is set; skipping Excel-dependent test.") try: app = xw.App(add_book=False, visible=False) app.quit() @@ -115,11 +118,11 @@ def test_invalidモードはエラーになる(tmp_path: Path) -> None: path = tmp_path / "book.xlsx" _make_basic_book(path) with pytest.raises(ValueError): - extract(path, mode="invalid") # type: ignore[arg-type] + extract(path, mode=cast(ExtractionMode, "invalid")) out = tmp_path / "out.json" with pytest.raises(ValueError): - process_excel(path, out, mode="invalid") # type: ignore[arg-type] + process_excel(path, out, mode=cast(ExtractionMode, "invalid")) def test_CLIのmode引数バリデーション(tmp_path: Path) -> None: diff --git a/tests/core/test_pipeline.py b/tests/core/test_pipeline.py index 9b4a917..cbecd84 100644 --- a/tests/core/test_pipeline.py +++ b/tests/core/test_pipeline.py @@ -6,17 +6,36 @@ from exstruct.core.backends.com_backend import ComBackend from exstruct.core.backends.openpyxl_backend import OpenpyxlBackend -from exstruct.core.cells import MergedCellRange +from exstruct.core.cells import ( + MergedCellRange, + SheetColorsMap, + SheetFormulasMap, + WorkbookColorsMap, + WorkbookFormulasMap, +) from exstruct.core.pipeline import ( ExtractionArtifacts, ExtractionInputs, + PipelinePlan, + _col_in_intervals, _filter_rows_excluding_merged_values, + _merge_intervals, + _resolve_sheet_colors_map, + _resolve_sheet_formulas_map, build_cells_tables_workbook, build_com_pipeline, build_pre_com_pipeline, 
resolve_extraction_inputs, + run_com_pipeline, + run_extraction_pipeline, + step_extract_auto_page_breaks_com, + step_extract_charts_com, + step_extract_colors_map_com, + step_extract_colors_map_openpyxl, step_extract_formulas_map_com, step_extract_formulas_map_openpyxl, + step_extract_print_areas_com, + step_extract_shapes_com, ) from exstruct.models import CellRow, PrintArea @@ -198,6 +217,51 @@ def test_resolve_extraction_inputs_forces_merged_cells_when_excluding_values( assert inputs.include_merged_cells is True +def test_resolve_extraction_inputs_warns_on_xls_formulas( + tmp_path: Path, monkeypatch: MonkeyPatch +) -> None: + calls: list[str] = [] + + def _warn_once(key: str, message: str) -> None: + calls.append(key) + _ = message + + monkeypatch.setattr("exstruct.core.pipeline.warn_once", _warn_once) + + inputs = resolve_extraction_inputs( + tmp_path / "book.xls", + mode="standard", + include_cell_links=None, + include_print_areas=None, + include_auto_page_breaks=False, + include_colors_map=None, + include_default_background=False, + ignore_colors=None, + include_formulas_map=True, + include_merged_cells=None, + include_merged_values_in_rows=True, + ) + assert inputs.use_com_for_formulas is True + assert calls + + +def test_resolve_extraction_inputs_sets_ignore_colors(tmp_path: Path) -> None: + inputs = resolve_extraction_inputs( + tmp_path / "book.xlsx", + mode="verbose", + include_cell_links=None, + include_print_areas=None, + include_auto_page_breaks=False, + include_colors_map=True, + include_default_background=False, + ignore_colors=None, + include_formulas_map=None, + include_merged_cells=None, + include_merged_values_in_rows=True, + ) + assert inputs.ignore_colors == set() + + def test_build_cells_tables_workbook_uses_print_areas( monkeypatch: MonkeyPatch, tmp_path: Path ) -> None: @@ -286,6 +350,187 @@ def test_filter_rows_excluding_merged_values_drops_empty_rows() -> None: assert filtered == [] +def 
test_filter_rows_excluding_merged_values_returns_when_empty() -> None: + assert _filter_rows_excluding_merged_values([], []) == [] + + +def test_filter_rows_excluding_merged_values_keeps_rows_without_intervals() -> None: + rows = [CellRow(r=1, c={"0": "A"})] + merged_cells = [MergedCellRange(r1=2, c1=0, r2=2, c2=1, v="B")] + filtered = _filter_rows_excluding_merged_values(rows, merged_cells) + assert filtered == rows + + +def test_filter_rows_excluding_merged_values_drops_links_when_filtered() -> None: + rows = [CellRow(r=1, c={"0": "A", "1": "B"}, links={"0": "L0"})] + merged_cells = [MergedCellRange(r1=1, c1=0, r2=1, c2=0, v="A")] + filtered = _filter_rows_excluding_merged_values(rows, merged_cells) + assert filtered[0].links is None + + +def test_resolve_sheet_colors_map_empty() -> None: + assert _resolve_sheet_colors_map(None, "Sheet1") == {} + + +def test_resolve_sheet_formulas_map_empty() -> None: + assert _resolve_sheet_formulas_map(None, "Sheet1") == {} + + +def test_merge_intervals_merges_adjacent() -> None: + assert _merge_intervals([(1, 2), (3, 4)]) == [(1, 4)] + + +def test_col_in_intervals_fast_false() -> None: + assert _col_in_intervals(1, [(3, 5)]) is False + + +def test_step_extract_colors_map_openpyxl_sets_data( + tmp_path: Path, monkeypatch: MonkeyPatch +) -> None: + def _fake( + _: OpenpyxlBackend, + *, + include_default_background: bool, + ignore_colors: set[str] | None, + ) -> object: + _ = include_default_background + _ = ignore_colors + return WorkbookColorsMap(sheets={}) + + monkeypatch.setattr(OpenpyxlBackend, "extract_colors_map", _fake) + inputs = ExtractionInputs( + file_path=tmp_path / "book.xlsx", + mode="standard", + include_cell_links=False, + include_print_areas=False, + include_auto_page_breaks=False, + include_colors_map=True, + include_default_background=False, + ignore_colors=None, + include_formulas_map=False, + use_com_for_formulas=False, + include_merged_cells=False, + include_merged_values_in_rows=True, + ) + artifacts = 
ExtractionArtifacts() + step_extract_colors_map_openpyxl(inputs, artifacts) + assert artifacts.colors_map_data is not None + + +def test_step_extract_colors_map_com_falls_back( + tmp_path: Path, monkeypatch: MonkeyPatch +) -> None: + def _fake_com( + _: ComBackend, + *, + include_default_background: bool, + ignore_colors: set[str] | None, + ) -> None: + _ = include_default_background + _ = ignore_colors + return None + + def _fake_openpyxl( + _: OpenpyxlBackend, + *, + include_default_background: bool, + ignore_colors: set[str] | None, + ) -> object: + _ = include_default_background + _ = ignore_colors + return WorkbookColorsMap(sheets={}) + + monkeypatch.setattr(ComBackend, "extract_colors_map", _fake_com) + monkeypatch.setattr(OpenpyxlBackend, "extract_colors_map", _fake_openpyxl) + inputs = ExtractionInputs( + file_path=tmp_path / "book.xlsx", + mode="standard", + include_cell_links=False, + include_print_areas=False, + include_auto_page_breaks=False, + include_colors_map=True, + include_default_background=False, + ignore_colors=None, + include_formulas_map=False, + use_com_for_formulas=False, + include_merged_cells=False, + include_merged_values_in_rows=True, + ) + artifacts = ExtractionArtifacts() + step_extract_colors_map_com(inputs, artifacts, object()) + assert artifacts.colors_map_data is not None + + +def test_step_extract_auto_page_breaks_com_sets_data( + tmp_path: Path, monkeypatch: MonkeyPatch +) -> None: + def _fake(_: ComBackend) -> dict[str, list[PrintArea]]: + return {"Sheet1": [PrintArea(r1=1, c1=0, r2=1, c2=0)]} + + monkeypatch.setattr(ComBackend, "extract_auto_page_breaks", _fake) + inputs = ExtractionInputs( + file_path=tmp_path / "book.xlsx", + mode="standard", + include_cell_links=False, + include_print_areas=False, + include_auto_page_breaks=True, + include_colors_map=False, + include_default_background=False, + ignore_colors=None, + include_formulas_map=False, + use_com_for_formulas=False, + include_merged_cells=False, + 
include_merged_values_in_rows=True, + ) + artifacts = ExtractionArtifacts() + step_extract_auto_page_breaks_com(inputs, artifacts, object()) + assert artifacts.auto_page_break_data + + +def test_build_cells_tables_workbook_fetches_missing_maps( + tmp_path: Path, monkeypatch: MonkeyPatch +) -> None: + colors_map = WorkbookColorsMap(sheets={}) + formulas_map = WorkbookFormulasMap(sheets={}) + + def _fake_colors( + _: OpenpyxlBackend, + *, + include_default_background: bool, + ignore_colors: set[str] | None, + ) -> object: + _ = include_default_background + _ = ignore_colors + return colors_map + + def _fake_formulas(_: OpenpyxlBackend) -> object: + return formulas_map + + monkeypatch.setattr(OpenpyxlBackend, "extract_colors_map", _fake_colors) + monkeypatch.setattr(OpenpyxlBackend, "extract_formulas_map", _fake_formulas) + + inputs = ExtractionInputs( + file_path=tmp_path / "book.xlsx", + mode="standard", + include_cell_links=False, + include_print_areas=False, + include_auto_page_breaks=False, + include_colors_map=True, + include_default_background=False, + ignore_colors=None, + include_formulas_map=True, + use_com_for_formulas=False, + include_merged_cells=False, + include_merged_values_in_rows=True, + ) + artifacts = ExtractionArtifacts( + cell_data={"Sheet1": [CellRow(r=1, c={"0": "A"})]}, + merged_cell_data={"Sheet1": []}, + ) + wb = build_cells_tables_workbook(inputs=inputs, artifacts=artifacts, reason="test") + assert "Sheet1" in wb.sheets + + def test_step_extract_formulas_map_openpyxl_skips_on_failure( tmp_path: Path, monkeypatch: MonkeyPatch, caplog: "pytest.LogCaptureFixture" ) -> None: @@ -344,3 +589,295 @@ def _raise(_: ComBackend) -> object: assert artifacts.formulas_map_data is None assert "Failed to extract formulas_map via COM" in caplog.text + + +def test_filter_rows_excluding_merged_values_returns_rows_when_intervals_empty() -> ( + None +): + rows = [CellRow(r=1, c={"0": "A"})] + merged_cells = [MergedCellRange(r1=2, c1=0, r2=1, c2=1, v="A")] + 
assert _filter_rows_excluding_merged_values(rows, merged_cells) == rows + + +def test_resolve_sheet_colors_map_missing_sheet() -> None: + colors_map = WorkbookColorsMap( + sheets={"Other": SheetColorsMap(sheet_name="Other", colors_map={})} + ) + assert _resolve_sheet_colors_map(colors_map, "Sheet1") == {} + + +def test_resolve_sheet_formulas_map_missing_sheet() -> None: + formulas_map = WorkbookFormulasMap( + sheets={"Other": SheetFormulasMap(sheet_name="Other", formulas_map={})} + ) + assert _resolve_sheet_formulas_map(formulas_map, "Sheet1") == {} + + +def test_merge_intervals_empty() -> None: + assert _merge_intervals([]) == [] + + +def test_merge_intervals_keeps_non_overlapping() -> None: + assert _merge_intervals([(1, 2), (5, 6)]) == [(1, 2), (5, 6)] + + +def test_step_extract_shapes_com_sets_data( + tmp_path: Path, monkeypatch: MonkeyPatch +) -> None: + shapes_data = {"Sheet1": [object()]} + + def _fake(_: object, *, mode: str) -> dict[str, list[object]]: + _ = mode + return shapes_data + + monkeypatch.setattr("exstruct.core.pipeline.get_shapes_with_position", _fake) + inputs = ExtractionInputs( + file_path=tmp_path / "book.xlsx", + mode="standard", + include_cell_links=False, + include_print_areas=False, + include_auto_page_breaks=False, + include_colors_map=False, + include_default_background=False, + ignore_colors=None, + include_formulas_map=False, + use_com_for_formulas=False, + include_merged_cells=False, + include_merged_values_in_rows=True, + ) + artifacts = ExtractionArtifacts() + step_extract_shapes_com(inputs, artifacts, object()) + assert artifacts.shape_data == shapes_data + + +def test_step_extract_charts_com_sets_data( + tmp_path: Path, monkeypatch: MonkeyPatch +) -> None: + charts = [object()] + + def _fake(_: object, *, mode: str) -> list[object]: + _ = mode + return charts + + class _Sheet: + def __init__(self, name: str) -> None: + self.name = name + + class _Workbook: + sheets = [_Sheet("Sheet1")] + + 
monkeypatch.setattr("exstruct.core.pipeline.get_charts", _fake) + inputs = ExtractionInputs( + file_path=tmp_path / "book.xlsx", + mode="standard", + include_cell_links=False, + include_print_areas=False, + include_auto_page_breaks=False, + include_colors_map=False, + include_default_background=False, + ignore_colors=None, + include_formulas_map=False, + use_com_for_formulas=False, + include_merged_cells=False, + include_merged_values_in_rows=True, + ) + artifacts = ExtractionArtifacts() + step_extract_charts_com(inputs, artifacts, _Workbook()) + assert artifacts.chart_data == {"Sheet1": charts} + + +def test_step_extract_print_areas_com_skips_when_present( + tmp_path: Path, monkeypatch: MonkeyPatch +) -> None: + def _raise(_: ComBackend) -> object: + raise RuntimeError("should not be called") + + monkeypatch.setattr(ComBackend, "extract_print_areas", _raise) + inputs = ExtractionInputs( + file_path=tmp_path / "book.xlsx", + mode="standard", + include_cell_links=False, + include_print_areas=True, + include_auto_page_breaks=False, + include_colors_map=False, + include_default_background=False, + ignore_colors=None, + include_formulas_map=False, + use_com_for_formulas=False, + include_merged_cells=False, + include_merged_values_in_rows=True, + ) + artifacts = ExtractionArtifacts( + print_area_data={"Sheet1": [PrintArea(r1=1, c1=0, r2=1, c2=0)]} + ) + step_extract_print_areas_com(inputs, artifacts, object()) + + +def test_step_extract_print_areas_com_sets_data( + tmp_path: Path, monkeypatch: MonkeyPatch +) -> None: + def _fake(_: ComBackend) -> dict[str, list[PrintArea]]: + return {"Sheet1": [PrintArea(r1=1, c1=0, r2=1, c2=0)]} + + monkeypatch.setattr(ComBackend, "extract_print_areas", _fake) + inputs = ExtractionInputs( + file_path=tmp_path / "book.xlsx", + mode="standard", + include_cell_links=False, + include_print_areas=True, + include_auto_page_breaks=False, + include_colors_map=False, + include_default_background=False, + ignore_colors=None, + 
include_formulas_map=False, + use_com_for_formulas=False, + include_merged_cells=False, + include_merged_values_in_rows=True, + ) + artifacts = ExtractionArtifacts() + step_extract_print_areas_com(inputs, artifacts, object()) + assert artifacts.print_area_data + + +def test_step_extract_colors_map_com_sets_data( + tmp_path: Path, monkeypatch: MonkeyPatch +) -> None: + colors_map = WorkbookColorsMap(sheets={}) + + def _fake_com( + _: ComBackend, + *, + include_default_background: bool, + ignore_colors: set[str] | None, + ) -> object: + _ = include_default_background + _ = ignore_colors + return colors_map + + def _raise( + _: OpenpyxlBackend, + *, + include_default_background: bool, + ignore_colors: set[str] | None, + ) -> object: + _ = include_default_background + _ = ignore_colors + raise RuntimeError("should not be called") + + monkeypatch.setattr(ComBackend, "extract_colors_map", _fake_com) + monkeypatch.setattr(OpenpyxlBackend, "extract_colors_map", _raise) + inputs = ExtractionInputs( + file_path=tmp_path / "book.xlsx", + mode="standard", + include_cell_links=False, + include_print_areas=False, + include_auto_page_breaks=False, + include_colors_map=True, + include_default_background=False, + ignore_colors=None, + include_formulas_map=False, + use_com_for_formulas=False, + include_merged_cells=False, + include_merged_values_in_rows=True, + ) + artifacts = ExtractionArtifacts() + step_extract_colors_map_com(inputs, artifacts, object()) + assert artifacts.colors_map_data is colors_map + + +def test_run_com_pipeline_executes_steps(tmp_path: Path) -> None: + calls: list[str] = [] + + def _step(_: ExtractionInputs, artifacts: ExtractionArtifacts, __: object) -> None: + calls.append("called") + artifacts.shape_data = {"Sheet1": [object()]} + + inputs = ExtractionInputs( + file_path=tmp_path / "book.xlsx", + mode="standard", + include_cell_links=False, + include_print_areas=False, + include_auto_page_breaks=False, + include_colors_map=False, + 
include_default_background=False, + ignore_colors=None, + include_formulas_map=False, + use_com_for_formulas=False, + include_merged_cells=False, + include_merged_values_in_rows=True, + ) + artifacts = ExtractionArtifacts() + run_com_pipeline([_step], inputs, artifacts, object()) + assert calls == ["called"] + assert artifacts.shape_data + + +def test_run_extraction_pipeline_com_success( + tmp_path: Path, monkeypatch: MonkeyPatch +) -> None: + class _Sheet: + def __init__(self, name: str) -> None: + self.name = name + + class _Sheets: + def __init__(self) -> None: + self._sheets = {"Sheet1": _Sheet("Sheet1")} + + def __getitem__(self, name: str) -> _Sheet: + return self._sheets[name] + + class _Workbook: + sheets = _Sheets() + + def _pre_step(_: ExtractionInputs, artifacts: ExtractionArtifacts) -> None: + artifacts.cell_data = {"Sheet1": [CellRow(r=1, c={"0": "A"})]} + artifacts.merged_cell_data = {"Sheet1": []} + + def _fake_plan(_: ExtractionInputs) -> PipelinePlan: + return PipelinePlan(pre_com_steps=[_pre_step], com_steps=[], use_com=True) + + def _fake_detect_tables(_: object) -> list[str]: + return [] + + def _fake_workbook(_: Path) -> object: + class _Context: + def __enter__(self) -> _Workbook: + return _Workbook() + + def __exit__( + self, + exc_type: type[BaseException] | None, + exc: BaseException | None, + tb: object | None, + ) -> bool | None: + _ = exc_type + _ = exc + _ = tb + return None + + return _Context() + + monkeypatch.delenv("SKIP_COM_TESTS", raising=False) + monkeypatch.setattr("exstruct.core.pipeline.build_pipeline_plan", _fake_plan) + monkeypatch.setattr("exstruct.core.pipeline.detect_tables", _fake_detect_tables) + monkeypatch.setattr("exstruct.core.pipeline.xlwings_workbook", _fake_workbook) + + inputs = ExtractionInputs( + file_path=tmp_path / "book.xlsx", + mode="standard", + include_cell_links=False, + include_print_areas=False, + include_auto_page_breaks=False, + include_colors_map=False, + include_default_background=False, + 
ignore_colors=None, + include_formulas_map=False, + use_com_for_formulas=False, + include_merged_cells=False, + include_merged_values_in_rows=True, + ) + + result = run_extraction_pipeline(inputs) + assert result.state.com_attempted is True + assert result.state.com_succeeded is True + assert "Sheet1" in result.workbook.sheets diff --git a/tests/render/test_render_init.py b/tests/render/test_render_init.py index eb2708d..b8aba44 100644 --- a/tests/render/test_render_init.py +++ b/tests/render/test_render_init.py @@ -543,6 +543,199 @@ def test_sanitize_sheet_filename() -> None: assert render._sanitize_sheet_filename(" ") == "sheet" +def test_split_csv_respecting_quotes() -> None: + raw = "'Sheet 1'!A1:B2,'Sheet,2'!C3:D4,'O''Brien'!E1:F2" + parts = render._split_csv_respecting_quotes(raw) + assert parts == ["'Sheet 1'!A1:B2", "'Sheet,2'!C3:D4", "'O''Brien'!E1:F2"] + + +def test_extract_print_areas_with_page_setup() -> None: + class _PageSetup: + PrintArea = "'Sheet 1'!A1:B2,'Sheet 1'!C3:D4" + + class _SheetApi: + PageSetup = _PageSetup() + + areas = render._extract_print_areas(_SheetApi()) + assert areas == ["'Sheet 1'!A1:B2", "'Sheet 1'!C3:D4"] + + +def test_extract_print_areas_empty_print_area() -> None: + class _PageSetup: + PrintArea = "" + + class _SheetApi: + PageSetup = _PageSetup() + + assert render._extract_print_areas(_SheetApi()) == [] + + +def test_extract_print_areas_handles_exception() -> None: + class _PageSetup: + @property + def PrintArea(self) -> str: + raise RuntimeError("boom") + + class _SheetApi: + PageSetup = _PageSetup() + + assert render._extract_print_areas(_SheetApi()) == [] + + +def test_iter_sheet_apis_prefers_worksheets_collection() -> None: + class _WsApi: + def __init__(self, name: str) -> None: + self.Name = name + + class _Worksheets: + def __init__(self) -> None: + self.Count = 2 + + def Item(self, index: int) -> _WsApi: + return _WsApi(f"Sheet{index}") + + class _Api: + Worksheets = _Worksheets() + + class _Wb: + api = _Api() + 
sheets: list[Any] = [] + + result = render._iter_sheet_apis(_Wb()) + assert result[0][1] == "Sheet1" + assert result[1][1] == "Sheet2" + + +def test_export_pdf_propagates_render_error( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + def _raise() -> xw.App: + raise RenderError("boom") + + monkeypatch.setattr(render, "_require_excel_app", _raise) + with pytest.raises(RenderError, match="boom"): + render.export_pdf(tmp_path / "in.xlsx", tmp_path / "out.pdf") + + +def test_require_pdfium_success(monkeypatch: pytest.MonkeyPatch) -> None: + fake_pdfium = ModuleType("pypdfium2") + sys.modules["pypdfium2"] = fake_pdfium + try: + assert render._require_pdfium() is fake_pdfium + finally: + sys.modules.pop("pypdfium2", None) + + +def test_build_sheet_export_plan_handles_multiple_areas( + monkeypatch: pytest.MonkeyPatch, +) -> None: + class _SheetApi: + pass + + def _fake_iter(_: xw.Book) -> list[tuple[int, str, _SheetApi]]: + return [(0, "Sheet1", _SheetApi())] + + def _fake_extract(_: _SheetApi) -> list[str]: + return ["A1:B2", "C3:D4"] + + monkeypatch.setattr(render, "_iter_sheet_apis", _fake_iter) + monkeypatch.setattr(render, "_extract_print_areas", _fake_extract) + + plan = render._build_sheet_export_plan(cast(xw.Book, object())) + assert [item[0] for item in plan] == ["Sheet1", "Sheet1"] + assert [item[2] for item in plan] == ["A1:B2", "C3:D4"] + + +def test_page_index_from_suffix_default() -> None: + assert render._page_index_from_suffix("sheet") == 0 + + +def test_page_index_from_suffix_non_digit() -> None: + assert render._page_index_from_suffix("sheet_pxx") == 0 + + +def test_export_sheet_pdf_skips_invalid_print_area(tmp_path: Path) -> None: + class _BadPageSetup: + @property + def PrintArea(self) -> str: + return "A1:B2" + + @PrintArea.setter + def PrintArea(self, _value: object) -> None: + raise RuntimeError("bad") + + class _SheetApi: + PageSetup = _BadPageSetup() + + def ExportAsFixedFormat( + self, _file_format: int, _output_path: str, *args: 
object, **kwargs: object + ) -> None: + _ = args + _ = kwargs + + render._export_sheet_pdf( + _SheetApi(), + tmp_path / "out.pdf", + ignore_print_areas=False, + print_area="A1:B2", + ) + + +def test_render_sheet_images_requires_pdfium(tmp_path: Path) -> None: + with pytest.raises(RenderError, match="pypdfium2 is required"): + render._render_sheet_images( + None, + tmp_path / "sheet.pdf", + tmp_path, + 0, + "Sheet1", + 144, + False, + ) + + +def test_export_sheet_images_with_app_retries_on_empty( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + calls: list[int] = [] + + def _fake_render( + _pdfium: ModuleType | None, + _pdf_path: Path, + output_dir: Path, + sheet_index: int, + safe_name: str, + _dpi: int, + _use_subprocess: bool, + ) -> list[Path]: + calls.append(1) + if len(calls) == 1: + return [] + return [output_dir / f"{sheet_index + 1:02d}_{safe_name}.png"] + + monkeypatch.setattr(render, "_render_sheet_images", _fake_render) + monkeypatch.setattr( + render, "_require_excel_app", lambda: FakeApp(["Sheet1"], False) + ) + monkeypatch.setattr(render, "_export_sheet_pdf", lambda *a, **k: None) + monkeypatch.setattr( + render, + "_build_sheet_export_plan", + lambda _wb: [("Sheet1", cast(render._SheetApiProtocol, object()), None)], + ) + + result = render._export_sheet_images_with_app( + tmp_path / "in.xlsx", + tmp_path / "out", + tmp_path / "tmp", + 144, + False, + None, + ) + assert len(calls) == 2 + assert result + + def test_page_index_from_suffix_handles_multi_digits() -> None: assert render._page_index_from_suffix("sheet_01") == 0 assert render._page_index_from_suffix("sheet_01_p01") == 0 @@ -552,7 +745,7 @@ def test_page_index_from_suffix_handles_multi_digits() -> None: def test_export_sheet_pdf_does_not_swallow_export_errors(tmp_path: Path) -> None: - class _FlakyPageSetup(render._PageSetupProtocol): + class _FlakyPageSetup: def __init__(self) -> None: self._print_area: object = "A1" self._set_calls = 0 @@ -569,7 +762,9 @@ def PrintArea(self, 
value: object) -> None: self._set_calls += 1 class _ExplodingSheetApi: - PageSetup: render._PageSetupProtocol = _FlakyPageSetup() + PageSetup: render._PageSetupProtocol = cast( + render._PageSetupProtocol, _FlakyPageSetup() + ) def ExportAsFixedFormat( self, file_format: int, output_path: str, *args: object, **kwargs: object From 30db3d8221f79440b6e1c85082d15aef8201a7d8 Mon Sep 17 00:00:00 2001 From: harumiWeb Date: Fri, 23 Jan 2026 09:15:18 +0900 Subject: [PATCH 07/12] =?UTF-8?q?Git=E3=82=B3=E3=83=9E=E3=83=B3=E3=83=89?= =?UTF-8?q?=E3=81=AE=E5=AE=9F=E8=A1=8C=E6=96=B9=E6=B3=95=E3=82=92=E6=94=B9?= =?UTF-8?q?=E5=96=84=E3=81=97=E3=80=81=E3=82=A8=E3=83=A9=E3=83=BC=E3=83=8F?= =?UTF-8?q?=E3=83=B3=E3=83=89=E3=83=AA=E3=83=B3=E3=82=B0=E3=82=92=E8=BF=BD?= =?UTF-8?q?=E5=8A=A0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- scripts/codacy_issues.py | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/scripts/codacy_issues.py b/scripts/codacy_issues.py index 06db656..368755b 100644 --- a/scripts/codacy_issues.py +++ b/scripts/codacy_issues.py @@ -7,7 +7,7 @@ import json import os import re -import subprocess +import subprocess # nosec B404 - used for fixed git commands only import sys from typing import Any, cast import urllib.parse @@ -108,20 +108,28 @@ def build_pr_issues_url( ) -def run_git(cmd: list[str]) -> str | None: - try: - out = subprocess.check_output(cmd, stderr=subprocess.DEVNULL) - return out.decode("utf-8", errors="replace").strip() - except Exception: - return None - - def get_git_origin_url() -> str | None: # git repo check - ok = run_git(["git", "rev-parse", "--is-inside-work-tree"]) - if not ok: + try: + result = subprocess.run( + ["git", "rev-parse", "--is-inside-work-tree"], + capture_output=True, + text=True, + check=False, + ) # nosec B603 - fixed git command without user input + if result.returncode != 0 or not result.stdout.strip(): + return None + result = 
subprocess.run( + ["git", "remote", "get-url", "origin"], + capture_output=True, + text=True, + check=False, + ) # nosec B603 - fixed git command without user input + if result.returncode != 0: + return None + return result.stdout.strip() + except Exception: return None - return run_git(["git", "remote", "get-url", "origin"]) @dataclass @@ -180,7 +188,7 @@ def fetch_json( ) try: - with urllib.request.urlopen(req, timeout=60) as res: + with urllib.request.urlopen(req, timeout=60) as res: # nosec B310 - validated https origin raw = res.read().decode("utf-8", errors="replace") status = getattr(res, "status", 0) or 0 if status < 200 or status >= 300: From 11c7695191b93d1362c138c003077113ac45b71a Mon Sep 17 00:00:00 2001 From: harumiWeb Date: Fri, 23 Jan 2026 10:25:46 +0900 Subject: [PATCH 08/12] =?UTF-8?q?=E6=95=B0=E5=BC=8F=E5=8F=96=E5=BE=97?= =?UTF-8?q?=E6=A9=9F=E8=83=BD=E3=81=AE=E6=94=B9=E5=96=84=E3=81=A8=E3=83=86?= =?UTF-8?q?=E3=82=B9=E3=83=88=E3=81=AE=E8=BF=BD=E5=8A=A0=E3=80=81Codacy?= =?UTF-8?q?=E3=81=8A=E3=82=88=E3=81=B3Codecov=E3=81=AE=E8=AD=A6=E5=91=8A?= =?UTF-8?q?=E3=81=AB=E5=AF=BE=E5=BF=9C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/agents/TASKS.md | 12 + scripts/codacy_issues.py | 230 +++++++++++++++----- src/exstruct/render/__init__.py | 4 + tests/backends/test_backends.py | 6 +- tests/backends/test_print_areas_openpyxl.py | 11 +- tests/core/test_pipeline.py | 24 +- tests/models/test_models_export.py | 2 +- tests/render/test_render_init.py | 33 ++- tests/utils.py | 13 +- 9 files changed, 266 insertions(+), 69 deletions(-) diff --git a/docs/agents/TASKS.md b/docs/agents/TASKS.md index 1c1a3ae..cf7660f 100644 --- a/docs/agents/TASKS.md +++ b/docs/agents/TASKS.md @@ -25,3 +25,15 @@ - [x] Codecov パッチカバレッジ低下(60.53%)の指摘に対応し、対象ファイルの不足分テストを追加する(`src/exstruct/render/__init__.py`, `src/exstruct/core/cells.py`, `src/exstruct/core/backends/com_backend.py`, `src/exstruct/core/pipeline.py`, 
`src/exstruct/core/backends/openpyxl_backend.py`) - [x] Codecov の「Files with missing lines」で具体的な未カバー行を確認し、テスト観点を整理する - [x] Codacy 警告対応: `src/exstruct/render/__init__.py:274` の finally 内 return により例外が握りつぶされる可能性(`PyLintPython3_W0150`)を解消する + +## PR #44 CodeRabbit 再レビュー対応 + +- [ ] `scripts/codacy_issues.py`: トークン未設定時の `sys.exit(1)` をモジュールトップから排除し、`get_token()` または `main()` で検証する +- [ ] `scripts/codacy_issues.py`: `format_for_ai` の `sys.exit` を `ValueError` に置換し、呼び出し側でバリデーションする +- [ ] `scripts/codacy_issues.py`: `urlopen` の非2xxチェック(到達不能)を削除または `HTTPError` 側へ寄せる +- [ ] `scripts/codacy_issues.py`: `status` の固定値バリデーションを廃止する(固定なら直代入/必要なら CLI 引数化) +- [ ] `tests/backends/test_print_areas_openpyxl.py`: `PrintAreaData` 型に合わせる+関連テストに Google スタイル docstring を付与 +- [ ] `tests/core/test_pipeline.py`: 無効な `MergedCellRange` を有効な非重複レンジに修正する +- [ ] `tests/backends/test_backends.py`: `sheets` のクラス属性共有を避け、インスタンス属性に変更する +- [ ] `tests/render/test_render_init.py` / `tests/utils.py` / `tests/models/test_models_export.py`: docstring/コメントの指摘を反映する +- [ ] `src/exstruct/render/__init__.py`: Protocol クラスに Google スタイル docstring を追加する diff --git a/scripts/codacy_issues.py b/scripts/codacy_issues.py index 368755b..6881203 100644 --- a/scripts/codacy_issues.py +++ b/scripts/codacy_issues.py @@ -19,12 +19,21 @@ BASE = "https://api.codacy.com/api/v3" BASE_URL = urllib.parse.urlparse(BASE) BASE_PATH = BASE_URL.path.rstrip("/") # "/api/v3" -TOKEN = os.environ.get("CODACY_API_TOKEN") -if TOKEN is None: - print("CODACY_API_TOKEN is not set", file=sys.stderr) - sys.exit(1) -TOKEN_STR = TOKEN + +def get_token() -> str: + """Return the Codacy API token or raise if missing. + + Returns: + Codacy API token string from the environment. + + Raises: + ValueError: If CODACY_API_TOKEN is not set. 
+ """ + token = os.environ.get("CODACY_API_TOKEN") + if token is None: + raise ValueError("CODACY_API_TOKEN is not set") + return token # ================================ @@ -34,6 +43,14 @@ def get_level_priority(level: str | None) -> int | None: + """Convert a severity level name to a priority number. + + Args: + level: Severity level string. + + Returns: + Priority number or None if unknown. + """ if level == "Error": return 4 if level == "High": @@ -46,31 +63,62 @@ def get_level_priority(level: str | None) -> int | None: def normalize_provider(value: str) -> str | None: + """Normalize provider short code. + + Args: + value: Provider identifier. + + Returns: + Provider code if valid, otherwise None. + """ return value if value in ("gh", "gl", "bb") else None def assert_valid_segment(name: str, value: str, pattern: re.Pattern[str]) -> str: + """Validate an identifier segment against a regex. + + Args: + name: Segment name for error reporting. + value: Segment value. + pattern: Compiled regex pattern for allowed values. + + Returns: + The validated value. + + Raises: + ValueError: If the value is empty or invalid. + """ if (not value) or (pattern.match(value) is None): - print(f"Invalid {name}: {value}", file=sys.stderr) - sys.exit(1) + raise ValueError(f"Invalid {name}: {value}") return value def assert_valid_choice(name: str, value: str, choices: list[str]) -> str: + """Validate that a value is in a list of choices. + + Args: + name: Parameter name for error reporting. + value: Input value. + choices: Allowed values. + + Returns: + The validated value. + + Raises: + ValueError: If the value is not allowed. + """ if value not in choices: - print( - f"Invalid {name}: {value}. Valid values: {', '.join(choices)}", - file=sys.stderr, - ) - sys.exit(1) + raise ValueError(f"Invalid {name}: {value}. 
Valid values: {', '.join(choices)}") return value def encode_segment(value: str) -> str: + """URL-encode a path segment.""" return urllib.parse.quote(value, safe="") def build_codacy_url(pathname: str, query: dict[str, str] | None = None) -> str: + """Build a Codacy API URL from a path and query parameters.""" # Ensure we keep origin and base path url = f"{BASE_URL.scheme}://{BASE_URL.netloc}{BASE_PATH}{pathname}" if query: @@ -79,18 +127,29 @@ def build_codacy_url(pathname: str, query: dict[str, str] | None = None) -> str: def assert_codacy_url(url: str) -> str: + """Ensure the URL targets the Codacy API origin and analysis path. + + Args: + url: URL to validate. + + Returns: + The original URL when valid. + + Raises: + ValueError: If the URL is not within the expected origin/path. + """ # Basic safety: must be same origin and start with /api/v3/analysis/ parsed = urllib.parse.urlparse(url) expected_origin = f"{BASE_URL.scheme}://{BASE_URL.netloc}" origin = f"{parsed.scheme}://{parsed.netloc}" expected_prefix = f"{BASE_PATH}/analysis/" if origin != expected_origin or not parsed.path.startswith(expected_prefix): - print(f"Invalid URL: {url}", file=sys.stderr) - sys.exit(1) + raise ValueError(f"Invalid URL: {url}") return url def build_repo_issues_url(provider: str, org: str, repo: str, limit: int) -> str: + """Build a repository issues API URL.""" return build_codacy_url( f"/analysis/organizations/{encode_segment(provider)}/{encode_segment(org)}" f"/repositories/{encode_segment(repo)}/issues/search", @@ -101,6 +160,7 @@ def build_repo_issues_url(provider: str, org: str, repo: str, limit: int) -> str def build_pr_issues_url( provider: str, org: str, repo: str, pr: str, limit: int, status: str ) -> str: + """Build a pull request issues API URL.""" return build_codacy_url( f"/analysis/organizations/{encode_segment(provider)}/{encode_segment(org)}" f"/repositories/{encode_segment(repo)}/pull-requests/{encode_segment(pr)}/issues", @@ -109,6 +169,7 @@ def 
build_pr_issues_url( def get_git_origin_url() -> str | None: + """Return the git origin URL if available.""" # git repo check try: result = subprocess.run( @@ -128,18 +189,21 @@ def get_git_origin_url() -> str | None: if result.returncode != 0: return None return result.stdout.strip() - except Exception: + except (OSError, subprocess.SubprocessError): return None @dataclass class GitRemoteInfo: + """Parsed git remote information.""" + provider: str org: str repo: str def parse_git_remote(url: str) -> GitRemoteInfo | None: + """Parse a git remote URL into provider/org/repo info.""" # HTTPS m = re.match(r"^https?://([^/]+)/([^/]+)/([^/]+?)(?:\.git)?$", url) # SSH @@ -169,11 +233,21 @@ def is_same_or_subdomain(hostname: str, base_domain: str) -> bool: def fetch_json( url: str, method: str = "GET", body: dict[str, Any] | None = None ) -> dict[str, Any]: + """Fetch JSON from the Codacy API. + + Args: + url: Codacy API URL. + method: HTTP method. + body: Optional JSON body for non-GET requests. + + Returns: + Parsed JSON dictionary. 
+ """ safe_url = assert_codacy_url(url) headers = { "Accept": "application/json", - "api-token": TOKEN_STR, + "api-token": get_token(), } data: bytes | None = None @@ -190,9 +264,6 @@ def fetch_json( try: with urllib.request.urlopen(req, timeout=60) as res: # nosec B310 - validated https origin raw = res.read().decode("utf-8", errors="replace") - status = getattr(res, "status", 0) or 0 - if status < 200 or status >= 300: - raise RuntimeError(f"HTTP {status}: {raw}") try: parsed = json.loads(raw) except json.JSONDecodeError as exc: @@ -215,6 +286,7 @@ def fetch_json( # API # ================================ def fetch_repo_issues(provider: str, org: str, repo: str, limit: int) -> dict[str, Any]: + """Fetch issues for a repository.""" url = build_repo_issues_url(provider, org, repo, limit) return fetch_json(url, method="POST", body={}) @@ -222,6 +294,7 @@ def fetch_repo_issues(provider: str, org: str, repo: str, limit: int) -> dict[st def fetch_pr_issues( provider: str, org: str, repo: str, pr: str, limit: int, status: str = "all" ) -> dict[str, Any]: + """Fetch issues for a pull request.""" url = build_pr_issues_url(provider, org, repo, pr, limit, status) return fetch_json(url, method="GET") @@ -230,13 +303,23 @@ def fetch_pr_issues( # AI Output Formatter # ================================ def format_for_ai(raw_issues: list[dict[str, Any]], min_level: str) -> list[str]: + """Format raw Codacy issues for AI output. + + Args: + raw_issues: Issue dictionaries from Codacy API. + min_level: Minimum severity level to include. + + Returns: + Formatted issue strings. + + Raises: + ValueError: If min_level is invalid. + """ min_priority = get_level_priority(min_level) if min_priority is None: - print( - f"Invalid --min-level: {min_level}. Valid values: {', '.join(LEVELS)}", - file=sys.stderr, + raise ValueError( + f"Invalid min_level: {min_level}. 
Valid values: {', '.join(LEVELS)}" ) - sys.exit(1) out: list[str] = [] @@ -264,6 +347,7 @@ def format_for_ai(raw_issues: list[dict[str, Any]], min_level: str) -> list[str] # CLI # ================================ def parse_args(argv: list[str]) -> argparse.Namespace: + """Parse command-line arguments.""" p = argparse.ArgumentParser(add_help=False) p.add_argument("org", nargs="?", default=None) p.add_argument("repo", nargs="?", default=None) @@ -274,21 +358,68 @@ def parse_args(argv: list[str]) -> argparse.Namespace: return p.parse_args(argv) +def apply_git_defaults(args: argparse.Namespace) -> None: + """Populate missing org/repo/provider from git origin when possible.""" + if args.org and args.repo: + return + origin_url = get_git_origin_url() + if not origin_url: + return + parsed = parse_git_remote(origin_url) + if not parsed: + return + if args.provider is None: + args.provider = parsed.provider + if args.org is None: + args.org = parsed.org + if args.repo is None: + args.repo = parsed.repo + + +def resolve_segments(args: argparse.Namespace) -> tuple[str, str, str | None]: + """Validate and return org/repo/pr segments. + + Args: + args: Parsed CLI arguments. + + Returns: + Tuple of (org, repo, pr). 
+ """ + segment_pattern = re.compile(r"^[A-Za-z0-9_.-]+$") + org = assert_valid_segment("org", args.org, segment_pattern) + repo = assert_valid_segment("repo", args.repo, segment_pattern) + pr = args.pr + if pr is not None: + pr = assert_valid_segment("pr", pr, re.compile(r"^[0-9]+$")) + return org, repo, pr + + +def build_payload( + *, + pr: str | None, + org: str, + repo: str, + min_level: str, + issues: list[str], +) -> dict[str, object]: + """Build the output payload for JSON serialization.""" + return { + "scope": "pull_request" if pr else "repository", + "organization": org, + "repository": repo, + "pullRequest": pr if pr else None, + "minLevel": min_level, + "total": len(issues), + "issues": issues, + } + + def main() -> int: + """Run the Codacy issues fetcher.""" args = parse_args(sys.argv[1:]) # --- Git auto-detect --- - if not args.org or not args.repo: - origin_url = get_git_origin_url() - if origin_url: - parsed = parse_git_remote(origin_url) - if parsed: - if args.provider is None: - args.provider = parsed.provider - if args.org is None: - args.org = parsed.org - if args.repo is None: - args.repo = parsed.repo + apply_git_defaults(args) if args.provider is None: args.provider = "gh" @@ -306,14 +437,13 @@ def main() -> int: ) return 1 - segment_pattern = re.compile(r"^[A-Za-z0-9_.-]+$") - org = assert_valid_segment("org", args.org, segment_pattern) - repo = assert_valid_segment("repo", args.repo, segment_pattern) - pr = args.pr - if pr is not None: - pr = assert_valid_segment("pr", pr, re.compile(r"^[0-9]+$")) + try: + org, repo, pr = resolve_segments(args) + except ValueError as exc: + print(str(exc), file=sys.stderr) + return 1 - status = assert_valid_choice("status", "all", ["all", "open", "closed"]) + status = "all" limit = 100 result = ( @@ -325,17 +455,15 @@ def main() -> int: ) issues = result.get("data") or [] - formatted = format_for_ai(issues, args.min_level) + try: + formatted = format_for_ai(issues, args.min_level) + except ValueError as 
exc: + print(str(exc), file=sys.stderr) + return 1 - payload = { - "scope": "pull_request" if pr else "repository", - "organization": org, - "repository": repo, - "pullRequest": pr if pr else None, - "minLevel": args.min_level, - "total": len(formatted), - "issues": formatted, - } + payload = build_payload( + pr=pr, org=org, repo=repo, min_level=args.min_level, issues=formatted + ) sys.stdout.write(json.dumps(payload, ensure_ascii=False, indent=2) + "\n") return 0 diff --git a/src/exstruct/render/__init__.py b/src/exstruct/render/__init__.py index f004e2f..ad6cc8a 100644 --- a/src/exstruct/render/__init__.py +++ b/src/exstruct/render/__init__.py @@ -110,10 +110,14 @@ def _sanitize_sheet_filename(name: str) -> str: class _PageSetupProtocol(Protocol): + """Protocol for Excel PageSetup objects exposing PrintArea.""" + PrintArea: object class _SheetApiProtocol(Protocol): + """Protocol for Excel sheet COM APIs used by render helpers.""" + PageSetup: _PageSetupProtocol def ExportAsFixedFormat( # noqa: N802 diff --git a/tests/backends/test_backends.py b/tests/backends/test_backends.py index 8de6f1a..cbebc72 100644 --- a/tests/backends/test_backends.py +++ b/tests/backends/test_backends.py @@ -289,7 +289,8 @@ def __init__(self) -> None: class _DummyWorkbook: - sheets = [_Sheet()] + def __init__(self) -> None: + self.sheets = [_Sheet()] def test_com_backend_extract_auto_page_breaks_success() -> None: @@ -329,7 +330,8 @@ def __init__(self) -> None: class _RestoreErrorWorkbook: - sheets = [_RestoreErrorSheet()] + def __init__(self) -> None: + self.sheets = [_RestoreErrorSheet()] def test_com_backend_extract_auto_page_breaks_restore_error() -> None: diff --git a/tests/backends/test_print_areas_openpyxl.py b/tests/backends/test_print_areas_openpyxl.py index 38d110d..90b58b2 100644 --- a/tests/backends/test_print_areas_openpyxl.py +++ b/tests/backends/test_print_areas_openpyxl.py @@ -3,6 +3,7 @@ from openpyxl import Workbook from exstruct import extract +from 
exstruct.core.backends.base import PrintAreaData from exstruct.core.backends.openpyxl_backend import ( OpenpyxlBackend, _append_print_areas, @@ -53,6 +54,8 @@ def test_openpyxl_backend_multiple_print_areas(tmp_path: Path) -> None: def test_extract_print_areas_from_defined_names_filters_unknown_sheets() -> None: + """Ignore defined-name destinations for sheets that do not exist.""" + class _DefinedArea: destinations = [("Sheet1", "A1:B2"), ("Unknown", "C1:D2")] @@ -70,6 +73,8 @@ class _DummyWorkbook: def test_extract_print_areas_from_defined_names_without_defined_names() -> None: + """Return an empty mapping when defined_names is missing.""" + class _DummyWorkbook: defined_names = None @@ -77,6 +82,8 @@ class _DummyWorkbook: def test_extract_print_areas_from_sheet_props_skips_empty() -> None: + """Skip sheet print areas when the property is empty.""" + class _SheetEmpty: title = "Sheet1" _print_area = None @@ -93,11 +100,13 @@ class _DummyWorkbook: def test_parse_print_area_range_invalid() -> None: + """Return None for invalid range strings.""" assert _parse_print_area_range("INVALID") is None def test_append_print_areas_skips_invalid_ranges() -> None: - areas: dict[str, list[object]] = {} + """Append only valid print areas and skip invalid ranges.""" + areas: PrintAreaData = {} _append_print_areas(areas, "Sheet1", "A1:B2,INVALID") assert "Sheet1" in areas assert len(areas["Sheet1"]) == 1 diff --git a/tests/core/test_pipeline.py b/tests/core/test_pipeline.py index cbecd84..9596a76 100644 --- a/tests/core/test_pipeline.py +++ b/tests/core/test_pipeline.py @@ -37,7 +37,7 @@ step_extract_print_areas_com, step_extract_shapes_com, ) -from exstruct.models import CellRow, PrintArea +from exstruct.models import CellRow, PrintArea, Shape def test_build_pre_com_pipeline_respects_flags( @@ -388,11 +388,12 @@ def test_step_extract_colors_map_openpyxl_sets_data( tmp_path: Path, monkeypatch: MonkeyPatch ) -> None: def _fake( - _: OpenpyxlBackend, + _backend: OpenpyxlBackend, *, 
include_default_background: bool, ignore_colors: set[str] | None, ) -> object: + _ = _backend _ = include_default_background _ = ignore_colors return WorkbookColorsMap(sheets={}) @@ -421,21 +422,23 @@ def test_step_extract_colors_map_com_falls_back( tmp_path: Path, monkeypatch: MonkeyPatch ) -> None: def _fake_com( - _: ComBackend, + _backend: ComBackend, *, include_default_background: bool, ignore_colors: set[str] | None, ) -> None: + _ = _backend _ = include_default_background _ = ignore_colors return None def _fake_openpyxl( - _: OpenpyxlBackend, + _backend: OpenpyxlBackend, *, include_default_background: bool, ignore_colors: set[str] | None, ) -> object: + _ = _backend _ = include_default_background _ = ignore_colors return WorkbookColorsMap(sheets={}) @@ -494,11 +497,12 @@ def test_build_cells_tables_workbook_fetches_missing_maps( formulas_map = WorkbookFormulasMap(sheets={}) def _fake_colors( - _: OpenpyxlBackend, + _backend: OpenpyxlBackend, *, include_default_background: bool, ignore_colors: set[str] | None, ) -> object: + _ = _backend _ = include_default_background _ = ignore_colors return colors_map @@ -595,7 +599,7 @@ def test_filter_rows_excluding_merged_values_returns_rows_when_intervals_empty() None ): rows = [CellRow(r=1, c={"0": "A"})] - merged_cells = [MergedCellRange(r1=2, c1=0, r2=1, c2=1, v="A")] + merged_cells = [MergedCellRange(r1=3, c1=0, r2=4, c2=1, v="A")] assert _filter_rows_excluding_merged_values(rows, merged_cells) == rows @@ -745,21 +749,23 @@ def test_step_extract_colors_map_com_sets_data( colors_map = WorkbookColorsMap(sheets={}) def _fake_com( - _: ComBackend, + _backend: ComBackend, *, include_default_background: bool, ignore_colors: set[str] | None, ) -> object: + _ = _backend _ = include_default_background _ = ignore_colors return colors_map def _raise( - _: OpenpyxlBackend, + _backend: OpenpyxlBackend, *, include_default_background: bool, ignore_colors: set[str] | None, ) -> object: + _ = _backend _ = include_default_background 
_ = ignore_colors raise RuntimeError("should not be called") @@ -790,7 +796,7 @@ def test_run_com_pipeline_executes_steps(tmp_path: Path) -> None: def _step(_: ExtractionInputs, artifacts: ExtractionArtifacts, __: object) -> None: calls.append("called") - artifacts.shape_data = {"Sheet1": [object()]} + artifacts.shape_data = {"Sheet1": [Shape(id=1, text="", l=0, t=0)]} inputs = ExtractionInputs( file_path=tmp_path / "book.xlsx", diff --git a/tests/models/test_models_export.py b/tests/models/test_models_export.py index 57d72b7..ab41ad1 100644 --- a/tests/models/test_models_export.py +++ b/tests/models/test_models_export.py @@ -70,7 +70,7 @@ def test_save_unsupported_format_raises(tmp_path: Path) -> None: wb.save(bad) -# pytest.skipif is typed; no ignore needed +# cast to _SkipIf satisfies mypy strict mode for decorator typing @cast(_SkipIf, pytest.mark.skipif(not HAS_PYYAML, reason="pyyaml not installed")) def test_sheet_to_yaml_roundtrip() -> None: sheet = _sheet() diff --git a/tests/render/test_render_init.py b/tests/render/test_render_init.py index b8aba44..75731c6 100644 --- a/tests/render/test_render_init.py +++ b/tests/render/test_render_init.py @@ -544,33 +544,42 @@ def test_sanitize_sheet_filename() -> None: def test_split_csv_respecting_quotes() -> None: + """Split CSV-like PrintArea strings while honoring quotes.""" raw = "'Sheet 1'!A1:B2,'Sheet,2'!C3:D4,'O''Brien'!E1:F2" parts = render._split_csv_respecting_quotes(raw) assert parts == ["'Sheet 1'!A1:B2", "'Sheet,2'!C3:D4", "'O''Brien'!E1:F2"] def test_extract_print_areas_with_page_setup() -> None: + """Parse PrintArea from a PageSetup stub.""" + class _PageSetup: PrintArea = "'Sheet 1'!A1:B2,'Sheet 1'!C3:D4" class _SheetApi: PageSetup = _PageSetup() - areas = render._extract_print_areas(_SheetApi()) + areas = render._extract_print_areas(cast(render._SheetApiProtocol, _SheetApi())) assert areas == ["'Sheet 1'!A1:B2", "'Sheet 1'!C3:D4"] def test_extract_print_areas_empty_print_area() -> None: + """Return 
empty list when PrintArea is empty.""" + class _PageSetup: PrintArea = "" class _SheetApi: PageSetup = _PageSetup() - assert render._extract_print_areas(_SheetApi()) == [] + assert ( + render._extract_print_areas(cast(render._SheetApiProtocol, _SheetApi())) == [] + ) def test_extract_print_areas_handles_exception() -> None: + """Return empty list when PrintArea access raises.""" + class _PageSetup: @property def PrintArea(self) -> str: @@ -579,10 +588,14 @@ def PrintArea(self) -> str: class _SheetApi: PageSetup = _PageSetup() - assert render._extract_print_areas(_SheetApi()) == [] + assert ( + render._extract_print_areas(cast(render._SheetApiProtocol, _SheetApi())) == [] + ) def test_iter_sheet_apis_prefers_worksheets_collection() -> None: + """Prefer the Worksheets collection when iterating COM sheets.""" + class _WsApi: def __init__(self, name: str) -> None: self.Name = name @@ -618,6 +631,7 @@ def _raise() -> xw.App: def test_require_pdfium_success(monkeypatch: pytest.MonkeyPatch) -> None: + """_require_pdfium returns the imported module when available.""" fake_pdfium = ModuleType("pypdfium2") sys.modules["pypdfium2"] = fake_pdfium try: @@ -629,6 +643,8 @@ def test_require_pdfium_success(monkeypatch: pytest.MonkeyPatch) -> None: def test_build_sheet_export_plan_handles_multiple_areas( monkeypatch: pytest.MonkeyPatch, ) -> None: + """Expand multiple print areas into separate export plan rows.""" + class _SheetApi: pass @@ -647,14 +663,18 @@ def _fake_extract(_: _SheetApi) -> list[str]: def test_page_index_from_suffix_default() -> None: + """Default to zero when no suffix exists.""" assert render._page_index_from_suffix("sheet") == 0 def test_page_index_from_suffix_non_digit() -> None: + """Default to zero when suffix is not numeric.""" assert render._page_index_from_suffix("sheet_pxx") == 0 def test_export_sheet_pdf_skips_invalid_print_area(tmp_path: Path) -> None: + """Skip restoring PrintArea when setter fails.""" + class _BadPageSetup: @property def 
PrintArea(self) -> str: @@ -674,7 +694,7 @@ def ExportAsFixedFormat( _ = kwargs render._export_sheet_pdf( - _SheetApi(), + cast(render._SheetApiProtocol, _SheetApi()), tmp_path / "out.pdf", ignore_print_areas=False, print_area="A1:B2", @@ -682,6 +702,7 @@ def ExportAsFixedFormat( def test_render_sheet_images_requires_pdfium(tmp_path: Path) -> None: + """Raise RenderError when pdfium is missing.""" with pytest.raises(RenderError, match="pypdfium2 is required"): render._render_sheet_images( None, @@ -697,6 +718,7 @@ def test_render_sheet_images_requires_pdfium(tmp_path: Path) -> None: def test_export_sheet_images_with_app_retries_on_empty( tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: + """Retry export when rendering returns empty results.""" calls: list[int] = [] def _fake_render( @@ -737,6 +759,7 @@ def _fake_render( def test_page_index_from_suffix_handles_multi_digits() -> None: + """Support multi-digit page suffixes.""" assert render._page_index_from_suffix("sheet_01") == 0 assert render._page_index_from_suffix("sheet_01_p01") == 0 assert render._page_index_from_suffix("sheet_01_p10") == 9 @@ -745,6 +768,8 @@ def test_page_index_from_suffix_handles_multi_digits() -> None: def test_export_sheet_pdf_does_not_swallow_export_errors(tmp_path: Path) -> None: + """Propagate export errors even if restore fails.""" + class _FlakyPageSetup: def __init__(self) -> None: self._print_area: object = "A1" diff --git a/tests/utils.py b/tests/utils.py index da3e063..c85b1df 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -18,7 +18,18 @@ def parametrize( | None = None, scope: Literal["session", "package", "module", "class", "function"] | None = None, ) -> Callable[[Callable[P, R]], Callable[P, R]]: - """Type-safe wrapper around pytest.mark.parametrize.""" + """Type-safe wrapper around pytest.mark.parametrize. + + Args: + argnames: Parameter names for the parametrized test. + argvalues: Parameter values for each test case. 
+ indirect: Whether to treat parameters as fixtures. + ids: Optional case IDs or an ID factory. + scope: Optional fixture scope for parametrization. + + Returns: + Decorator preserving the wrapped callable signature. + """ return cast( Callable[[Callable[P, R]], Callable[P, R]], pytest.mark.parametrize( From c39c3d745b20ad582be5b56c89295a0d2ff34dee Mon Sep 17 00:00:00 2001 From: "coderabbitai[bot]" <136622811+coderabbitai[bot]@users.noreply.github.com> Date: Fri, 23 Jan 2026 01:43:38 +0000 Subject: [PATCH 09/12] =?UTF-8?q?=F0=9F=93=9D=20Add=20docstrings=20to=20`d?= =?UTF-8?q?ev/formula`?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Docstrings generation was requested by @harumiWeb. * https://github.com/harumiWeb/exstruct/pull/44#discussion_r2719270210 The following files were modified: * `scripts/codacy_issues.py` * `src/exstruct/__init__.py` * `src/exstruct/core/backends/base.py` * `src/exstruct/core/backends/com_backend.py` * `src/exstruct/core/backends/openpyxl_backend.py` * `src/exstruct/core/cells.py` * `src/exstruct/core/integrate.py` * `src/exstruct/core/pipeline.py` * `src/exstruct/core/workbook.py` * `src/exstruct/engine.py` * `src/exstruct/render/__init__.py` * `tests/backends/test_auto_page_breaks.py` * `tests/backends/test_backends.py` * `tests/backends/test_print_areas_openpyxl.py` * `tests/com/test_render_smoke.py` * `tests/core/test_cells_utils.py` * `tests/core/test_mode_output.py` * `tests/core/test_pipeline.py` * `tests/core/test_pipeline_fallbacks.py` * `tests/engine/test_engine.py` * `tests/models/test_models_export.py` * `tests/render/test_render_init.py` * `tests/utils.py` --- scripts/codacy_issues.py | 219 ++++++++++++++---- src/exstruct/__init__.py | 30 +-- src/exstruct/core/backends/base.py | 7 +- src/exstruct/core/backends/com_backend.py | 33 +-- .../core/backends/openpyxl_backend.py | 22 +- src/exstruct/core/cells.py | 132 ++++++----- src/exstruct/core/integrate.py | 43 ++-- 
src/exstruct/core/pipeline.py | 189 ++++++++------- src/exstruct/core/workbook.py | 19 +- src/exstruct/engine.py | 38 ++- src/exstruct/render/__init__.py | 177 ++++++++++++-- tests/backends/test_auto_page_breaks.py | 23 +- tests/backends/test_backends.py | 140 ++++++++++- tests/backends/test_print_areas_openpyxl.py | 14 +- tests/com/test_render_smoke.py | 7 +- tests/core/test_cells_utils.py | 17 +- tests/core/test_mode_output.py | 7 +- tests/core/test_pipeline.py | 189 ++++++++++++++- tests/core/test_pipeline_fallbacks.py | 7 +- tests/engine/test_engine.py | 15 +- tests/models/test_models_export.py | 8 +- tests/render/test_render_init.py | 115 ++++++++- tests/utils.py | 23 +- 23 files changed, 1148 insertions(+), 326 deletions(-) diff --git a/scripts/codacy_issues.py b/scripts/codacy_issues.py index 6881203..8816d62 100644 --- a/scripts/codacy_issues.py +++ b/scripts/codacy_issues.py @@ -63,13 +63,14 @@ def get_level_priority(level: str | None) -> int | None: def normalize_provider(value: str) -> str | None: - """Normalize provider short code. - - Args: - value: Provider identifier. - + """ + Normalize a provider identifier to a supported short code. + + Parameters: + value (str): Provider identifier to normalize (expected 'gh', 'gl', or 'bb'). + Returns: - Provider code if valid, otherwise None. + str | None: The provider code ('gh', 'gl', or 'bb') if valid, `None` otherwise. """ return value if value in ("gh", "gl", "bb") else None @@ -113,12 +114,26 @@ def assert_valid_choice(name: str, value: str, choices: list[str]) -> str: def encode_segment(value: str) -> str: - """URL-encode a path segment.""" + """ + URL-encode a URL path segment so it is safe for inclusion in a path. + + Returns: + encoded (str): The percent-encoded representation of the input string. 
+ """ return urllib.parse.quote(value, safe="") def build_codacy_url(pathname: str, query: dict[str, str] | None = None) -> str: - """Build a Codacy API URL from a path and query parameters.""" + """ + Constructs a full Codacy API URL using the configured base origin and base path. + + Parameters: + pathname (str): Pathname to append to the base path (should begin with a forward slash). + query (dict[str, str] | None): Optional mapping of query parameter names to values; values are URL-encoded. + + Returns: + url (str): The complete URL including query string if `query` is provided. + """ # Ensure we keep origin and base path url = f"{BASE_URL.scheme}://{BASE_URL.netloc}{BASE_PATH}{pathname}" if query: @@ -127,16 +142,17 @@ def build_codacy_url(pathname: str, query: dict[str, str] | None = None) -> str: def assert_codacy_url(url: str) -> str: - """Ensure the URL targets the Codacy API origin and analysis path. - - Args: - url: URL to validate. - + """ + Validate that `url` targets the configured Codacy API origin and begins with the `/analysis/` path. + + Parameters: + url (str): The full URL to validate. + Returns: - The original URL when valid. - + str: The original URL when it is confirmed to target the configured Codacy API origin and start with the `/analysis/` path. + Raises: - ValueError: If the URL is not within the expected origin/path. + ValueError: If the URL does not use the configured Codacy API origin or does not start with the expected `/analysis/` path. """ # Basic safety: must be same origin and start with /api/v3/analysis/ parsed = urllib.parse.urlparse(url) @@ -149,7 +165,18 @@ def assert_codacy_url(url: str) -> str: def build_repo_issues_url(provider: str, org: str, repo: str, limit: int) -> str: - """Build a repository issues API URL.""" + """ + Constructs the Codacy API URL to search repository issues for a given provider, organization, repository, and result limit. + + Parameters: + provider (str): Provider code (e.g., "gh", "gl", "bb"). 
+ org (str): Organization or owner name. + repo (str): Repository name. + limit (int): Maximum number of results to request. + + Returns: + str: A Codacy API URL for the repository issues search endpoint with the `limit` query parameter set. + """ return build_codacy_url( f"/analysis/organizations/{encode_segment(provider)}/{encode_segment(org)}" f"/repositories/{encode_segment(repo)}/issues/search", @@ -160,7 +187,20 @@ def build_repo_issues_url(provider: str, org: str, repo: str, limit: int) -> str def build_pr_issues_url( provider: str, org: str, repo: str, pr: str, limit: int, status: str ) -> str: - """Build a pull request issues API URL.""" + """ + Constructs the Codacy API URL for fetching issues of a pull request. + + Parameters: + provider (str): Provider code (e.g., "gh", "gl", "bb"). + org (str): Organization or owner name. + repo (str): Repository name. + pr (str): Pull request identifier. + limit (int): Maximum number of issues to request. + status (str): Issue status filter (e.g., "all", "open", "closed"). + + Returns: + str: The Codacy API URL for the pull-request issues endpoint including `status` and `limit` query parameters. + """ return build_codacy_url( f"/analysis/organizations/{encode_segment(provider)}/{encode_segment(org)}" f"/repositories/{encode_segment(repo)}/pull-requests/{encode_segment(pr)}/issues", @@ -169,7 +209,12 @@ def build_pr_issues_url( def get_git_origin_url() -> str | None: - """Return the git origin URL if available.""" + """ + Get the Git remote "origin" URL for the current repository, or None when it cannot be determined. + + Returns: + origin_url (str | None): The remote URL configured for 'origin' if the current directory is inside a Git work tree and the origin URL is available; `None` if not inside a Git repository, if the origin is not set, or on error. 
+ """ # git repo check try: result = subprocess.run( @@ -203,7 +248,18 @@ class GitRemoteInfo: def parse_git_remote(url: str) -> GitRemoteInfo | None: - """Parse a git remote URL into provider/org/repo info.""" + """ + Extract provider, organization, and repository from a Git remote URL. + + Accepts HTTPS (https://host/org/repo[.git]) and SSH (git@host:org/repo[.git]) remote formats. + Provider is one of: "gh" for GitHub, "gl" for GitLab, "bb" for Bitbucket, or "unknown" for other hosts. + + Parameters: + url (str): Git remote URL to parse. + + Returns: + GitRemoteInfo | None: Parsed GitRemoteInfo with fields `provider`, `org`, and `repo`, or `None` if the URL could not be parsed. + """ # HTTPS m = re.match(r"^https?://([^/]+)/([^/]+)/([^/]+?)(?:\.git)?$", url) # SSH @@ -216,6 +272,16 @@ def parse_git_remote(url: str) -> GitRemoteInfo | None: host, org, repo = m.group(1), m.group(2), m.group(3) def is_same_or_subdomain(hostname: str, base_domain: str) -> bool: + """ + Check whether a hostname is equal to a base domain or is a subdomain of that base domain. + + Parameters: + hostname (str): Hostname to test (e.g., "api.example.com"). + base_domain (str): Base domain to compare against (e.g., "example.com"). + + Returns: + `True` if `hostname` equals `base_domain` or ends with `.` followed by `base_domain`, `False` otherwise. + """ return hostname == base_domain or hostname.endswith("." + base_domain) if is_same_or_subdomain(host, "github.com"): @@ -233,15 +299,19 @@ def is_same_or_subdomain(hostname: str, base_domain: str) -> bool: def fetch_json( url: str, method: str = "GET", body: dict[str, Any] | None = None ) -> dict[str, Any]: - """Fetch JSON from the Codacy API. - - Args: - url: Codacy API URL. - method: HTTP method. - body: Optional JSON body for non-GET requests. - + """ + Fetch and return a JSON object from a validated Codacy API URL. + + Parameters: + url (str): Codacy API URL; must target the configured Codacy origin and start with the /analysis/ path. 
+ method (str): HTTP method to use (e.g., "GET", "POST"). + body (dict[str, Any] | None): Optional JSON body for non-GET requests. + Returns: - Parsed JSON dictionary. + dict[str, Any]: The parsed JSON response as a dictionary. + + Raises: + RuntimeError: On HTTP errors, network errors, invalid JSON, or when the JSON root value is not an object. """ safe_url = assert_codacy_url(url) @@ -286,7 +356,18 @@ def fetch_json( # API # ================================ def fetch_repo_issues(provider: str, org: str, repo: str, limit: int) -> dict[str, Any]: - """Fetch issues for a repository.""" + """ + Request Codacy for issues belonging to a repository. + + Parameters: + provider (str): Provider code ('gh', 'gl', 'bb') indicating GitHub, GitLab, or Bitbucket. + org (str): Organization or owner name. + repo (str): Repository name. + limit (int): Maximum number of issues to return. + + Returns: + dict[str, Any]: Parsed JSON response from the Codacy API containing issue data. + """ url = build_repo_issues_url(provider, org, repo, limit) return fetch_json(url, method="POST", body={}) @@ -294,7 +375,20 @@ def fetch_repo_issues(provider: str, org: str, repo: str, limit: int) -> dict[st def fetch_pr_issues( provider: str, org: str, repo: str, pr: str, limit: int, status: str = "all" ) -> dict[str, Any]: - """Fetch issues for a pull request.""" + """ + Retrieve Codacy issues for a specific pull request. + + Parameters: + provider (str): Provider code ("gh", "gl", "bb"). + org (str): Organization or user name. + repo (str): Repository name. + pr (str): Pull request number or identifier. + limit (int): Maximum number of issues to request. + status (str): Issue status filter (for example "all", "open", "closed"). + + Returns: + dict: Parsed JSON response from the Codacy API. 
+ """ url = build_pr_issues_url(provider, org, repo, pr, limit, status) return fetch_json(url, method="GET") @@ -303,17 +397,21 @@ def fetch_pr_issues( # AI Output Formatter # ================================ def format_for_ai(raw_issues: list[dict[str, Any]], min_level: str) -> list[str]: - """Format raw Codacy issues for AI output. - - Args: - raw_issues: Issue dictionaries from Codacy API. - min_level: Minimum severity level to include. - + """ + Format Codacy issue records into compact AI-friendly lines filtered by minimum severity. + + Each returned string has the form: + " | : | | | ". + + Parameters: + raw_issues: List of issue objects returned by the Codacy API (each item may be an issue or contain a `commitIssue` key). + min_level: Minimum severity level to include; must be one of the values in LEVELS. + Returns: - Formatted issue strings. - + A list of formatted issue strings matching the format above, including only issues whose severity is at or above `min_level`. + Raises: - ValueError: If min_level is invalid. + ValueError: If `min_level` is not a valid severity level. """ min_priority = get_level_priority(min_level) if min_priority is None: @@ -377,13 +475,17 @@ def apply_git_defaults(args: argparse.Namespace) -> None: def resolve_segments(args: argparse.Namespace) -> tuple[str, str, str | None]: - """Validate and return org/repo/pr segments. - - Args: - args: Parsed CLI arguments. - + """ + Validate CLI org, repo, and optional pr segments and return them. + + Parameters: + args (argparse.Namespace): Parsed CLI arguments with attributes `org`, `repo`, and optional `pr`. + Returns: - Tuple of (org, repo, pr). + tuple[str, str, str | None]: A tuple (org, repo, pr) where `pr` is None if not supplied. + + Raises: + ValueError: If any segment is empty or contains invalid characters. 
""" segment_pattern = re.compile(r"^[A-Za-z0-9_.-]+$") org = assert_valid_segment("org", args.org, segment_pattern) @@ -402,7 +504,21 @@ def build_payload( min_level: str, issues: list[str], ) -> dict[str, object]: - """Build the output payload for JSON serialization.""" + """ + Create a JSON-serializable payload describing the fetched issues and their scope. + + The returned dictionary contains: + - scope: "pull_request" when `pr` is set, otherwise "repository". + - organization: organization/owner name. + - repository: repository name. + - pullRequest: pull request identifier string when present, otherwise `None`. + - minLevel: the minimum severity level used to filter issues. + - total: the number of issues in `issues`. + - issues: list of formatted issue strings. + + Returns: + dict[str, object]: Payload ready for JSON serialization with the keys described above. + """ return { "scope": "pull_request" if pr else "repository", "organization": org, @@ -415,7 +531,14 @@ def build_payload( def main() -> int: - """Run the Codacy issues fetcher.""" + """ + Run the CLI: parse arguments, fetch Codacy issues (repository or pull request), format them for AI consumption, and write a JSON payload to stdout. + + Writes error messages to stderr when validation or fetching fails and prints the final JSON payload to stdout. + + Returns: + int: 0 on success, 1 on error. + """ args = parse_args(sys.argv[1:]) # --- Git auto-detect --- @@ -474,4 +597,4 @@ def main() -> int: raise SystemExit(main()) except Exception as e: print(str(e), file=sys.stderr) - raise SystemExit(1) from e + raise SystemExit(1) from e \ No newline at end of file diff --git a/src/exstruct/__init__.py b/src/exstruct/__init__.py index dee7580..6ad3744 100644 --- a/src/exstruct/__init__.py +++ b/src/exstruct/__init__.py @@ -90,28 +90,14 @@ def extract(file_path: str | Path, mode: ExtractionMode = "standard") -> WorkbookData: """ - Extract an Excel workbook into WorkbookData. 
- - Args: - file_path: Path to .xlsx/.xlsm/.xls. - mode: "light" / "standard" / "verbose" - - light: cells + table detection only (no COM, shapes/charts empty). Print areas via openpyxl. - - standard: texted shapes + arrows + charts (COM if available), print areas included. Shape/chart size is kept but hidden by default in output. - - verbose: all shapes (including textless) with size, charts with size, and colors_map. - + Extracts an Excel workbook into a WorkbookData structure. + + Parameters: + file_path (str | Path): Path to the workbook file (.xlsx, .xlsm, .xls). + mode (ExtractionMode): Extraction detail level. "light" includes cells and table detection only (no COM, shapes/charts empty; print areas via openpyxl). "standard" includes texted shapes, arrows, charts (COM if available) and print areas. "verbose" also includes shape/chart sizes, cell link map, colors map, and formulas map. + Returns: - WorkbookData containing sheets, rows, shapes, charts, and print areas. - - Raises: - ValueError: If an invalid mode is provided. - - Examples: - Extract with hyperlinks (verbose) and inspect table candidates: - - >>> from exstruct import extract - >>> wb = extract("input.xlsx", mode="verbose") - >>> wb.sheets["Sheet1"].table_candidates - ['A1:B5'] + WorkbookData: Parsed workbook representation containing sheets, rows, shapes, charts, and print areas. 
""" include_links = True if mode == "verbose" else False include_colors_map = True if mode == "verbose" else None @@ -397,4 +383,4 @@ def process_excel( print_areas_dir=print_areas_dir, auto_page_breaks_dir=auto_page_breaks_dir, stream=stream, - ) + ) \ No newline at end of file diff --git a/src/exstruct/core/backends/base.py b/src/exstruct/core/backends/base.py index 0cf283c..7ffbdde 100644 --- a/src/exstruct/core/backends/base.py +++ b/src/exstruct/core/backends/base.py @@ -42,4 +42,9 @@ def extract_merged_cells(self) -> MergedCellData: """Extract merged cell ranges from the workbook.""" def extract_formulas_map(self) -> WorkbookFormulasMap | None: - """Extract formulas map from the workbook.""" + """ + Retrieve the workbook's formulas organized by worksheet. + + Returns: + WorkbookFormulasMap | None: A mapping of worksheet identifiers to their formulas, or `None` if the backend cannot provide a formulas map. + """ \ No newline at end of file diff --git a/src/exstruct/core/backends/com_backend.py b/src/exstruct/core/backends/com_backend.py index 81ec58c..0c1348f 100644 --- a/src/exstruct/core/backends/com_backend.py +++ b/src/exstruct/core/backends/com_backend.py @@ -63,14 +63,15 @@ def extract_print_areas(self) -> PrintAreaData: def extract_colors_map( self, *, include_default_background: bool, ignore_colors: set[str] | None ) -> WorkbookColorsMap | None: - """Extract colors_map via COM; logs and skips on failure. - - Args: - include_default_background: Whether to include default backgrounds. - ignore_colors: Optional set of color keys to ignore. - + """ + Extract a workbook colors map using the Excel COM API. + + Parameters: + include_default_background (bool): Include the workbook's default background color in the resulting map. + ignore_colors (set[str] | None): Optional set of color keys to exclude from the map. + Returns: - WorkbookColorsMap or None when extraction fails. 
+ WorkbookColorsMap | None: A mapping of workbook color definitions when extraction succeeds, or `None` if COM extraction fails. """ try: return extract_sheet_colors_map_com( @@ -86,10 +87,11 @@ def extract_colors_map( return None def extract_formulas_map(self) -> WorkbookFormulasMap | None: - """Extract formulas_map via COM; logs and skips on failure. - + """ + Extracts the workbook's formulas map using COM. + Returns: - WorkbookFormulasMap or None when extraction fails. + WorkbookFormulasMap or None: The extracted formulas map, or `None` if extraction failed. """ try: return extract_sheet_formulas_map_com(self.workbook) @@ -101,10 +103,13 @@ def extract_formulas_map(self) -> WorkbookFormulasMap | None: return None def extract_auto_page_breaks(self) -> PrintAreaData: - """Compute auto page-break rectangles per sheet using Excel COM. - + """ + Compute auto page-break rectangles for each worksheet using Excel COM. + + For each sheet, determine the sheet's print area (PageSetup.PrintArea or the used range) and split it into sub-rectangles along Excel's horizontal and vertical page breaks; parts that reference a different sheet are ignored. If extraction for a sheet fails, the sheet is skipped and a warning is logged. + Returns: - Mapping of sheet name to auto page-break areas. + Mapping from sheet name to a list of PrintArea entries. Each PrintArea describes a rectangular region with `r1` and `r2` as 1-based row indices and `c1` and `c2` as 0-based column indices. 
""" results: PrintAreaData = {} for sheet in self.workbook.sheets: @@ -247,4 +252,4 @@ def _split_csv_respecting_quotes(raw: str) -> list[str]: i += 1 if buf: parts.append("".join(buf).strip()) - return [p for p in parts if p] + return [p for p in parts if p] \ No newline at end of file diff --git a/src/exstruct/core/backends/openpyxl_backend.py b/src/exstruct/core/backends/openpyxl_backend.py index d67ae59..acaba4a 100644 --- a/src/exstruct/core/backends/openpyxl_backend.py +++ b/src/exstruct/core/backends/openpyxl_backend.py @@ -102,10 +102,11 @@ def extract_merged_cells(self) -> MergedCellData: return {} def extract_formulas_map(self) -> WorkbookFormulasMap | None: - """Extract formulas_map using openpyxl. - + """ + Extract a mapping of workbook formulas for each sheet. + Returns: - WorkbookFormulasMap or None when extraction fails. + WorkbookFormulasMap | None: A mapping from sheet name to its formulas, or `None` if extraction fails. """ try: return extract_sheet_formulas_map(self.file_path) @@ -116,13 +117,14 @@ def extract_formulas_map(self) -> WorkbookFormulasMap | None: return None def detect_tables(self, sheet_name: str) -> list[str]: - """Detect table candidates for a single sheet. - - Args: - sheet_name: Target worksheet name. - + """ + Detects table candidate ranges within the specified worksheet. + + Parameters: + sheet_name (str): Name of the worksheet to analyze for table candidates. + Returns: - List of table candidate ranges. + list[str]: Detected table candidate ranges as A1-style range strings; empty list if none are found or detection fails. 
""" try: return detect_tables_openpyxl(self.file_path, sheet_name) @@ -204,4 +206,4 @@ def _parse_print_area_range(range_str: str) -> tuple[int, int, int, int] | None: bounds = parse_range_zero_based(range_str) if bounds is None: return None - return (bounds.r1, bounds.c1, bounds.r2, bounds.c2) + return (bounds.r1, bounds.c1, bounds.r2, bounds.c2) \ No newline at end of file diff --git a/src/exstruct/core/cells.py b/src/exstruct/core/cells.py index 1024888..20e9385 100644 --- a/src/exstruct/core/cells.py +++ b/src/exstruct/core/cells.py @@ -56,13 +56,14 @@ class WorkbookColorsMap: sheets: dict[str, SheetColorsMap] def get_sheet(self, sheet_name: str) -> SheetColorsMap | None: - """Return the colors map for a sheet if available. - - Args: - sheet_name: Target worksheet name. - + """ + Retrieve the SheetColorsMap for a worksheet by name. + + Parameters: + sheet_name (str): Name of the worksheet to retrieve. + Returns: - SheetColorsMap for the sheet, or None if missing. + SheetColorsMap | None: The sheet's color map if present, `None` otherwise. """ return self.sheets.get(sheet_name) @@ -82,13 +83,14 @@ class WorkbookFormulasMap: sheets: dict[str, SheetFormulasMap] def get_sheet(self, sheet_name: str) -> SheetFormulasMap | None: - """Return the formulas map for a sheet if available. - - Args: - sheet_name: Target worksheet name. - + """ + Retrieve the formulas map for a worksheet. + + Parameters: + sheet_name (str): Name of the worksheet to look up. + Returns: - SheetFormulasMap for the sheet, or None if missing. + SheetFormulasMap | None: The sheet's formulas map if present, `None` if the worksheet is not found. """ return self.sheets.get(sheet_name) @@ -129,13 +131,14 @@ def extract_sheet_colors_map( def extract_sheet_formulas_map(file_path: Path) -> WorkbookFormulasMap: - """Extract formula strings for each worksheet. - - Args: - file_path: Excel workbook path. - + """ + Extract normalized formula strings from every worksheet in the workbook. 
+ + Parameters: + file_path (Path): Path to the Excel workbook to read. + Returns: - WorkbookFormulasMap containing per-sheet formula maps. + WorkbookFormulasMap: Mapping of sheet names to SheetFormulasMap objects. Each SheetFormulasMap contains a mapping from normalized formula strings (each beginning with "=") to a list of cell coordinates (row, column) where that formula occurs. """ sheets: dict[str, SheetFormulasMap] = {} with openpyxl_workbook(file_path, data_only=False, read_only=False) as wb: @@ -146,13 +149,14 @@ def extract_sheet_formulas_map(file_path: Path) -> WorkbookFormulasMap: def extract_sheet_formulas_map_com(workbook: xw.Book) -> WorkbookFormulasMap: - """Extract formula strings for each worksheet via COM. - - Args: - workbook: xlwings workbook instance. - + """ + Collects and normalizes formulas from every worksheet in an xlwings workbook into per-sheet mappings. + + Parameters: + workbook: xlwings Book instance whose sheets will be scanned for formulas. + Returns: - WorkbookFormulasMap containing per-sheet formula maps. + WorkbookFormulasMap: maps sheet names to SheetFormulasMap objects. Each SheetFormulasMap.formulas_map maps a normalized formula string (consistent representation, e.g., beginning with "=") to a list of (row, column) tuples representing cell locations using Excel 1-based indices. """ sheets: dict[str, SheetFormulasMap] = {} for sheet in workbook.sheets: @@ -189,16 +193,16 @@ def extract_sheet_colors_map_com( include_default_background: bool, ignore_colors: set[str] | None, ) -> WorkbookColorsMap: - """Extract background colors for each worksheet via COM display formats. - - Args: - workbook: xlwings workbook instance. - include_default_background: Whether to include default (white) backgrounds - within the used range. - ignore_colors: Optional set of color keys to ignore. - + """ + Extract per-sheet background color maps using the workbook's COM/display-format interfaces. 
+ + Parameters: + workbook (xw.Book): xlwings workbook whose sheets will be inspected. + include_default_background (bool): If true, include default background colors (e.g., white) for cells inside each sheet's used range. + ignore_colors (set[str] | None): Optional set of normalized color keys to exclude from results. + Returns: - WorkbookColorsMap containing per-sheet color maps. + WorkbookColorsMap: Mapping of sheet names to SheetColorsMap containing detected background color positions for each worksheet. """ _prepare_workbook_for_display_format(workbook) sheets: dict[str, SheetColorsMap] = {} @@ -214,15 +218,16 @@ def extract_sheet_colors_map_com( def _extract_sheet_colors( ws: Worksheet, include_default_background: bool, ignore_colors: set[str] | None ) -> SheetColorsMap: - """Extract background colors for a single worksheet. - - Args: - ws: Target worksheet. - include_default_background: Whether to include default (white) backgrounds. - ignore_colors: Optional set of color keys to ignore. - + """ + Extract the background color locations present on a single worksheet. + + Parameters: + ws (Worksheet): Worksheet to scan. + include_default_background (bool): If true, treat cells with the workbook default/background color as having a color key. + ignore_colors (set[str] | None): Optional set of color keys to ignore (keys are normalized before comparison). + Returns: - SheetColorsMap for the worksheet. + SheetColorsMap: Mapping from normalized color key to a list of cell coordinates where that color appears. Coordinates are tuples (row, col) where `row` is 1-based and `col` is 0-based. """ min_row, min_col, max_row, max_col = _get_used_range_bounds(ws) colors_map: dict[str, list[tuple[int, int]]] = {} @@ -247,13 +252,14 @@ def _extract_sheet_colors( def _extract_sheet_formulas(ws: Worksheet) -> SheetFormulasMap: - """Extract formula strings for a single worksheet. - - Args: - ws: Target worksheet. 
- + """ + Collect normalized formula strings from a worksheet and group their cell coordinates. + + Parameters: + ws (Worksheet): Worksheet to scan for formulas. + Returns: - SheetFormulasMap for the worksheet. + SheetFormulasMap: container with the sheet's name and a mapping from each normalized formula string (prefixed with "=") to a list of cell coordinates as (row, zero-based-column). """ min_row, min_col, max_row, max_col = _get_used_range_bounds(ws) formulas_map: dict[str, list[tuple[int, int]]] = {} @@ -297,13 +303,14 @@ def _normalize_formula_value(value: object) -> str | None: def _normalize_formula_from_com(value: object) -> str | None: - """Normalize a formula string returned by COM. - - Args: - value: Raw COM formula value. - + """ + Normalize a COM-returned cell formula into a string that begins with '='. + + Parameters: + value (object): Raw value returned from COM for a cell's formula. + Returns: - Formula string with leading "=", or None when not a formula. + str | None: The input string if it is non-empty and starts with '=', `None` otherwise. """ if value is None or not isinstance(value, str): return None @@ -318,15 +325,16 @@ def _normalize_formula_from_com(value: object) -> str | None: def _extract_sheet_colors_com( sheet: xw.Sheet, include_default_background: bool, ignore_colors: set[str] | None ) -> SheetColorsMap: - """Extract background colors for a single worksheet via COM. - - Args: - sheet: Target worksheet. - include_default_background: Whether to include default (white) backgrounds. - ignore_colors: Optional set of color keys to ignore. - + """ + Extract per-sheet background color mapping using COM/DisplayFormat. + + Parameters: + sheet (xw.Sheet): xlwings sheet object to inspect. + include_default_background (bool): If True, include cells whose background is the workbook default color. + ignore_colors (set[str] | None): Optional set of normalized color keys to exclude from the result. + Returns: - SheetColorsMap for the worksheet. 
+ SheetColorsMap: Mapping from normalized color key (hex/theme/index canonical form) to a list of cell coordinates where that color appears. Each coordinate is a tuple (row, col) where `row` is the worksheet row number (1-based) and `col` is the zero-based column index. """ colors_map: dict[str, list[tuple[int, int]]] = {} used = sheet.used_range @@ -1715,4 +1723,4 @@ def _coerce_numeric_preserve_format(val: str) -> int | float | str: return float(quantized) except (InvalidOperation, Exception): return val - return val + return val \ No newline at end of file diff --git a/src/exstruct/core/integrate.py b/src/exstruct/core/integrate.py index ccd8131..402dddf 100644 --- a/src/exstruct/core/integrate.py +++ b/src/exstruct/core/integrate.py @@ -21,28 +21,29 @@ def extract_workbook( # noqa: C901 include_merged_cells: bool | None = None, include_merged_values_in_rows: bool = True, ) -> WorkbookData: - """Extract workbook and return WorkbookData. - - Falls back to cells+tables if Excel COM is unavailable. - - Args: - file_path: Workbook path. - mode: Extraction mode. - include_cell_links: Whether to include cell hyperlinks; None uses mode defaults. - include_print_areas: Whether to include print areas; None defaults to True. - include_auto_page_breaks: Whether to include auto page breaks. - include_colors_map: Whether to include colors map; None uses mode defaults. - include_default_background: Whether to include default background color. - ignore_colors: Optional set of color keys to ignore. - include_formulas_map: Whether to include formulas map; None uses mode defaults. - include_merged_cells: Whether to include merged cell ranges; None uses mode defaults. - include_merged_values_in_rows: Whether to keep merged values in rows. - + """ + Extract a workbook into a structured WorkbookData representation. + + May fall back to cells+tables extraction if Excel COM automation is unavailable. + + Parameters: + file_path (str | Path): Path to the workbook file. 
+ mode (Literal['light', 'standard', 'verbose']): Extraction mode that controls detail level. + include_cell_links (bool | None): Include cell hyperlinks; `None` uses mode defaults. + include_print_areas (bool | None): Include print areas; `None` defaults to True. + include_auto_page_breaks (bool): Include automatic page break information. + include_colors_map (bool | None): Include a colors map; `None` uses mode defaults. + include_default_background (bool): Include default background color when present. + ignore_colors (set[str] | None): Set of color keys to ignore during color mapping. + include_formulas_map (bool | None): Include a map of cell formulas; `None` uses mode defaults. + include_merged_cells (bool | None): Include merged cell ranges; `None` uses mode defaults. + include_merged_values_in_rows (bool): Preserve merged cell values in row-wise output. + Returns: - Extracted WorkbookData. - + WorkbookData: The extracted workbook representation. + Raises: - ValueError: If mode is unsupported. + ValueError: If `mode` is not one of "light", "standard", or "verbose". """ inputs = resolve_extraction_inputs( file_path, @@ -58,4 +59,4 @@ def extract_workbook( # noqa: C901 include_merged_values_in_rows=include_merged_values_in_rows, ) result = run_extraction_pipeline(inputs) - return result.workbook + return result.workbook \ No newline at end of file diff --git a/src/exstruct/core/pipeline.py b/src/exstruct/core/pipeline.py index ff46dbe..4fc316c 100644 --- a/src/exstruct/core/pipeline.py +++ b/src/exstruct/core/pipeline.py @@ -272,13 +272,14 @@ def resolve_extraction_inputs( def build_pipeline_plan(inputs: ExtractionInputs) -> PipelinePlan: - """Build a pipeline plan based on resolved inputs. - - Args: - inputs: Resolved pipeline inputs. - + """ + Builds a pipeline plan describing which pre-COM and COM extraction steps to run for the given resolved inputs. 
+ + Parameters: + inputs (ExtractionInputs): Resolved extraction configuration (including mode and COM/formulas flags). + Returns: - PipelinePlan containing pre-COM/COM steps and COM usage flag. + PipelinePlan: Plan containing ordered `pre_com_steps`, ordered `com_steps`, and `use_com` set to true when the pipeline should use COM (when `mode` is not "light" or `use_com_for_formulas` is true). """ return PipelinePlan( pre_com_steps=build_pre_com_pipeline(inputs), @@ -500,11 +501,12 @@ def step_extract_cells( def step_extract_print_areas_openpyxl( inputs: ExtractionInputs, artifacts: ExtractionArtifacts ) -> None: - """Extract print areas via openpyxl. - - Args: - inputs: Pipeline inputs. - artifacts: Artifact container to update. + """ + Extract print areas from the workbook and populate artifacts.print_area_data. + + Parameters: + inputs (ExtractionInputs): Pipeline inputs containing the file path and extraction options. + artifacts (ExtractionArtifacts): Mutable artifact container; `artifacts.print_area_data` will be set to the extracted print area mapping. """ backend = OpenpyxlBackend(inputs.file_path) artifacts.print_area_data = backend.extract_print_areas() @@ -513,11 +515,14 @@ def step_extract_print_areas_openpyxl( def step_extract_formulas_map_openpyxl( inputs: ExtractionInputs, artifacts: ExtractionArtifacts ) -> None: - """Extract formulas_map via openpyxl; logs and skips on failure. - - Args: - inputs: Pipeline inputs. - artifacts: Artifact container to update. + """ + Populate artifacts.formulas_map_data by extracting workbook formulas using openpyxl. + + Attempts to extract a WorkbookFormulasMap from the file at inputs.file_path and stores it on artifacts.formulas_map_data. If extraction fails, a warning is logged and artifacts.formulas_map_data is left unchanged. + + Parameters: + inputs (ExtractionInputs): Resolved pipeline inputs (provides file_path). + artifacts (ExtractionArtifacts): Mutable container to receive the extracted formulas map. 
""" backend = OpenpyxlBackend(inputs.file_path) try: @@ -532,11 +537,11 @@ def step_extract_formulas_map_openpyxl( def step_extract_colors_map_openpyxl( inputs: ExtractionInputs, artifacts: ExtractionArtifacts ) -> None: - """Extract colors_map via openpyxl; logs and skips on failure. - - Args: - inputs: Pipeline inputs. - artifacts: Artifact container to update. + """ + Extract the workbook colors map using openpyxl and store it on the artifacts. + + Sets artifacts.colors_map_data to the colors map extracted from inputs.file_path, + respecting inputs.include_default_background and inputs.ignore_colors. """ backend = OpenpyxlBackend(inputs.file_path) artifacts.colors_map_data = backend.extract_colors_map( @@ -605,12 +610,13 @@ def step_extract_print_areas_com( def step_extract_auto_page_breaks_com( inputs: ExtractionInputs, artifacts: ExtractionArtifacts, workbook: xw.Book ) -> None: - """Extract auto page breaks via COM. - - Args: - inputs: Pipeline inputs. - artifacts: Artifact container to update. - workbook: xlwings workbook instance. + """ + Extract auto page break information from a COM workbook and store it in the artifacts. + + Parameters: + inputs (ExtractionInputs): Pipeline inputs that may influence extraction behavior. + artifacts (ExtractionArtifacts): Mutable artifact container; updated with extracted data. + workbook (xw.Book): xlwings COM workbook used to read auto page break settings. """ artifacts.auto_page_break_data = ComBackend(workbook).extract_auto_page_breaks() @@ -618,12 +624,14 @@ def step_extract_auto_page_breaks_com( def step_extract_formulas_map_com( inputs: ExtractionInputs, artifacts: ExtractionArtifacts, workbook: xw.Book ) -> None: - """Extract formulas_map via COM; logs and skips on failure. - - Args: - inputs: Pipeline inputs. - artifacts: Artifact container to update. - workbook: xlwings workbook instance. + """ + Extract the workbook's formulas map via COM and store it into the artifacts. 
+ On success assigns the extracted WorkbookFormulasMap to artifacts.formulas_map_data. + On failure leaves artifacts.formulas_map_data unchanged and logs a warning. + + Parameters: + workbook (xlwings.Book): COM workbook to extract formulas from. """ try: artifacts.formulas_map_data = ComBackend(workbook).extract_formulas_map() @@ -663,14 +671,15 @@ def step_extract_colors_map_com( def _resolve_sheet_colors_map( colors_map_data: WorkbookColorsMap | None, sheet_name: str ) -> dict[str, list[tuple[int, int]]]: - """Resolve colors_map for a single sheet. - - Args: - colors_map_data: Optional workbook colors map container. - sheet_name: Target sheet name. - + """ + Resolve the colors map for a given sheet. + + Parameters: + colors_map_data (WorkbookColorsMap | None): Optional workbook-level colors map container. + sheet_name (str): Name of the sheet to resolve. + Returns: - colors_map dictionary for the sheet, or empty dict if unavailable. + dict[str, list[tuple[int, int]]]: Mapping of color keys to lists of (row, col) cell coordinates for the sheet; empty dict if no colors map is available for the workbook or sheet. """ if not colors_map_data: return {} @@ -683,14 +692,15 @@ def _resolve_sheet_colors_map( def _resolve_sheet_formulas_map( formulas_map_data: WorkbookFormulasMap | None, sheet_name: str ) -> dict[str, list[tuple[int, int]]]: - """Resolve formulas_map for a single sheet. - - Args: - formulas_map_data: Optional workbook formulas map container. - sheet_name: Target sheet name. - + """ + Get the formulas map for a named sheet from a workbook formulas container. + + Parameters: + formulas_map_data: Optional workbook formulas map container; may be None. + sheet_name: Name of the sheet to resolve formulas for. + Returns: - formulas_map dictionary for the sheet, or empty dict if unavailable. + A mapping for the sheet (str -> list of (row, column) tuples) representing formula locations, or an empty dict if no data is available. 
""" if not formulas_map_data: return {} @@ -704,14 +714,18 @@ def _filter_rows_excluding_merged_values( rows: list[CellRow], merged_cells: list[MergedCellRange], ) -> list[CellRow]: - """Remove merged-cell values from rows. - - Args: - rows: Extracted rows. - merged_cells: Merged cell ranges. - + """ + Filter out cell values that originate from merged-cell ranges. + + Parameters: + rows (list[CellRow]): Extracted rows to filter. + merged_cells (list[MergedCellRange]): Merged cell ranges to exclude values from. + Returns: - Filtered rows with merged-cell values removed. + list[CellRow]: Rows where any cell whose column index falls inside a merged range has been removed. + - Rows with no remaining cells are omitted. + - Cell entries with non-integer column keys are preserved. + - `links` are retained only for cells that remain; if a row has no links after filtering, `links` is set to None. """ if not rows or not merged_cells: return rows @@ -816,23 +830,26 @@ def collect_sheet_raw_data( formulas_map_data: WorkbookFormulasMap | None = None, colors_map_data: WorkbookColorsMap | None = None, ) -> dict[str, SheetRawData]: - """Collect per-sheet raw data from extraction artifacts. - - Args: - cell_data: Extracted cell rows per sheet. - shape_data: Extracted shapes per sheet. - chart_data: Extracted charts per sheet. - merged_cell_data: Extracted merged cells per sheet. - workbook: xlwings workbook instance. - mode: Extraction mode. - print_area_data: Optional print area data per sheet. - auto_page_break_data: Optional auto page-break data per sheet. - formulas_map_data: Optional formulas map data. - colors_map_data: Optional colors map data. - include_merged_values_in_rows: Whether to keep merged values in rows. - + """ + Collect per-sheet raw extraction data and assemble SheetRawData for each sheet. 
+ + For each sheet in cell_data this returns a SheetRawData containing rows (optionally excluding values contributed by merged cells), shapes, charts (omitted in "light" mode), detected table candidates, print/auto-print areas, per-sheet formulas map, per-sheet colors map, and merged cell ranges. + + Parameters: + cell_data (CellData): Extracted cell rows keyed by sheet name. + shape_data (ShapeData): Extracted shapes keyed by sheet name. + chart_data (ChartData): Extracted charts keyed by sheet name. + merged_cell_data (MergedCellData): Merged cell ranges keyed by sheet name. + workbook (xw.Book): xlwings workbook used to resolve sheets and detect tables. + mode (ExtractionMode): Extraction mode; when "light", charts are omitted. + include_merged_values_in_rows (bool): If False, remove values that originate from merged cells when building row data. + print_area_data (PrintAreaData | None): Optional print areas keyed by sheet name. + auto_page_break_data (PrintAreaData | None): Optional auto page-break areas keyed by sheet name. + formulas_map_data (WorkbookFormulasMap | None): Optional per-sheet formulas map to include in SheetRawData. + colors_map_data (WorkbookColorsMap | None): Optional per-sheet colors map to include in SheetRawData. + Returns: - Mapping of sheet name to raw sheet data. + dict[str, SheetRawData]: Mapping from sheet name to the assembled SheetRawData. """ result: dict[str, SheetRawData] = {} for sheet_name, rows in cell_data.items(): @@ -861,13 +878,14 @@ def collect_sheet_raw_data( def run_extraction_pipeline(inputs: ExtractionInputs) -> PipelineResult: - """Run the full extraction pipeline and return the result. - - Args: - inputs: Resolved pipeline inputs. - + """ + Execute the configured extraction pipeline and produce the extraction result. + + Parameters: + inputs (ExtractionInputs): Resolved pipeline inputs controlling which extraction steps run. + Returns: - PipelineResult with workbook data, artifacts, and execution state. 
+ PipelineResult: Contains the constructed workbook data, collected artifacts, and pipeline execution state (including COM attempt/success and any fallback reason). """ plan = build_pipeline_plan(inputs) artifacts = run_pipeline(plan.pre_com_steps, inputs, ExtractionArtifacts()) @@ -941,15 +959,16 @@ def build_cells_tables_workbook( artifacts: ExtractionArtifacts, reason: str, ) -> WorkbookData: - """Build a WorkbookData containing cells + table_candidates (fallback). - - Args: - inputs: Pipeline inputs. - artifacts: Collected artifacts from extraction steps. - reason: Reason to log for fallback. - + """ + Builds a WorkbookData from available cell rows and detected table candidates to use as a fallback when COM-based extraction is not used or has failed. + + Parameters: + inputs (ExtractionInputs): Resolved extraction inputs that control which extra maps and merged-value handling to include. + artifacts (ExtractionArtifacts): Collected artifacts produced by pre-COM extraction steps; cell rows and any existing maps are consumed from here. + reason (str): Short description of why the fallback is being used (logged for debugging). + Returns: - WorkbookData constructed from cells and detected tables. + WorkbookData: A workbook composed from the available per-sheet cell rows, detected table candidates, merged-cell information, and any resolved formulas and colors maps. Shapes and charts are empty in this fallback path; formulas and colors maps are extracted from artifacts or from the Openpyxl backend when requested and not already present. 
""" logger.debug("Building fallback workbook: %s", reason) backend = OpenpyxlBackend(inputs.file_path) @@ -995,4 +1014,4 @@ def build_cells_tables_workbook( merged_cells=merged_cells, ) raw = WorkbookRawData(book_name=inputs.file_path.name, sheets=sheets) - return build_workbook_data(raw) + return build_workbook_data(raw) \ No newline at end of file diff --git a/src/exstruct/core/workbook.py b/src/exstruct/core/workbook.py index 3f33822..199eca6 100644 --- a/src/exstruct/core/workbook.py +++ b/src/exstruct/core/workbook.py @@ -19,15 +19,16 @@ def openpyxl_workbook( file_path: Path, *, data_only: bool, read_only: bool ) -> Iterator[Any]: - """Open an openpyxl workbook and ensure it is closed. - - Args: - file_path: Workbook path. - data_only: Whether to read formula results. - read_only: Whether to open in read-only mode. - + """ + Open an openpyxl Workbook for temporary use and ensure it is closed on exit. + + Parameters: + file_path (Path): Path to the workbook file. + data_only (bool): If True, read stored cell values instead of formulas. + read_only (bool): If True, open the workbook in optimized read-only mode. + Yields: - openpyxl workbook instance. + openpyxl.workbook.workbook.Workbook: The opened workbook instance. """ with warnings.catch_warnings(): warnings.filterwarnings( @@ -113,4 +114,4 @@ def _find_open_workbook(file_path: Path) -> xw.Book | None: except Exception as exc: logger.debug("Failed to inspect open Excel workbooks. (%r)", exc) return None - return None + return None \ No newline at end of file diff --git a/src/exstruct/engine.py b/src/exstruct/engine.py index 1d5b3b9..ef16a89 100644 --- a/src/exstruct/engine.py +++ b/src/exstruct/engine.py @@ -261,6 +261,24 @@ def _include_auto_print_areas(self) -> bool: def _filter_sheet( self, sheet: SheetData, include_auto_override: bool | None = None ) -> SheetData: + """ + Return a filtered copy of a SheetData according to the engine's output filters and resolved size/print-area flags. 
+ + Parameters: + sheet: The original SheetData to filter. + include_auto_override: If not None, overrides the engine's automatic decision for including auto page-break areas; if None, the engine's auto rule is used. + + Returns: + A new SheetData where: + - rows are kept only if include_rows is enabled; otherwise an empty list. + - shapes are kept only if include_shapes is enabled; when kept and shape-size inclusion is disabled, each shape's width and height are cleared. + - charts are kept only if include_charts is enabled; when kept and chart-size inclusion is disabled, each chart's width and height are cleared. + - table_candidates are kept only if include_tables is enabled; otherwise an empty list. + - colors_map and formulas_map are preserved as-is. + - print_areas are kept only if print areas are included by the engine; otherwise an empty list. + - auto_print_areas are kept only if auto page-break areas are included (after applying include_auto_override); otherwise an empty list. + - merged_cells are kept only if include_merged_cells is enabled; otherwise set to None. + """ include_shape_size, include_chart_size = self._resolve_size_flags() include_print_areas = self._include_print_areas() include_auto_print_areas = ( @@ -335,15 +353,15 @@ def extract( self, file_path: str | Path, *, mode: ExtractionMode | None = None ) -> WorkbookData: """ - Extract a workbook and return normalized workbook data. - - Args: - file_path: Path to the .xlsx/.xlsm/.xls file to extract. - mode: Extraction mode; defaults to the engine's StructOptions.mode. - - light: COM-free; cells, table candidates, and print areas only. - - standard: Shapes with text/arrows plus charts; print areas included; - size fields retained but hidden from default output. - - verbose: All shapes (with size) and charts (with size). + Produce a normalized WorkbookData extracted from the given workbook file. + + Parameters: + file_path (str | Path): Path to the .xlsx/.xlsm/.xls file to extract. 
+ mode (ExtractionMode | None): Extraction mode to use; if None the engine's configured mode is used. + Modes: "light", "standard", "verbose". + + Returns: + WorkbookData: Normalized workbook data extracted from the file. """ chosen_mode = mode or self.options.mode include_auto_page_breaks = ( @@ -575,4 +593,4 @@ def process( export_pdf(normalized_file_path, pdf_path) if image: images_dir = pdf_path.parent / f"{pdf_path.stem}_images" - export_sheet_images(normalized_file_path, images_dir, dpi=dpi) + export_sheet_images(normalized_file_path, images_dir, dpi=dpi) \ No newline at end of file diff --git a/src/exstruct/render/__init__.py b/src/exstruct/render/__init__.py index ad6cc8a..e30dbab 100644 --- a/src/exstruct/render/__init__.py +++ b/src/exstruct/render/__init__.py @@ -79,7 +79,15 @@ def _require_pdfium() -> ModuleType: def export_sheet_images( excel_path: str | Path, output_dir: str | Path, dpi: int = 144 ) -> list[Path]: - """Export each sheet as PNG (via PDF then pypdfium2 rasterization) and return paths in sheet order.""" + """ + Export each worksheet in the given Excel workbook to PNG files and return the image paths in workbook order. + + Returns: + paths (list[Path]): Paths to the generated PNG files, ordered by the corresponding worksheets. + + Raises: + RenderError: If export or rendering fails. + """ normalized_excel_path = Path(excel_path) normalized_output_dir = Path(output_dir) normalized_output_dir.mkdir(parents=True, exist_ok=True) @@ -106,6 +114,17 @@ def export_sheet_images( def _sanitize_sheet_filename(name: str) -> str: + """ + Create a filesystem-safe filename derived from an Excel sheet name. + + Replaces characters that are not allowed in filenames (\/:*?"<>|) with underscores, trims surrounding whitespace, and returns "sheet" if the result is empty. + + Parameters: + name (str): Original sheet name. + + Returns: + safe_name (str): Filename-safe string derived from `name`. 
+ """ return "".join("_" if c in '\\/:*?"<>|' else c for c in name).strip() or "sheet" @@ -122,11 +141,27 @@ class _SheetApiProtocol(Protocol): def ExportAsFixedFormat( # noqa: N802 self, file_format: int, output_path: str, *args: object, **kwargs: object - ) -> None: ... + ) -> None: """ + Export the sheet or workbook to a fixed-format file (for example, PDF or XPS). + + Parameters: + file_format (int): Excel XlFixedFormatType enum value indicating the output format (e.g., the constant for PDF). + output_path (str): Filesystem path where the fixed-format file will be written. + *args (object): Additional positional arguments forwarded to the underlying Excel COM ExportAsFixedFormat call. + **kwargs (object): Additional keyword arguments forwarded to the underlying Excel COM ExportAsFixedFormat call. + """ + ... def _iter_sheet_apis(wb: xw.Book) -> list[tuple[int, str, _SheetApiProtocol]]: - """Return sheet index, name, and COM api handle in order.""" + """ + Enumerate workbook sheets and return each sheet's zero-based index, display name, and COM API handle in workbook order. + + If direct COM access to Worksheets is unavailable, falls back to iterating wb.sheets to build the same list. + + Returns: + List[tuple[int, str, _SheetApiProtocol]]: Tuples of (zero-based sheet index, sheet name, sheet COM API handle) in workbook order. + """ try: ws_collection = getattr(getattr(wb, "api", None), "Worksheets", None) if ws_collection is None: @@ -152,9 +187,10 @@ def _iter_sheet_apis(wb: xw.Book) -> list[tuple[int, str, _SheetApiProtocol]]: def _build_sheet_export_plan( wb: xw.Book, ) -> list[tuple[str, _SheetApiProtocol, str | None]]: - """Return export plan rows for sheets and their print areas. - - Each item is (sheet_name, sheet_api, print_area). + """ + Build an ordered export plan mapping each worksheet to its print areas. + + Each returned tuple is (sheet_name, sheet_api, print_area). 
The list preserves workbook sheet order; for sheets with no defined print areas `print_area` is `None`, and for sheets with multiple print areas there is one tuple per area. """ plan: list[tuple[str, _SheetApiProtocol, str | None]] = [] for _, sheet_name, sheet_api in _iter_sheet_apis(wb): @@ -168,7 +204,17 @@ def _build_sheet_export_plan( def _extract_print_areas(sheet_api: _SheetApiProtocol) -> list[str]: - """Return print areas for a sheet API, split into individual ranges.""" + """ + Extract the sheet's print-area ranges as a list of strings. + + Retrieves the PageSetup.PrintArea value from the provided sheet API, splits it by commas while respecting single-quoted sections, and returns each range as a separate string. If the sheet has no print area or the property is inaccessible, an empty list is returned. + + Parameters: + sheet_api (_SheetApiProtocol): Excel sheet API object exposing a `PageSetup.PrintArea` attribute. + + Returns: + list[str]: List of print-area range strings in the order they appear, or an empty list if none are defined or on access failure. + """ try: page_setup = getattr(sheet_api, "PageSetup", None) if page_setup is None: @@ -182,7 +228,18 @@ def _extract_print_areas(sheet_api: _SheetApiProtocol) -> list[str]: def _split_csv_respecting_quotes(raw: str) -> list[str]: - """Split a CSV-like string while keeping commas inside single quotes intact.""" + """ + Split a comma-separated string into parts while treating single-quoted sections as atomic. + + This function splits raw on commas that are not inside single quotes. Text enclosed in single quotes is preserved (including internal commas). Two consecutive single quotes inside a quoted section are treated as an escaped single-quote pair. Leading and trailing whitespace is trimmed from each part and empty parts are removed. + + Parameters: + raw (str): The input CSV-like string that may contain single-quoted segments. 
+ + Returns: + list[str]: A list of non-empty tokens obtained from splitting `raw` by unquoted commas, + with surrounding whitespace removed and quoted segments preserved. + """ parts: list[str] = [] buf: list[str] = [] in_quote = False @@ -216,7 +273,18 @@ def _rename_pages_for_print_area( base_index: int, safe_name: str, ) -> list[Path]: - """Rename multi-page outputs to unique prefixes for print areas.""" + """ + Rename the given image files so each gets a unique numeric prefix based on a base index and a safe sheet name. + + Parameters: + paths (list[Path]): Existing image files for a single sheet or print area (may include per-page suffixes). + output_dir (Path): Directory where renamed files will reside. + base_index (int): Zero-based starting index used to compute the numeric prefix for each output file. + safe_name (str): Filesystem-safe base name to use after the numeric prefix. + + Returns: + list[Path]: Paths to the renamed files in the same order as input, each named "{index:02d}_{safe_name}.png". + """ renamed: list[Path] = [] for path in paths: page_index = _page_index_from_suffix(path.stem) @@ -229,7 +297,17 @@ def _rename_pages_for_print_area( def _page_index_from_suffix(stem: str) -> int: - """Extract zero-based page index from a _pNN suffix when present.""" + """ + Extracts a zero-based page index from a filename stem ending with a "_pNN" numeric suffix. + + If the stem ends with "_p" followed by digits, returns that number minus one. If the suffix is missing, non-numeric, or less than 1, returns 0. + + Parameters: + stem (str): Filename stem to parse. + + Returns: + int: Zero-based page index derived from the "_pNN" suffix, or 0 when no valid suffix is present. + """ if "_p" not in stem: return 0 base, suffix = stem.rsplit("_p", 1) @@ -249,13 +327,16 @@ def _export_sheet_pdf( ignore_print_areas: bool, print_area: str | None = None, ) -> None: - """Export a sheet to PDF via Excel COM. 
- + """ + Export the given worksheet to a PDF file, optionally applying a temporary print area. + + If `print_area` is provided, it is applied to the sheet's PageSetup.PrintArea before exporting and restored afterwards. The function attempts to call ExportAsFixedFormat with an IgnorePrintAreas keyword; if that call fails due to an unexpected COM signature, it retries with a minimal argument set. + Args: - sheet_api: Target worksheet COM api. - pdf_path: Output PDF path. - ignore_print_areas: Whether to ignore print areas. - print_area: Optional print area string to apply for this export. + sheet_api: COM-like worksheet API exposing `PageSetup` and `ExportAsFixedFormat`. + pdf_path (Path): Filesystem path to write the PDF to. + ignore_print_areas (bool): If True, request that Excel ignore sheet print areas during export. + print_area (str | None): Optional print area string to apply for this export; if None, the sheet's current print area is left unchanged. """ original_print_area: object | None = None page_setup = None @@ -282,7 +363,18 @@ def _export_sheet_pdf( def _ensure_pdfium(use_subprocess: bool) -> ModuleType | None: - """Return pdfium module when needed, or None for subprocess rendering.""" + """ + Ensure the pypdfium2 dependency is available and return the pdfium module for in-process rendering. + + Parameters: + use_subprocess (bool): When True, confirm pypdfium2 is installed for subprocess rendering but do not keep the module in-process; when False, import and return the pdfium module for direct use. + + Returns: + ModuleType | None: The imported `pdfium` module when `use_subprocess` is False, or `None` when `use_subprocess` is True. + + Raises: + MissingDependencyError: If pypdfium2 (and required extras) is not installed. 
+ """ if use_subprocess: _require_pdfium() return None @@ -297,7 +389,20 @@ def _export_sheet_images_with_app( use_subprocess: bool, pdfium: ModuleType | None, ) -> list[Path]: - """Export sheet images using Excel COM and PDF rendering.""" + """ + Export each worksheet of an Excel workbook to PNG images by exporting sheets to per-sheet PDFs and rendering those PDFs. + + Parameters: + excel_path (Path): Path to the source Excel workbook. + output_dir (Path): Directory where generated PNGs will be written. + temp_dir (Path): Temporary directory for per-sheet intermediate PDF files. + dpi (int): Dots per inch used when rasterizing PDF pages. + use_subprocess (bool): If True, render PDF pages in a subprocess; otherwise render in-process. + pdfium (ModuleType | None): In-process pypdfium2 module when rendering in-process, or None when subprocess rendering is used. + + Returns: + list[Path]: Paths to generated PNG images in the order corresponding to the workbook's sheets and print-area splits. + """ written: list[Path] = [] app: xw.App | None = None wb: xw.Book | None = None @@ -364,7 +469,15 @@ def _render_sheet_images( dpi: int, use_subprocess: bool, ) -> list[Path]: - """Render sheet PDF to PNGs using the configured renderer.""" + """ + Render a sheet PDF to one or more PNG files using either a subprocess or in-process renderer. + + Returns: + paths (list[Path]): Paths to the generated PNG files in output order. + + Raises: + RenderError: If in-process rendering is requested but the `pypdfium2` module (`pdfium`) is not provided. + """ if use_subprocess: return _render_pdf_pages_subprocess( sheet_pdf, @@ -391,14 +504,34 @@ def _normalize_multipage_paths( base_index: int, safe_name: str, ) -> list[Path]: - """Normalize multi-page outputs to unique prefixes when needed.""" + """ + Assign distinct, ordered filenames for multi-page sheet outputs. + + If `paths` contains a single file, the list is returned unchanged. 
 If `paths` contains multiple files, each file is given a unique, numbered filename in `output_dir` using `base_index` and `safe_name` so pages are ordered and do not collide. + + Parameters: + paths (list[Path]): Existing file paths for a sheet's rendered pages. + output_dir (Path): Directory containing or intended to contain the output files. + base_index (int): Zero-based starting index used to compute numeric prefixes for filenames. + safe_name (str): Filesystem-safe base name included in the generated filenames. + + Returns: + list[Path]: Paths to the resulting files in `output_dir`. When multiple input paths are provided, returned paths reflect the new, uniquely prefixed filenames. + """ if len(paths) <= 1: return paths return _rename_pages_for_print_area(paths, output_dir, base_index, safe_name) def _use_render_subprocess() -> bool: - """Return True when PDF->PNG rendering should run in a subprocess.""" + """ + Decide whether PDF-to-PNG rendering should be performed in a subprocess. + + Reads the environment variable EXSTRUCT_RENDER_SUBPROCESS and compares its value case-insensitively. Subprocess rendering is disabled when the value is "0" or "false"; if the variable is unset or set to any other value, subprocess rendering is enabled. + + Returns: + bool: True if subprocess rendering is enabled, False otherwise. 
+ """ return os.getenv("EXSTRUCT_RENDER_SUBPROCESS", "1").lower() not in {"0", "false"} @@ -492,4 +625,4 @@ def _render_pdf_pages_worker( queue.put({"error": str(exc)}) -__all__ = ["export_pdf", "export_sheet_images"] +__all__ = ["export_pdf", "export_sheet_images"] \ No newline at end of file diff --git a/tests/backends/test_auto_page_breaks.py b/tests/backends/test_auto_page_breaks.py index dd472d5..8c0e9f2 100644 --- a/tests/backends/test_auto_page_breaks.py +++ b/tests/backends/test_auto_page_breaks.py @@ -16,6 +16,13 @@ def test_extract_passes_auto_page_break_flag( monkeypatch: MonkeyPatch, tmp_path: Path ) -> None: + """ + Verify that extract_workbook is invoked with include_auto_page_breaks set to True. + + Creates a fake extractor that captures the include_auto_page_breaks argument, replaces + exstruct.engine.extract_workbook with it, runs ExStructEngine.extract against a dummy + workbook path configured to export auto page breaks, and asserts the captured flag is True. + """ called: dict[str, object] = {} def fake_extract( @@ -31,6 +38,20 @@ def fake_extract( include_merged_cells: bool | None = None, include_merged_values_in_rows: bool = True, ) -> WorkbookData: + """ + Test stub for workbook extraction that records the auto page breaks flag. + + This fake extractor captures the value of `include_auto_page_breaks` in the outer + `called` mapping and returns a minimal `WorkbookData` with `book_name` set to + the provided path's filename and an empty `sheets` mapping. + + Parameters: + path (Path): Filesystem path used to derive the returned `WorkbookData.book_name`. + include_auto_page_breaks (bool): Flag whose value is written to `called["include_auto_page_breaks"]`. + + Returns: + WorkbookData: A minimal workbook data object with `book_name` set to `path.name` and no sheets. 
+ """ called["include_auto_page_breaks"] = include_auto_page_breaks return WorkbookData(book_name=path.name, sheets={}) @@ -82,4 +103,4 @@ class _DummyWorkbook: sheets = [_FailingSheet()] backend = ComBackend(_DummyWorkbook()) - assert backend.extract_auto_page_breaks() == {} + assert backend.extract_auto_page_breaks() == {} \ No newline at end of file diff --git a/tests/backends/test_backends.py b/tests/backends/test_backends.py index cbebc72..0046896 100644 --- a/tests/backends/test_backends.py +++ b/tests/backends/test_backends.py @@ -79,6 +79,12 @@ def test_openpyxl_backend_extract_formulas_map_returns_none_on_failure( monkeypatch: MonkeyPatch, tmp_path: Path ) -> None: def fake_formulas_map(file_path: Path) -> object: + """ + Test helper that always raises a RuntimeError to simulate a failure when extracting a formulas map. + + Raises: + RuntimeError: with message "boom". + """ raise RuntimeError("boom") monkeypatch.setattr( @@ -120,6 +126,15 @@ def test_com_backend_extract_formulas_map_returns_none_on_failure( monkeypatch: MonkeyPatch, ) -> None: def fake_formulas_map(workbook: object) -> object: + """ + Test stub that simulates a failure by always raising a RuntimeError. + + Parameters: + workbook (object): Workbook-like object (ignored); present to match the real function's signature. + + Raises: + RuntimeError: Always raised with message "boom". + """ raise RuntimeError("boom") monkeypatch.setattr( @@ -157,6 +172,11 @@ class _DummyWorkbook: def test_openpyxl_backend_extract_print_areas(tmp_path: Path) -> None: + """ + Verifies that OpenpyxlBackend.extract_print_areas reads an openpyxl workbook's print area and returns the corresponding ranges (1-based rows, 0-based columns) keyed by sheet name. + + Creates an in-memory workbook with a single sheet named "Sheet1", sets its print area to "A1:B2", saves and loads it via OpenpyxlBackend, then asserts the sheet is present, has at least one area, and that the first area's r1 and c1 are 1 and 0 respectively. 
+ """ wb = Workbook() ws = wb.active ws.title = "Sheet1" @@ -178,6 +198,11 @@ def test_openpyxl_backend_extract_print_areas(tmp_path: Path) -> None: def test_openpyxl_backend_extract_print_areas_returns_empty_on_error( monkeypatch: MonkeyPatch, tmp_path: Path ) -> None: + """ + Ensure OpenpyxlBackend.extract_print_areas returns an empty dict when the workbook loader raises an error. + + Verifies that the backend handles errors from the underlying workbook opening function by returning an empty mapping of print areas. + """ def _raise(*_args: object, **_kwargs: object) -> None: raise RuntimeError("boom") @@ -226,31 +251,72 @@ def test_com_backend_parse_print_area_range_invalid() -> None: class _Location: def __init__(self, row: int | None = None, col: int | None = None) -> None: + """ + Initialize the location with row and column values. + + Parameters: + row (int | None): Row index or None. + col (int | None): Column index or None. + """ self.Row = row self.Column = col class _BreakItem: def __init__(self, row: int | None = None, col: int | None = None) -> None: + """ + Initialize the break item with an optional sheet location. + + Parameters: + row (int | None): Row index (1-based) for the location, or None if unspecified. + col (int | None): Column index (1-based) for the location, or None if unspecified. + """ self.Location = _Location(row=row, col=col) class _Breaks: def __init__(self, items: list[_BreakItem]) -> None: + """ + Initialize the Breaks collection from a list of break items. + + Parameters: + items (list[_BreakItem]): Sequence of `_BreakItem` instances representing page break entries; ordering corresponds to 1-based access via `Item`. + """ self._items = items self.Count = len(items) def Item(self, index: int) -> _BreakItem: + """ + Return the break item at the given 1-based position. + + Parameters: + index (int): 1-based position of the break to retrieve. + + Returns: + _BreakItem: The break item at the specified position. 
+ """ return self._items[index - 1] class _RangeRows: def __init__(self, count: int) -> None: + """ + Initialize the rows stub of a fake range with a fixed count. + + Parameters: + count (int): Value exposed through the stub's `Count` attribute (the number of rows the fake range reports). + """ self.Count = count class _RangeCols: def __init__(self, count: int) -> None: + """ + Initialize the columns stub of a fake range with a fixed count. + + Parameters: + count (int): Value exposed through the stub's `Count` attribute (the number of columns the fake range reports). + """ self.Count = count @@ -271,6 +337,16 @@ class _PageSetup: class _SheetApi: def __init__(self) -> None: + """ + Initialize a fake sheet API used by COM backend tests with default page and range state. + + Creates default attributes: + - DisplayPageBreaks set to False. + - PageSetup populated with a default PrintArea. + - UsedRange populated with a default Address. + - HPageBreaks containing one horizontal break at row 2. + - VPageBreaks containing one vertical break at column 2. + """ self.DisplayPageBreaks = False self.PageSetup = _PageSetup() self.UsedRange = _UsedRange() @@ -278,6 +354,15 @@ def __init__(self) -> None: self.VPageBreaks = _Breaks([_BreakItem(col=2)]) def Range(self, _addr: str) -> _Range: + """ + Return a stub range regardless of the requested address. + + Parameters: + _addr (str): Excel-style address or range string (e.g., "A1", "A1:B2", or "Sheet1!A1:B2"); ignored by this stub. + + Returns: + _Range: A new default `_Range` stub. + """ return _Range() @@ -285,11 +370,21 @@ class _Sheet: name = "Sheet1" def __init__(self) -> None: + """ + Initialize a mock sheet and attach its API. + + Sets the `api` attribute to a new `_SheetApi` instance used by tests to simulate a sheet's COM-like API. + """ self.api = _SheetApi() class _DummyWorkbook: def __init__(self) -> None: + """ + Initialize a dummy workbook containing a single default sheet. 
+ + The instance provides a `sheets` attribute set to a list with one `_Sheet` object. + """ self.sheets = [_Sheet()] @@ -302,6 +397,15 @@ def test_com_backend_extract_auto_page_breaks_success() -> None: class _RestoreErrorSheetApi: def __init__(self) -> None: + """ + Initialize a mock sheet API with default page, range, and break attributes. + + Creates: + - `_display`: boolean flag for DisplayPageBreaks (defaults to False). + - `PageSetup`: a default page setup object. + - `UsedRange`: a default used-range object. + - `HPageBreaks` and `VPageBreaks`: horizontal and vertical break collections, initialized empty. + """ self._display = False self.PageSetup = _PageSetup() self.UsedRange = _UsedRange() @@ -310,15 +414,39 @@ def __init__(self) -> None: @property def DisplayPageBreaks(self) -> bool: + """ + Get whether displaying page breaks is enabled on the sheet. + + Returns: + `True` if page break display is enabled, `False` otherwise. + """ return self._display @DisplayPageBreaks.setter def DisplayPageBreaks(self, value: bool) -> None: + """ + Set the sheet's DisplayPageBreaks flag. + + Parameters: + value (bool): True to enable display of automatic page breaks. Passing False will trigger a restore failure. + + Raises: + RuntimeError: If `value` is False (restore failed). + """ if value is False: raise RuntimeError("restore failed") self._display = value def Range(self, _addr: str) -> _Range: + """ + Create and return a Range wrapper for the given Excel-style address. + + Parameters: + _addr (str): Excel-style address or range string (e.g., "A1", "A1:B2", or "Sheet1!A1:B2"). + + Returns: + _Range: An object representing the requested worksheet range. + """ return _Range() @@ -326,15 +454,25 @@ class _RestoreErrorSheet: name = "Sheet1" def __init__(self) -> None: + """ + Create a sheet object whose underlying API simulates an error when restoring DisplayPageBreaks. 
+ + This constructor assigns an instance of _RestoreErrorSheetApi to the `api` attribute so tests can exercise code paths that handle failures when restoring page-break state. + """ self.api = _RestoreErrorSheetApi() class _RestoreErrorWorkbook: def __init__(self) -> None: + """ + Create a mock workbook containing a single sheet that raises an error when restoring DisplayPageBreaks. + + The instance exposes a `sheets` attribute set to a list with one _RestoreErrorSheet(), which is used to simulate failures during page-break restoration in tests. + """ self.sheets = [_RestoreErrorSheet()] def test_com_backend_extract_auto_page_breaks_restore_error() -> None: backend = ComBackend(_RestoreErrorWorkbook()) areas = backend.extract_auto_page_breaks() - assert "Sheet1" in areas + assert "Sheet1" in areas \ No newline at end of file diff --git a/tests/backends/test_print_areas_openpyxl.py b/tests/backends/test_print_areas_openpyxl.py index 90b58b2..31362e3 100644 --- a/tests/backends/test_print_areas_openpyxl.py +++ b/tests/backends/test_print_areas_openpyxl.py @@ -14,6 +14,12 @@ def _make_book_with_print_area(path: Path) -> None: + """ + Create a simple Excel workbook with a single sheet named "Sheet1", set its print area to "A1:B2", write "x" to cell A1, save it to the given path, and close the file. + + Parameters: + path (Path): Filesystem path where the workbook will be saved. + """ wb = Workbook() ws = wb.active ws.title = "Sheet1" @@ -61,6 +67,12 @@ class _DefinedArea: class _DefinedNames: def get(self, _name: str) -> _DefinedArea: + """ + Create a default defined area object. + + Returns: + _DefinedArea: A new, empty/default defined-area instance. 
+ """ return _DefinedArea() class _DummyWorkbook: @@ -109,4 +121,4 @@ def test_append_print_areas_skips_invalid_ranges() -> None: areas: PrintAreaData = {} _append_print_areas(areas, "Sheet1", "A1:B2,INVALID") assert "Sheet1" in areas - assert len(areas["Sheet1"]) == 1 + assert len(areas["Sheet1"]) == 1 \ No newline at end of file diff --git a/tests/com/test_render_smoke.py b/tests/com/test_render_smoke.py index d85ec52..6557a98 100644 --- a/tests/com/test_render_smoke.py +++ b/tests/com/test_render_smoke.py @@ -37,6 +37,11 @@ def test_render_smoke_pdf_and_png(tmp_path: Path) -> None: def test_render_multiple_print_ranges_images(tmp_path: Path) -> None: + """ + Verify that processing a workbook with multiple print ranges across four sheets produces an images directory containing exactly four PNG files. + + Uses the test asset 'assets/multiple_print_ranges_4sheets.xlsx', runs process_excel with image output enabled, and asserts the generated images directory exists and contains four .png images. + """ xlsx = ( Path(__file__).resolve().parents[1] / "assets" @@ -55,4 +60,4 @@ def test_render_multiple_print_ranges_images(tmp_path: Path) -> None: images_dir = out_json.parent / f"{out_json.stem}_images" images = list(images_dir.glob("*.png")) assert images_dir.exists() - assert len(images) == 4 + assert len(images) == 4 \ No newline at end of file diff --git a/tests/core/test_cells_utils.py b/tests/core/test_cells_utils.py index 69bf9da..9a03248 100644 --- a/tests/core/test_cells_utils.py +++ b/tests/core/test_cells_utils.py @@ -71,6 +71,11 @@ def test_detect_tables_openpyxl_respects_table_params( def test_normalize_formula_value_prefers_array_text() -> None: + """ + Verify that _normalize_formula_value prefers an array-like object's text and treats an empty string as no formula. + + Asserts that an object with a `text` attribute is converted to a formula string prefixed with '=' (e.g., "=SUM(A1:A3)"), and that an empty string is normalized to None. 
+ """ class _ArrayFormulaLike: text = "SUM(A1:A3)" @@ -143,6 +148,16 @@ class _DummySheet: used_range = _DummyUsedRange() def range(self, _start: object, _end: object) -> _DummyRange: + """ + Return a new _DummyRange representing a requested cell range. + + Parameters: + _start (object): Start coordinate or cell reference for the range request (ignored by this dummy implementation). + _end (object): End coordinate or cell reference for the range request (ignored by this dummy implementation). + + Returns: + _DummyRange: A fresh _DummyRange instance corresponding to the requested range. + """ return _DummyRange() class _DummyWorkbook: @@ -154,4 +169,4 @@ class _DummyWorkbook: assert sheet.formulas_map == { "=A1": [(1, 0)], "=SUM(A1)": [(2, 0)], - } + } \ No newline at end of file diff --git a/tests/core/test_mode_output.py b/tests/core/test_mode_output.py index fbcfdaa..202a87a 100644 --- a/tests/core/test_mode_output.py +++ b/tests/core/test_mode_output.py @@ -30,6 +30,11 @@ def _make_basic_book(path: Path) -> None: def _ensure_excel() -> None: + """ + Ensure Excel COM is available for tests and skip the current test if it is not. + + If the SKIP_COM_TESTS environment variable is set, this function skips the test. Otherwise it tries to start a hidden xlwings App and quits it; if starting the App fails, the function skips the test due to unavailable Excel COM. 
+ """ if os.getenv("SKIP_COM_TESTS"): pytest.skip("SKIP_COM_TESTS is set; skipping Excel-dependent test.") try: @@ -190,4 +195,4 @@ def test_CLI_defaults_to_stdout(tmp_path: Path) -> None: ] result = subprocess.run(cmd, capture_output=True, text=True) assert result.returncode == 0 - assert '"book_name": "book.xlsx"' in result.stdout + assert '"book_name": "book.xlsx"' in result.stdout \ No newline at end of file diff --git a/tests/core/test_pipeline.py b/tests/core/test_pipeline.py index 9596a76..d12dde3 100644 --- a/tests/core/test_pipeline.py +++ b/tests/core/test_pipeline.py @@ -223,6 +223,13 @@ def test_resolve_extraction_inputs_warns_on_xls_formulas( calls: list[str] = [] def _warn_once(key: str, message: str) -> None: + """ + Record a warning key in the shared `calls` list while ignoring the message. + + Parameters: + key (str): Identifier for the warning; appended to the module-level `calls` list. + message (str): Ignored placeholder kept for compatibility with expected callback signature. + """ calls.append(key) _ = message @@ -393,6 +400,16 @@ def _fake( include_default_background: bool, ignore_colors: set[str] | None, ) -> object: + """ + Provide a placeholder colors map for testing that is always empty. + + Parameters: + include_default_background (bool): Accepted for signature compatibility; has no effect on the returned value. + ignore_colors (set[str] | None): Accepted for signature compatibility; has no effect on the returned value. + + Returns: + WorkbookColorsMap: An empty colors map with no sheets. + """ _ = _backend _ = include_default_background _ = ignore_colors @@ -427,6 +444,11 @@ def _fake_com( include_default_background: bool, ignore_colors: set[str] | None, ) -> None: + """ + No-op placeholder that simulates a COM backend extraction step without producing any side effects. + + This function accepts a COM backend and related flags but intentionally performs no operations; it is used in tests as a stub implementation. 
+ """ _ = _backend _ = include_default_background _ = ignore_colors @@ -438,6 +460,16 @@ def _fake_openpyxl( include_default_background: bool, ignore_colors: set[str] | None, ) -> object: + """ + Return an empty WorkbookColorsMap regardless of inputs. + + Parameters: + include_default_background (bool): Ignored; present for signature compatibility. + ignore_colors (set[str] | None): Ignored; present for signature compatibility. + + Returns: + WorkbookColorsMap: A colors map with no sheets. + """ _ = _backend _ = include_default_background _ = ignore_colors @@ -468,6 +500,12 @@ def test_step_extract_auto_page_breaks_com_sets_data( tmp_path: Path, monkeypatch: MonkeyPatch ) -> None: def _fake(_: ComBackend) -> dict[str, list[PrintArea]]: + """ + Return a stub mapping of sheet names to print areas containing a single 1x1 print area for "Sheet1". + + Returns: + dict[str, list[PrintArea]]: Mapping where "Sheet1" maps to a list with one PrintArea covering row 1, column 0 to row 1, column 0. + """ return {"Sheet1": [PrintArea(r1=1, c1=0, r2=1, c2=0)]} monkeypatch.setattr(ComBackend, "extract_auto_page_breaks", _fake) @@ -502,12 +540,29 @@ def _fake_colors( include_default_background: bool, ignore_colors: set[str] | None, ) -> object: + """ + Return a fake workbook colors map used by tests. + + Parameters: + _backend (OpenpyxlBackend): Ignored backend parameter retained for signature compatibility. + include_default_background (bool): Whether the default background color would be included (ignored). + ignore_colors (set[str] | None): Set of color names to ignore (ignored). + + Returns: + object: A preconstructed colors map object used by tests. + """ _ = _backend _ = include_default_background _ = ignore_colors return colors_map def _fake_formulas(_: OpenpyxlBackend) -> object: + """ + Return the pre-captured formulas_map object. + + Returns: + The pre-captured `formulas_map` object. 
+ """ return formulas_map monkeypatch.setattr(OpenpyxlBackend, "extract_colors_map", _fake_colors) @@ -539,6 +594,12 @@ def test_step_extract_formulas_map_openpyxl_skips_on_failure( tmp_path: Path, monkeypatch: MonkeyPatch, caplog: "pytest.LogCaptureFixture" ) -> None: def _raise(_: OpenpyxlBackend) -> object: + """ + Always raises a RuntimeError with the message "boom". + + Raises: + RuntimeError: always raised with message "boom". + """ raise RuntimeError("boom") monkeypatch.setattr(OpenpyxlBackend, "extract_formulas_map", _raise) @@ -569,6 +630,12 @@ def test_step_extract_formulas_map_com_skips_on_failure( tmp_path: Path, monkeypatch: MonkeyPatch, caplog: "pytest.LogCaptureFixture" ) -> None: def _raise(_: ComBackend) -> object: + """ + Always raises a RuntimeError with message "boom". + + Raises: + RuntimeError: Always raised by this helper. + """ raise RuntimeError("boom") monkeypatch.setattr(ComBackend, "extract_formulas_map", _raise) @@ -631,6 +698,16 @@ def test_step_extract_shapes_com_sets_data( shapes_data = {"Sheet1": [object()]} def _fake(_: object, *, mode: str) -> dict[str, list[object]]: + """ + Provide a stub that supplies the module-level `shapes_data` mapping. + + Parameters: + _ (object): Placeholder positional argument; ignored. + mode (str): Mode selector; ignored. + + Returns: + dict[str, list[object]]: Mapping of sheet names to lists of shape objects from `shapes_data`. + """ _ = mode return shapes_data @@ -660,11 +737,26 @@ def test_step_extract_charts_com_sets_data( charts = [object()] def _fake(_: object, *, mode: str) -> list[object]: + """ + Return the captured charts list. + + Parameters: + mode (str): Ignored; accepted for compatibility with callers. + + Returns: + list[object]: The charts list captured from the enclosing scope. + """ _ = mode return charts class _Sheet: def __init__(self, name: str) -> None: + """ + Initialize the instance with a display name. + + Parameters: + name (str): The name to assign to the instance. 
+ """ self.name = name class _Workbook: @@ -694,6 +786,11 @@ def test_step_extract_print_areas_com_skips_when_present( tmp_path: Path, monkeypatch: MonkeyPatch ) -> None: def _raise(_: ComBackend) -> object: + """ + Raise a RuntimeError indicating this code path must not be invoked. + + This function always raises RuntimeError("should not be called"). + """ raise RuntimeError("should not be called") monkeypatch.setattr(ComBackend, "extract_print_areas", _raise) @@ -721,6 +818,12 @@ def test_step_extract_print_areas_com_sets_data( tmp_path: Path, monkeypatch: MonkeyPatch ) -> None: def _fake(_: ComBackend) -> dict[str, list[PrintArea]]: + """ + Return a stub mapping of sheet names to print areas containing a single 1x1 print area for "Sheet1". + + Returns: + dict[str, list[PrintArea]]: Mapping where "Sheet1" maps to a list with one PrintArea covering row 1, column 0 to row 1, column 0. + """ return {"Sheet1": [PrintArea(r1=1, c1=0, r2=1, c2=0)]} monkeypatch.setattr(ComBackend, "extract_print_areas", _fake) @@ -754,6 +857,16 @@ def _fake_com( include_default_background: bool, ignore_colors: set[str] | None, ) -> object: + """ + Return a colors map object suitable for use as a COM backend response. + + Parameters: + include_default_background (bool): If true, the returned colors map should include the default background color. + ignore_colors (set[str] | None): Optional set of color identifiers to exclude from the returned map; `None` means no colors are excluded. + + Returns: + object: A colors map representing workbook-level color mappings. + """ _ = _backend _ = include_default_background _ = ignore_colors @@ -765,6 +878,12 @@ def _raise( include_default_background: bool, ignore_colors: set[str] | None, ) -> object: + """ + Placeholder backend sentinel that always raises a RuntimeError when invoked. + + Raises: + RuntimeError: Always raised with message "should not be called". 
+ """ _ = _backend _ = include_default_background _ = ignore_colors @@ -795,6 +914,16 @@ def test_run_com_pipeline_executes_steps(tmp_path: Path) -> None: calls: list[str] = [] def _step(_: ExtractionInputs, artifacts: ExtractionArtifacts, __: object) -> None: + """ + Test pipeline step that simulates shape extraction. + + Sets artifacts.shape_data to a mapping for "Sheet1" containing a single Shape and records invocation by appending "called" to the outer `calls` list. + + Parameters: + _ (ExtractionInputs): Unused extraction inputs placeholder. + artifacts (ExtractionArtifacts): Artifacts object to populate with shape data. + __ (object): Unused context placeholder. + """ calls.append("called") artifacts.shape_data = {"Sheet1": [Shape(id=1, text="", l=0, t=0)]} @@ -823,29 +952,87 @@ def test_run_extraction_pipeline_com_success( ) -> None: class _Sheet: def __init__(self, name: str) -> None: + """ + Initialize the instance with a display name. + + Parameters: + name (str): The name to assign to the instance. + """ self.name = name class _Sheets: def __init__(self) -> None: + """ + Initialize the object with a single default sheet named "Sheet1". + + Creates the internal mapping `self._sheets` and populates it with one `_Sheet` instance keyed by "Sheet1". + """ self._sheets = {"Sheet1": _Sheet("Sheet1")} def __getitem__(self, name: str) -> _Sheet: + """ + Access a worksheet by its name. + + Parameters: + name (str): The name of the sheet to retrieve. + + Returns: + _Sheet: The sheet object associated with `name`. + + Raises: + KeyError: If no sheet with the given name exists. + """ return self._sheets[name] class _Workbook: sheets = _Sheets() def _pre_step(_: ExtractionInputs, artifacts: ExtractionArtifacts) -> None: + """ + Populate artifacts with default minimal cell and merged-cell data for a single sheet. + + Parameters: + _ (ExtractionInputs): Unused extraction inputs placeholder. 
+ artifacts (ExtractionArtifacts): Mutable extraction artifacts that will be updated with + `cell_data` set to a single row for "Sheet1" and `merged_cell_data` set to an empty list + for "Sheet1". + """ artifacts.cell_data = {"Sheet1": [CellRow(r=1, c={"0": "A"})]} artifacts.merged_cell_data = {"Sheet1": []} def _fake_plan(_: ExtractionInputs) -> PipelinePlan: + """ + Create a fixed PipelinePlan for tests that forces COM usage and provides a single pre-COM step. + + Parameters: + _ (ExtractionInputs): Ignored input; present to match the PipelinePlan factory signature. + + Returns: + PipelinePlan: A plan with `pre_com_steps` set to a list containing `_pre_step`, `com_steps` empty, and `use_com` set to `True`. + """ return PipelinePlan(pre_com_steps=[_pre_step], com_steps=[], use_com=True) def _fake_detect_tables(_: object) -> list[str]: + """ + Provide a detector that always reports no table ranges. + + The input workbook-like object is ignored. + + Returns: + list[str]: An empty list of table range identifiers. + """ return [] def _fake_workbook(_: Path) -> object: + """ + Provide a context manager that yields a lightweight fake workbook for tests. + + Parameters: + _ (Path): Ignored file path parameter retained to match the real backend signature. + + Returns: + object: A context manager whose `__enter__` returns a new `_Workbook` instance and whose `__exit__` does not suppress exceptions (returns `None`). 
+ """ class _Context: def __enter__(self) -> _Workbook: return _Workbook() @@ -886,4 +1073,4 @@ def __exit__( result = run_extraction_pipeline(inputs) assert result.state.com_attempted is True assert result.state.com_succeeded is True - assert "Sheet1" in result.workbook.sheets + assert "Sheet1" in result.workbook.sheets \ No newline at end of file diff --git a/tests/core/test_pipeline_fallbacks.py b/tests/core/test_pipeline_fallbacks.py index 5099c87..322d9bc 100644 --- a/tests/core/test_pipeline_fallbacks.py +++ b/tests/core/test_pipeline_fallbacks.py @@ -51,6 +51,11 @@ def test_pipeline_fallback_skip_com_tests( def test_pipeline_fallback_com_unavailable( monkeypatch: MonkeyPatch, tmp_path: Path ) -> None: + """ + Verifies that the extraction pipeline falls back when COM access is unavailable. + + Creates a basic workbook, forces the COM-access entry point to raise, runs the extraction pipeline, and asserts that the pipeline records a fallback due to COM being unavailable (`FallbackReason.COM_UNAVAILABLE`), did not attempt COM (`com_attempted is False`), and that the resulting sheet "Sheet1" exists, contains rows, and has no shapes or charts. + """ path = tmp_path / "book.xlsx" _make_basic_book(path) monkeypatch.delenv("SKIP_COM_TESTS", raising=False) @@ -123,4 +128,4 @@ def _raise( sheet = result.workbook.sheets["Sheet1"] assert sheet.shapes == [] assert sheet.charts == [] - assert sheet.rows + assert sheet.rows \ No newline at end of file diff --git a/tests/engine/test_engine.py b/tests/engine/test_engine.py index 084ee12..3725036 100644 --- a/tests/engine/test_engine.py +++ b/tests/engine/test_engine.py @@ -38,6 +38,19 @@ def fake_extract( include_merged_cells: bool | None = None, include_merged_values_in_rows: bool = True, ) -> WorkbookData: + """ + Test helper that simulates workbook extraction for unit tests. 
+ + Records the received `mode` and `include_print_areas` into the outer `called` mapping and returns a minimal WorkbookData whose `book_name` is the input path's filename and whose `sheets` is empty. + + Parameters: + path (Path): Path to the workbook; its filename is used for the returned WorkbookData.book_name. + mode (str): Extraction mode passed through and recorded. + include_print_areas (bool): Whether print areas were requested; the value is recorded in `called`. + + Returns: + WorkbookData: A WorkbookData instance with `book_name` set to path.name and an empty `sheets` mapping. + """ called["mode"] = mode called["include_print_areas"] = include_print_areas return WorkbookData(book_name=path.name, sheets={}) @@ -280,4 +293,4 @@ def fake_images(file_path: Path, images_dir: Path, *, dpi: int) -> None: assert calls["pdf_path"].suffix == ".pdf" assert isinstance(calls["images_dir"], Path) assert calls["images_dir"].name.endswith("_images") - assert calls["dpi"] == 144 + assert calls["dpi"] == 144 \ No newline at end of file diff --git a/tests/models/test_models_export.py b/tests/models/test_models_export.py index ab41ad1..2ea52c2 100644 --- a/tests/models/test_models_export.py +++ b/tests/models/test_models_export.py @@ -22,6 +22,12 @@ def _sheet() -> SheetData: + """ + Create a sample SheetData containing one row, no shapes or charts, and a single table candidate. + + Returns: + SheetData: A SheetData instance with one CellRow (r=1, c={"0": "A"}), empty shapes and charts lists, and table_candidates set to ["A1:B2"]. 
+ """ return SheetData( rows=[CellRow(r=1, c={"0": "A"})], shapes=[], @@ -155,4 +161,4 @@ def test_sheet_json_includes_merged_cells_schema() -> None: ) data = json.loads(sheet.to_json()) assert data["merged_cells"]["schema"] == ["r1", "c1", "r2", "c2", "v"] - assert data["merged_cells"]["items"][0] == [1, 0, 1, 1, "merged"] + assert data["merged_cells"]["items"][0] == [1, 0, 1, 1, "merged"] \ No newline at end of file diff --git a/tests/render/test_render_init.py b/tests/render/test_render_init.py index 75731c6..3285144 100644 --- a/tests/render/test_render_init.py +++ b/tests/render/test_render_init.py @@ -583,6 +583,12 @@ def test_extract_print_areas_handles_exception() -> None: class _PageSetup: @property def PrintArea(self) -> str: + """ + Simulate accessing a worksheet's PrintArea and always raise an error to emulate a failure. + + Raises: + RuntimeError: Always raised to simulate an error when retrieving the PrintArea. + """ raise RuntimeError("boom") class _SheetApi: @@ -598,13 +604,33 @@ def test_iter_sheet_apis_prefers_worksheets_collection() -> None: class _WsApi: def __init__(self, name: str) -> None: + """ + Initialize the FakeSheet with the given Excel sheet name. + + Parameters: + name (str): The sheet's name to assign to the object's `Name` attribute. + """ self.Name = name class _Worksheets: def __init__(self) -> None: + """ + Initialize the fake PDF document stub. + + Sets the `Count` attribute to 2 to emulate a document with two pages. + """ self.Count = 2 def Item(self, index: int) -> _WsApi: + """ + Return a worksheet API stub for the sheet at the given index. + + Parameters: + index (int): One-based index of the worksheet within the workbook. + + Returns: + _WsApi: A worksheet API stub corresponding to the sheet at `index`. 
+ """ return _WsApi(f"Sheet{index}") class _Api: @@ -623,6 +649,12 @@ def test_export_pdf_propagates_render_error( tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: def _raise() -> xw.App: + """ + Always raises a RenderError to simulate failure when obtaining an Excel application. + + Raises: + RenderError: Always raised with the message "boom". + """ raise RenderError("boom") monkeypatch.setattr(render, "_require_excel_app", _raise) @@ -649,9 +681,24 @@ class _SheetApi: pass def _fake_iter(_: xw.Book) -> list[tuple[int, str, _SheetApi]]: + """ + Return a single-item list that mimics iterating workbook sheets for tests. + + Returns: + A list with one tuple (index, sheet name, sheet API stub): (0, "Sheet1", _SheetApi()). + """ return [(0, "Sheet1", _SheetApi())] def _fake_extract(_: _SheetApi) -> list[str]: + """ + Provide two fake print-area ranges for testing. + + Parameters: + _ (_SheetApi): Ignored sheet API placeholder. + + Returns: + list[str]: Two print-area ranges: "A1:B2" and "C3:D4". + """ return ["A1:B2", "C3:D4"] monkeypatch.setattr(render, "_iter_sheet_apis", _fake_iter) @@ -678,10 +725,25 @@ def test_export_sheet_pdf_skips_invalid_print_area(tmp_path: Path) -> None: class _BadPageSetup: @property def PrintArea(self) -> str: + """ + Represents the worksheet's PrintArea setting as an Excel range string. + + Returns: + str: The PrintArea range (e.g., "A1:B2"). + """ return "A1:B2" @PrintArea.setter def PrintArea(self, _value: object) -> None: + """ + Simulated setter for PrintArea that always fails. + + Parameters: + _value (object): Ignored; the provided value is not used because the setter always raises. + + Raises: + RuntimeError: Always raised with the message "bad". 
+ """ raise RuntimeError("bad") class _SheetApi: @@ -690,6 +752,14 @@ class _SheetApi: def ExportAsFixedFormat( self, _file_format: int, _output_path: str, *args: object, **kwargs: object ) -> None: + """ + Simulate exporting a workbook/sheet to a fixed-format file by writing a minimal fake PDF header to the given path. + + Parameters: + _file_format (int): Ignored numeric format indicator. + _output_path (str): Filesystem path where the fake export file will be written. + *args, **kwargs: Additional arguments are accepted and ignored. + """ _ = args _ = kwargs @@ -730,6 +800,23 @@ def _fake_render( _dpi: int, _use_subprocess: bool, ) -> list[Path]: + """ + Simulates rendering a PDF sheet to image files for tests. + + On the first invocation this function returns an empty list to simulate a transient empty render result; on subsequent invocations it returns a single Path inside output_dir named "{sheet_index+1:02d}_{safe_name}.png". + + Parameters: + _pdfium: Ignored in the fake implementation (kept for signature compatibility). + _pdf_path: Ignored in the fake implementation (kept for signature compatibility). + output_dir (Path): Directory where the fake image path is located. + sheet_index (int): Zero-based index of the sheet; used to build the filename prefix. + safe_name (str): Sanitized sheet name used in the filename. + _dpi: Ignored in the fake implementation (kept for signature compatibility). + _use_subprocess: Ignored in the fake implementation (kept for signature compatibility). + + Returns: + list[Path]: Empty list on the first call, otherwise a list containing one Path pointing to the fake PNG file. + """ calls.append(1) if len(calls) == 1: return [] @@ -772,15 +859,35 @@ def test_export_sheet_pdf_does_not_swallow_export_errors(tmp_path: Path) -> None class _FlakyPageSetup: def __init__(self) -> None: + """ + Initialize a PageSetup-like test stub with a default print area and a setter call counter. 
+ + The instance starts with `_print_area` set to "A1" and `_set_calls` set to 0 to track how many times the print area setter has been invoked. + """ self._print_area: object = "A1" self._set_calls = 0 @property def PrintArea(self) -> object: + """ + Retrieve the current PrintArea value from the PageSetup stub. + + Returns: + print_area (object): The stored PrintArea value (typically a string) or whatever was set on the stub. + """ return self._print_area @PrintArea.setter def PrintArea(self, value: object) -> None: + """ + Set the PrintArea value on this stub PageSetup instance. + + Parameters: + value (object): The print area value to assign. + + Raises: + RuntimeError: If the setter is invoked more than once (simulates a restore failure). + """ if self._set_calls >= 1: raise RuntimeError("restore failed") self._print_area = value @@ -794,6 +901,12 @@ class _ExplodingSheetApi: def ExportAsFixedFormat( self, file_format: int, output_path: str, *args: object, **kwargs: object ) -> None: + """ + Simulate exporting to a fixed format; this stub always raises an export error. + + Raises: + RuntimeError: with message "export failed" when invoked. + """ _ = file_format _ = output_path _ = args @@ -807,4 +920,4 @@ def ExportAsFixedFormat( pdf_path, ignore_print_areas=False, print_area="A1:B2", - ) + ) \ No newline at end of file diff --git a/tests/utils.py b/tests/utils.py index c85b1df..8ed8bc0 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -18,17 +18,18 @@ def parametrize( | None = None, scope: Literal["session", "package", "module", "class", "function"] | None = None, ) -> Callable[[Callable[P, R]], Callable[P, R]]: - """Type-safe wrapper around pytest.mark.parametrize. - - Args: - argnames: Parameter names for the parametrized test. - argvalues: Parameter values for each test case. - indirect: Whether to treat parameters as fixtures. - ids: Optional case IDs or an ID factory. - scope: Optional fixture scope for parametrization. 
- + """ + Return a decorator that parametrizes a test callable with the given argument names and values. + + Parameters: + argnames: One or more parameter names (single string or sequence of strings) to inject into the test callable. + argvalues: An iterable of values or value-tuples to use for each generated test case. + indirect: If True or a sequence of names, treat corresponding parameters as fixtures and resolve them indirectly. + ids: Optional iterable of case identifiers or a callable that produces an identifier for each value. + scope: Optional fixture scope to apply when parameters are used as fixtures ("session", "package", "module", "class", or "function"). + Returns: - Decorator preserving the wrapped callable signature. + decorator: A decorator that applies the specified parametrization to a callable while preserving its signature. """ return cast( Callable[[Callable[P, R]], Callable[P, R]], @@ -39,4 +40,4 @@ def parametrize( ids=ids, scope=scope, ), - ) + ) \ No newline at end of file From 7adf1c17e6008451d20ee8734b5bcf9e34e58f32 Mon Sep 17 00:00:00 2001 From: harumiWeb Date: Fri, 23 Jan 2026 11:00:27 +0900 Subject: [PATCH 10/12] Refactor docstrings and comments for clarity and consistency across multiple modules - Updated docstrings in `src/exstruct/render/__init__.py` to improve formatting and clarity. - Enhanced comments in test files to provide better context and consistency in style. - Removed unnecessary whitespace in various docstrings and comments for cleaner code. - Ensured all docstrings follow a consistent format, including parameter and return descriptions. - Improved readability of test cases by clarifying the purpose and expected outcomes. 
--- scripts/codacy_issues.py | 74 +++++++-------- src/exstruct/__init__.py | 6 +- src/exstruct/core/backends/base.py | 4 +- src/exstruct/core/backends/com_backend.py | 12 +-- .../core/backends/openpyxl_backend.py | 8 +- src/exstruct/core/cells.py | 38 ++++---- src/exstruct/core/integrate.py | 10 +- src/exstruct/core/pipeline.py | 46 ++++----- src/exstruct/core/workbook.py | 6 +- src/exstruct/engine.py | 10 +- src/exstruct/render/__init__.py | 94 ++++++++++--------- tests/backends/test_auto_page_breaks.py | 10 +- tests/backends/test_backends.py | 57 +++++------ tests/backends/test_print_areas_openpyxl.py | 6 +- tests/com/test_render_smoke.py | 4 +- tests/core/test_cells_utils.py | 15 +-- tests/core/test_mode_output.py | 4 +- tests/core/test_pipeline.py | 81 ++++++++-------- tests/core/test_pipeline_fallbacks.py | 4 +- tests/engine/test_engine.py | 8 +- tests/models/test_models_export.py | 4 +- tests/render/test_render_init.py | 48 +++++----- tests/utils.py | 6 +- 23 files changed, 283 insertions(+), 272 deletions(-) diff --git a/scripts/codacy_issues.py b/scripts/codacy_issues.py index 8816d62..42add78 100644 --- a/scripts/codacy_issues.py +++ b/scripts/codacy_issues.py @@ -65,10 +65,10 @@ def get_level_priority(level: str | None) -> int | None: def normalize_provider(value: str) -> str | None: """ Normalize a provider identifier to a supported short code. - + Parameters: value (str): Provider identifier to normalize (expected 'gh', 'gl', or 'bb'). - + Returns: str | None: The provider code ('gh', 'gl', or 'bb') if valid, `None` otherwise. """ @@ -116,7 +116,7 @@ def assert_valid_choice(name: str, value: str, choices: list[str]) -> str: def encode_segment(value: str) -> str: """ URL-encode a URL path segment so it is safe for inclusion in a path. - + Returns: encoded (str): The percent-encoded representation of the input string. 
""" @@ -126,11 +126,11 @@ def encode_segment(value: str) -> str: def build_codacy_url(pathname: str, query: dict[str, str] | None = None) -> str: """ Constructs a full Codacy API URL using the configured base origin and base path. - + Parameters: pathname (str): Pathname to append to the base path (should begin with a forward slash). query (dict[str, str] | None): Optional mapping of query parameter names to values; values are URL-encoded. - + Returns: url (str): The complete URL including query string if `query` is provided. """ @@ -144,13 +144,13 @@ def build_codacy_url(pathname: str, query: dict[str, str] | None = None) -> str: def assert_codacy_url(url: str) -> str: """ Validate that `url` targets the configured Codacy API origin and begins with the `/analysis/` path. - + Parameters: url (str): The full URL to validate. - + Returns: str: The original URL when it is confirmed to target the configured Codacy API origin and start with the `/analysis/` path. - + Raises: ValueError: If the URL does not use the configured Codacy API origin or does not start with the expected `/analysis/` path. """ @@ -167,13 +167,13 @@ def assert_codacy_url(url: str) -> str: def build_repo_issues_url(provider: str, org: str, repo: str, limit: int) -> str: """ Constructs the Codacy API URL to search repository issues for a given provider, organization, repository, and result limit. - + Parameters: provider (str): Provider code (e.g., "gh", "gl", "bb"). org (str): Organization or owner name. repo (str): Repository name. limit (int): Maximum number of results to request. - + Returns: str: A Codacy API URL for the repository issues search endpoint with the `limit` query parameter set. """ @@ -189,7 +189,7 @@ def build_pr_issues_url( ) -> str: """ Constructs the Codacy API URL for fetching issues of a pull request. - + Parameters: provider (str): Provider code (e.g., "gh", "gl", "bb"). org (str): Organization or owner name. 
@@ -197,7 +197,7 @@ def build_pr_issues_url( pr (str): Pull request identifier. limit (int): Maximum number of issues to request. status (str): Issue status filter (e.g., "all", "open", "closed"). - + Returns: str: The Codacy API URL for the pull-request issues endpoint including `status` and `limit` query parameters. """ @@ -211,7 +211,7 @@ def build_pr_issues_url( def get_git_origin_url() -> str | None: """ Get the Git remote "origin" URL for the current repository, or None when it cannot be determined. - + Returns: origin_url (str | None): The remote URL configured for 'origin' if the current directory is inside a Git work tree and the origin URL is available; `None` if not inside a Git repository, if the origin is not set, or on error. """ @@ -250,13 +250,13 @@ class GitRemoteInfo: def parse_git_remote(url: str) -> GitRemoteInfo | None: """ Extract provider, organization, and repository from a Git remote URL. - + Accepts HTTPS (https://host/org/repo[.git]) and SSH (git@host:org/repo[.git]) remote formats. Provider is one of: "gh" for GitHub, "gl" for GitLab, "bb" for Bitbucket, or "unknown" for other hosts. - + Parameters: url (str): Git remote URL to parse. - + Returns: GitRemoteInfo | None: Parsed GitRemoteInfo with fields `provider`, `org`, and `repo`, or `None` if the URL could not be parsed. """ @@ -274,11 +274,11 @@ def parse_git_remote(url: str) -> GitRemoteInfo | None: def is_same_or_subdomain(hostname: str, base_domain: str) -> bool: """ Check whether a hostname is equal to a base domain or is a subdomain of that base domain. - + Parameters: hostname (str): Hostname to test (e.g., "api.example.com"). base_domain (str): Base domain to compare against (e.g., "example.com"). - + Returns: `true` if `hostname` equals `base_domain` or ends with `.` followed by `base_domain`, `false` otherwise. """ @@ -301,15 +301,15 @@ def fetch_json( ) -> dict[str, Any]: """ Fetch and return a JSON object from a validated Codacy API URL. 
- + Parameters: url (str): Codacy API URL; must target the configured Codacy origin and start with the /analysis/ path. method (str): HTTP method to use (e.g., "GET", "POST"). body (dict[str, Any] | None): Optional JSON body for non-GET requests. - + Returns: dict[str, Any]: The parsed JSON response as a dictionary. - + Raises: RuntimeError: On HTTP errors, network errors, invalid JSON, or when the JSON root value is not an object. """ @@ -358,13 +358,13 @@ def fetch_json( def fetch_repo_issues(provider: str, org: str, repo: str, limit: int) -> dict[str, Any]: """ Request Codacy for issues belonging to a repository. - + Parameters: provider (str): Provider code ('gh', 'gl', 'bb') indicating GitHub, GitLab, or Bitbucket. org (str): Organization or owner name. repo (str): Repository name. limit (int): Maximum number of issues to return. - + Returns: dict[str, Any]: Parsed JSON response from the Codacy API containing issue data. """ @@ -377,7 +377,7 @@ def fetch_pr_issues( ) -> dict[str, Any]: """ Retrieve Codacy issues for a specific pull request. - + Parameters: provider (str): Provider code ("gh", "gl", "bb"). org (str): Organization or user name. @@ -385,7 +385,7 @@ def fetch_pr_issues( pr (str): Pull request number or identifier. limit (int): Maximum number of issues to request. status (str): Issue status filter (for example "all", "open", "closed"). - + Returns: dict: Parsed JSON response from the Codacy API. """ @@ -399,17 +399,17 @@ def fetch_pr_issues( def format_for_ai(raw_issues: list[dict[str, Any]], min_level: str) -> list[str]: """ Format Codacy issue records into compact AI-friendly lines filtered by minimum severity. - + Each returned string has the form: " | : | | | ". - + Parameters: raw_issues: List of issue objects returned by the Codacy API (each item may be an issue or contain a `commitIssue` key). min_level: Minimum severity level to include; must be one of the values in LEVELS. 
- + Returns: A list of formatted issue strings matching the format above, including only issues whose severity is at or above `min_level`. - + Raises: ValueError: If `min_level` is not a valid severity level. """ @@ -477,13 +477,13 @@ def apply_git_defaults(args: argparse.Namespace) -> None: def resolve_segments(args: argparse.Namespace) -> tuple[str, str, str | None]: """ Validate CLI org, repo, and optional pr segments and return them. - + Parameters: args (argparse.Namespace): Parsed CLI arguments with attributes `org`, `repo`, and optional `pr`. - + Returns: tuple[str, str, str | None]: A tuple (org, repo, pr) where `pr` is None if not supplied. - + Raises: ValueError: If any segment is empty or contains invalid characters. """ @@ -506,7 +506,7 @@ def build_payload( ) -> dict[str, object]: """ Create a JSON-serializable payload describing the fetched issues and their scope. - + The returned dictionary contains: - scope: "pull_request" when `pr` is set, otherwise "repository". - organization: organization/owner name. @@ -515,7 +515,7 @@ def build_payload( - minLevel: the minimum severity level used to filter issues. - total: the number of issues in `issues`. - issues: list of formatted issue strings. - + Returns: dict[str, object]: Payload ready for JSON serialization with the keys described above. """ @@ -533,9 +533,9 @@ def build_payload( def main() -> int: """ Run the CLI: parse arguments, fetch Codacy issues (repository or pull request), format them for AI consumption, and write a JSON payload to stdout. - + Writes error messages to stderr when validation or fetching fails and prints the final JSON payload to stdout. - + Returns: int: 0 on success, 1 on error. 
""" @@ -597,4 +597,4 @@ def main() -> int: raise SystemExit(main()) except Exception as e: print(str(e), file=sys.stderr) - raise SystemExit(1) from e \ No newline at end of file + raise SystemExit(1) from e diff --git a/src/exstruct/__init__.py b/src/exstruct/__init__.py index 6ad3744..6bdfd75 100644 --- a/src/exstruct/__init__.py +++ b/src/exstruct/__init__.py @@ -91,11 +91,11 @@ def extract(file_path: str | Path, mode: ExtractionMode = "standard") -> WorkbookData: """ Extracts an Excel workbook into a WorkbookData structure. - + Parameters: file_path (str | Path): Path to the workbook file (.xlsx, .xlsm, .xls). mode (ExtractionMode): Extraction detail level. "light" includes cells and table detection only (no COM, shapes/charts empty; print areas via openpyxl). "standard" includes texted shapes, arrows, charts (COM if available) and print areas. "verbose" also includes shape/chart sizes, cell link map, colors map, and formulas map. - + Returns: WorkbookData: Parsed workbook representation containing sheets, rows, shapes, charts, and print areas. """ @@ -383,4 +383,4 @@ def process_excel( print_areas_dir=print_areas_dir, auto_page_breaks_dir=auto_page_breaks_dir, stream=stream, - ) \ No newline at end of file + ) diff --git a/src/exstruct/core/backends/base.py b/src/exstruct/core/backends/base.py index 7ffbdde..7853a36 100644 --- a/src/exstruct/core/backends/base.py +++ b/src/exstruct/core/backends/base.py @@ -44,7 +44,7 @@ def extract_merged_cells(self) -> MergedCellData: def extract_formulas_map(self) -> WorkbookFormulasMap | None: """ Retrieve the workbook's formulas organized by worksheet. - + Returns: WorkbookFormulasMap | None: A mapping of worksheet identifiers to their formulas, or `None` if the backend cannot provide a formulas map. 
- """ \ No newline at end of file + """ diff --git a/src/exstruct/core/backends/com_backend.py b/src/exstruct/core/backends/com_backend.py index 0c1348f..0f7d2a9 100644 --- a/src/exstruct/core/backends/com_backend.py +++ b/src/exstruct/core/backends/com_backend.py @@ -65,11 +65,11 @@ def extract_colors_map( ) -> WorkbookColorsMap | None: """ Extract a workbook colors map using the Excel COM API. - + Parameters: include_default_background (bool): Include the workbook's default background color in the resulting map. ignore_colors (set[str] | None): Optional set of color keys to exclude from the map. - + Returns: WorkbookColorsMap | None: A mapping of workbook color definitions when extraction succeeds, or `None` if COM extraction fails. """ @@ -89,7 +89,7 @@ def extract_colors_map( def extract_formulas_map(self) -> WorkbookFormulasMap | None: """ Extracts the workbook's formulas map using COM. - + Returns: WorkbookFormulasMap or None: The extracted formulas map, or `None` if extraction failed. """ @@ -105,9 +105,9 @@ def extract_formulas_map(self) -> WorkbookFormulasMap | None: def extract_auto_page_breaks(self) -> PrintAreaData: """ Compute auto page-break rectangles for each worksheet using Excel COM. - + For each sheet, determine the sheet's print area (PageSetup.PrintArea or the used range) and split it into sub-rectangles along Excel's horizontal and vertical page breaks; parts that reference a different sheet are ignored. If extraction for a sheet fails, the sheet is skipped and a warning is logged. - + Returns: Mapping from sheet name to a list of PrintArea entries. Each PrintArea describes a rectangular region with `r1` and `r2` as 1-based row indices and `c1` and `c2` as 0-based column indices. 
""" @@ -252,4 +252,4 @@ def _split_csv_respecting_quotes(raw: str) -> list[str]: i += 1 if buf: parts.append("".join(buf).strip()) - return [p for p in parts if p] \ No newline at end of file + return [p for p in parts if p] diff --git a/src/exstruct/core/backends/openpyxl_backend.py b/src/exstruct/core/backends/openpyxl_backend.py index acaba4a..bef9428 100644 --- a/src/exstruct/core/backends/openpyxl_backend.py +++ b/src/exstruct/core/backends/openpyxl_backend.py @@ -104,7 +104,7 @@ def extract_merged_cells(self) -> MergedCellData: def extract_formulas_map(self) -> WorkbookFormulasMap | None: """ Extract a mapping of workbook formulas for each sheet. - + Returns: WorkbookFormulasMap | None: A mapping from sheet name to its formulas, or `None` if extraction fails. """ @@ -119,10 +119,10 @@ def extract_formulas_map(self) -> WorkbookFormulasMap | None: def detect_tables(self, sheet_name: str) -> list[str]: """ Detects table candidate ranges within the specified worksheet. - + Parameters: sheet_name (str): Name of the worksheet to analyze for table candidates. - + Returns: list[str]: Detected table candidate ranges as A1-style range strings; empty list if none are found or detection fails. """ @@ -206,4 +206,4 @@ def _parse_print_area_range(range_str: str) -> tuple[int, int, int, int] | None: bounds = parse_range_zero_based(range_str) if bounds is None: return None - return (bounds.r1, bounds.c1, bounds.r2, bounds.c2) \ No newline at end of file + return (bounds.r1, bounds.c1, bounds.r2, bounds.c2) diff --git a/src/exstruct/core/cells.py b/src/exstruct/core/cells.py index 20e9385..d6d9644 100644 --- a/src/exstruct/core/cells.py +++ b/src/exstruct/core/cells.py @@ -58,10 +58,10 @@ class WorkbookColorsMap: def get_sheet(self, sheet_name: str) -> SheetColorsMap | None: """ Retrieve the SheetColorsMap for a worksheet by name. - + Parameters: sheet_name (str): Name of the worksheet to retrieve. 
- + Returns: SheetColorsMap | None: The sheet's color map if present, `None` otherwise. """ @@ -85,10 +85,10 @@ class WorkbookFormulasMap: def get_sheet(self, sheet_name: str) -> SheetFormulasMap | None: """ Retrieve the formulas map for a worksheet. - + Parameters: sheet_name (str): Name of the worksheet to look up. - + Returns: SheetFormulasMap | None: The sheet's formulas map if present, `None` if the worksheet is not found. """ @@ -133,10 +133,10 @@ def extract_sheet_colors_map( def extract_sheet_formulas_map(file_path: Path) -> WorkbookFormulasMap: """ Extract normalized formula strings from every worksheet in the workbook. - + Parameters: file_path (Path): Path to the Excel workbook to read. - + Returns: WorkbookFormulasMap: Mapping of sheet names to SheetFormulasMap objects. Each SheetFormulasMap contains a mapping from normalized formula strings (each beginning with "=") to a list of cell coordinates (row, column) where that formula occurs. """ @@ -151,10 +151,10 @@ def extract_sheet_formulas_map(file_path: Path) -> WorkbookFormulasMap: def extract_sheet_formulas_map_com(workbook: xw.Book) -> WorkbookFormulasMap: """ Collects and normalizes formulas from every worksheet in an xlwings workbook into per-sheet mappings. - + Parameters: workbook: xlwings Book instance whose sheets will be scanned for formulas. - + Returns: WorkbookFormulasMap: maps sheet names to SheetFormulasMap objects. Each SheetFormulasMap.formulas_map maps a normalized formula string (consistent representation, e.g., beginning with "=") to a list of (row, column) tuples representing cell locations using Excel 1-based indices. """ @@ -195,12 +195,12 @@ def extract_sheet_colors_map_com( ) -> WorkbookColorsMap: """ Extract per-sheet background color maps using the workbook's COM/display-format interfaces. - + Parameters: workbook (xw.Book): xlwings workbook whose sheets will be inspected. 
include_default_background (bool): If true, include default background colors (e.g., white) for cells inside each sheet's used range. ignore_colors (set[str] | None): Optional set of normalized color keys to exclude from results. - + Returns: WorkbookColorsMap: Mapping of sheet names to SheetColorsMap containing detected background color positions for each worksheet. """ @@ -220,12 +220,12 @@ def _extract_sheet_colors( ) -> SheetColorsMap: """ Extract the background color locations present on a single worksheet. - + Parameters: ws (Worksheet): Worksheet to scan. include_default_background (bool): If true, treat cells with the workbook default/background color as having a color key. ignore_colors (set[str] | None): Optional set of color keys to ignore (keys are normalized before comparison). - + Returns: SheetColorsMap: Mapping from normalized color key to a list of cell coordinates where that color appears. Coordinates are tuples (row, col) where `row` is 1-based and `col` is 0-based. """ @@ -254,10 +254,10 @@ def _extract_sheet_colors( def _extract_sheet_formulas(ws: Worksheet) -> SheetFormulasMap: """ Collect normalized formula strings from a worksheet and group their cell coordinates. - + Parameters: ws (Worksheet): Worksheet to scan for formulas. - + Returns: SheetFormulasMap: container with the sheet's name and a mapping from each normalized formula string (prefixed with "=") to a list of cell coordinates as (row, zero-based-column). """ @@ -305,10 +305,10 @@ def _normalize_formula_value(value: object) -> str | None: def _normalize_formula_from_com(value: object) -> str | None: """ Normalize a COM-returned cell formula into a string that begins with '='. - + Parameters: value (object): Raw value returned from COM for a cell's formula. - + Returns: str | None: The input string if it is non-empty and starts with '=', `None` otherwise. 
""" @@ -327,12 +327,12 @@ def _extract_sheet_colors_com( ) -> SheetColorsMap: """ Extract per-sheet background color mapping using COM/DisplayFormat. - + Parameters: sheet (xw.Sheet): xlwings sheet object to inspect. include_default_background (bool): If True, include cells whose background is the workbook default color. ignore_colors (set[str] | None): Optional set of normalized color keys to exclude from the result. - + Returns: SheetColorsMap: Mapping from normalized color key (hex/theme/index canonical form) to a list of cell coordinates where that color appears. Each coordinate is a tuple (row, col) where `row` is the worksheet row number (1-based) and `col` is the zero-based column index. """ @@ -1723,4 +1723,4 @@ def _coerce_numeric_preserve_format(val: str) -> int | float | str: return float(quantized) except (InvalidOperation, Exception): return val - return val \ No newline at end of file + return val diff --git a/src/exstruct/core/integrate.py b/src/exstruct/core/integrate.py index 402dddf..b610b90 100644 --- a/src/exstruct/core/integrate.py +++ b/src/exstruct/core/integrate.py @@ -23,9 +23,9 @@ def extract_workbook( # noqa: C901 ) -> WorkbookData: """ Extract a workbook into a structured WorkbookData representation. - + May fall back to cells+tables extraction if Excel COM automation is unavailable. - + Parameters: file_path (str | Path): Path to the workbook file. mode (Literal['light', 'standard', 'verbose']): Extraction mode that controls detail level. @@ -38,10 +38,10 @@ def extract_workbook( # noqa: C901 include_formulas_map (bool | None): Include a map of cell formulas; `None` uses mode defaults. include_merged_cells (bool | None): Include merged cell ranges; `None` uses mode defaults. include_merged_values_in_rows (bool): Preserve merged cell values in row-wise output. - + Returns: WorkbookData: The extracted workbook representation. - + Raises: ValueError: If `mode` is not one of "light", "standard", or "verbose". 
""" @@ -59,4 +59,4 @@ def extract_workbook( # noqa: C901 include_merged_values_in_rows=include_merged_values_in_rows, ) result = run_extraction_pipeline(inputs) - return result.workbook \ No newline at end of file + return result.workbook diff --git a/src/exstruct/core/pipeline.py b/src/exstruct/core/pipeline.py index 4fc316c..53a4fda 100644 --- a/src/exstruct/core/pipeline.py +++ b/src/exstruct/core/pipeline.py @@ -274,10 +274,10 @@ def resolve_extraction_inputs( def build_pipeline_plan(inputs: ExtractionInputs) -> PipelinePlan: """ Builds a pipeline plan describing which pre-COM and COM extraction steps to run for the given resolved inputs. - + Parameters: inputs (ExtractionInputs): Resolved extraction configuration (including mode and COM/formulas flags). - + Returns: PipelinePlan: Plan containing ordered `pre_com_steps`, ordered `com_steps`, and `use_com` set to true when the pipeline should use COM (when `mode` is not "light" or `use_com_for_formulas` is true). """ @@ -503,7 +503,7 @@ def step_extract_print_areas_openpyxl( ) -> None: """ Extract print areas from the workbook and populate artifacts.print_area_data. - + Parameters: inputs (ExtractionInputs): Pipeline inputs containing the file path and extraction options. artifacts (ExtractionArtifacts): Mutable artifact container; `artifacts.print_area_data` will be set to the extracted print area mapping. @@ -517,9 +517,9 @@ def step_extract_formulas_map_openpyxl( ) -> None: """ Populate artifacts.formulas_map_data by extracting workbook formulas using openpyxl. - + Attempts to extract a WorkbookFormulasMap from the file at inputs.file_path and stores it on artifacts.formulas_map_data. If extraction fails, a warning is logged and artifacts.formulas_map_data is left unchanged. - + Parameters: inputs (ExtractionInputs): Resolved pipeline inputs (provides file_path). artifacts (ExtractionArtifacts): Mutable container to receive the extracted formulas map. 
@@ -539,7 +539,7 @@ def step_extract_colors_map_openpyxl( ) -> None: """ Extract the workbook colors map using openpyxl and store it on the artifacts. - + Sets artifacts.colors_map_data to the colors map extracted from inputs.file_path, respecting inputs.include_default_background and inputs.ignore_colors. """ @@ -612,7 +612,7 @@ def step_extract_auto_page_breaks_com( ) -> None: """ Extract auto page break information from a COM workbook and store it in the artifacts. - + Parameters: inputs (ExtractionInputs): Pipeline inputs that may influence extraction behavior. artifacts (ExtractionArtifacts): Mutable artifact container; updated with extracted data. @@ -626,10 +626,10 @@ def step_extract_formulas_map_com( ) -> None: """ Extract the workbook's formulas map via COM and store it into the artifacts. - + On success assigns the extracted WorkbookFormulasMap to artifacts.formulas_map_data. On failure leaves artifacts.formulas_map_data unchanged and logs a warning. - + Parameters: workbook (xlwings.Book): COM workbook to extract formulas from. """ @@ -673,11 +673,11 @@ def _resolve_sheet_colors_map( ) -> dict[str, list[tuple[int, int]]]: """ Resolve the colors map for a given sheet. - + Parameters: colors_map_data (WorkbookColorsMap | None): Optional workbook-level colors map container. sheet_name (str): Name of the sheet to resolve. - + Returns: dict[str, list[tuple[int, int]]]: Mapping of color keys to lists of (start_col, end_col) intervals for the sheet; empty dict if no colors map is available for the workbook or sheet. """ @@ -694,11 +694,11 @@ def _resolve_sheet_formulas_map( ) -> dict[str, list[tuple[int, int]]]: """ Get the formulas map for a named sheet from a workbook formulas container. - + Parameters: formulas_map_data: Optional workbook formulas map container; may be None. sheet_name: Name of the sheet to resolve formulas for. 
- + Returns: A mapping for the sheet (str -> list of (row, column) tuples) representing formula locations, or an empty dict if no data is available. """ @@ -716,11 +716,11 @@ def _filter_rows_excluding_merged_values( ) -> list[CellRow]: """ Filter out cell values that originate from merged-cell ranges. - + Parameters: rows (list[CellRow]): Extracted rows to filter. merged_cells (list[MergedCellRange]): Merged cell ranges to exclude values from. - + Returns: list[CellRow]: Rows where any cell whose column index falls inside a merged range has been removed. - Rows with no remaining cells are omitted. @@ -832,9 +832,9 @@ def collect_sheet_raw_data( ) -> dict[str, SheetRawData]: """ Collect per-sheet raw extraction data and assemble SheetRawData for each sheet. - + For each sheet in cell_data this returns a SheetRawData containing rows (optionally excluding values contributed by merged cells), shapes, charts (omitted in "light" mode), detected table candidates, print/auto-print areas, per-sheet formulas map, per-sheet colors map, and merged cell ranges. - + Parameters: cell_data (CellData): Extracted cell rows keyed by sheet name. shape_data (ShapeData): Extracted shapes keyed by sheet name. @@ -847,7 +847,7 @@ def collect_sheet_raw_data( auto_page_break_data (PrintAreaData | None): Optional auto page-break areas keyed by sheet name. formulas_map_data (WorkbookFormulasMap | None): Optional per-sheet formulas map to include in SheetRawData. colors_map_data (WorkbookColorsMap | None): Optional per-sheet colors map to include in SheetRawData. - + Returns: dict[str, SheetRawData]: Mapping from sheet name to the assembled SheetRawData. """ @@ -880,10 +880,10 @@ def collect_sheet_raw_data( def run_extraction_pipeline(inputs: ExtractionInputs) -> PipelineResult: """ Execute the configured extraction pipeline and produce the extraction result. - + Parameters: inputs (ExtractionInputs): Resolved pipeline inputs controlling which extraction steps run. 
- + Returns: PipelineResult: Contains the constructed workbook data, collected artifacts, and pipeline execution state (including COM attempt/success and any fallback reason). """ @@ -961,12 +961,12 @@ def build_cells_tables_workbook( ) -> WorkbookData: """ Builds a WorkbookData from available cell rows and detected table candidates to use as a fallback when COM-based extraction is not used or has failed. - + Parameters: inputs (ExtractionInputs): Resolved extraction inputs that control which extra maps and merged-value handling to include. artifacts (ExtractionArtifacts): Collected artifacts produced by pre-COM extraction steps; cell rows and any existing maps are consumed from here. reason (str): Short description of why the fallback is being used (logged for debugging). - + Returns: WorkbookData: A workbook composed from the available per-sheet cell rows, detected table candidates, merged-cell information, and any resolved formulas and colors maps. Shapes and charts are empty in this fallback path; formulas and colors maps are extracted from artifacts or from the Openpyxl backend when requested and not already present. """ @@ -1014,4 +1014,4 @@ def build_cells_tables_workbook( merged_cells=merged_cells, ) raw = WorkbookRawData(book_name=inputs.file_path.name, sheets=sheets) - return build_workbook_data(raw) \ No newline at end of file + return build_workbook_data(raw) diff --git a/src/exstruct/core/workbook.py b/src/exstruct/core/workbook.py index 199eca6..3aa9430 100644 --- a/src/exstruct/core/workbook.py +++ b/src/exstruct/core/workbook.py @@ -21,12 +21,12 @@ def openpyxl_workbook( ) -> Iterator[Any]: """ Open an openpyxl Workbook for temporary use and ensure it is closed on exit. - + Parameters: file_path (Path): Path to the workbook file. data_only (bool): If True, read stored cell values instead of formulas. read_only (bool): If True, open the workbook in optimized read-only mode. 
- + Yields: openpyxl.workbook.workbook.Workbook: The opened workbook instance. """ @@ -114,4 +114,4 @@ def _find_open_workbook(file_path: Path) -> xw.Book | None: except Exception as exc: logger.debug("Failed to inspect open Excel workbooks. (%r)", exc) return None - return None \ No newline at end of file + return None diff --git a/src/exstruct/engine.py b/src/exstruct/engine.py index ef16a89..2788adb 100644 --- a/src/exstruct/engine.py +++ b/src/exstruct/engine.py @@ -263,11 +263,11 @@ def _filter_sheet( ) -> SheetData: """ Return a filtered copy of a SheetData according to the engine's output filters and resolved size/print-area flags. - + Parameters: sheet: The original SheetData to filter. include_auto_override: If not None, overrides the engine's automatic decision for including auto page-break areas; if None, the engine's auto rule is used. - + Returns: A new SheetData where: - rows are kept only if include_rows is enabled; otherwise an empty list. @@ -354,12 +354,12 @@ def extract( ) -> WorkbookData: """ Produce a normalized WorkbookData extracted from the given workbook file. - + Parameters: file_path (str | Path): Path to the .xlsx/.xlsm/.xls file to extract. mode (ExtractionMode | None): Extraction mode to use; if None the engine's configured mode is used. Modes: "light", "standard", "verbose". - + Returns: WorkbookData: Normalized workbook data extracted from the file. 
""" @@ -593,4 +593,4 @@ def process( export_pdf(normalized_file_path, pdf_path) if image: images_dir = pdf_path.parent / f"{pdf_path.stem}_images" - export_sheet_images(normalized_file_path, images_dir, dpi=dpi) \ No newline at end of file + export_sheet_images(normalized_file_path, images_dir, dpi=dpi) diff --git a/src/exstruct/render/__init__.py b/src/exstruct/render/__init__.py index e30dbab..f70df28 100644 --- a/src/exstruct/render/__init__.py +++ b/src/exstruct/render/__init__.py @@ -81,12 +81,12 @@ def export_sheet_images( ) -> list[Path]: """ Export each worksheet in the given Excel workbook to PNG files and return the image paths in workbook order. - + Returns: - paths (list[Path]): Paths to the generated PNG files, ordered by the corresponding worksheets. - + paths (list[Path]): Paths to the generated PNG files, ordered by the corresponding worksheets. + Raises: - RenderError: If export or rendering fails. + RenderError: If export or rendering fails. """ normalized_excel_path = Path(excel_path) normalized_output_dir = Path(output_dir) @@ -114,14 +114,14 @@ def export_sheet_images( def _sanitize_sheet_filename(name: str) -> str: - """ + r""" Create a filesystem-safe filename derived from an Excel sheet name. - + Replaces characters that are not allowed in filenames (\/:*?"<>|) with underscores, trims surrounding whitespace, and returns "sheet" if the result is empty. - + Parameters: name (str): Original sheet name. - + Returns: safe_name (str): Filename-safe string derived from `name`. """ @@ -141,9 +141,9 @@ class _SheetApiProtocol(Protocol): def ExportAsFixedFormat( # noqa: N802 self, file_format: int, output_path: str, *args: object, **kwargs: object - ) -> None: """ - Export the sheet or workbook to a fixed-format file (for example, PDF or XPS). - + ) -> None: + """Export the sheet or workbook to a fixed-format file (for example, PDF or XPS). 
+ Parameters: file_format (int): Excel XlFixedFormatType enum value indicating the output format (e.g., the constant for PDF). output_path (str): Filesystem path where the fixed-format file will be written. @@ -156,9 +156,9 @@ def ExportAsFixedFormat( # noqa: N802 def _iter_sheet_apis(wb: xw.Book) -> list[tuple[int, str, _SheetApiProtocol]]: """ Enumerate workbook sheets and return each sheet's zero-based index, display name, and COM API handle in workbook order. - + If direct COM access to Worksheets is unavailable, falls back to iterating wb.sheets to build the same list. - + Returns: List[tuple[int, str, _SheetApiProtocol]]: Tuples of (zero-based sheet index, sheet name, sheet COM API handle) in workbook order. """ @@ -189,7 +189,7 @@ def _build_sheet_export_plan( ) -> list[tuple[str, _SheetApiProtocol, str | None]]: """ Build an ordered export plan mapping each worksheet to its print areas. - + Each returned tuple is (sheet_name, sheet_api, print_area). The list preserves workbook sheet order; for sheets with no defined print areas `print_area` is `None`, and for sheets with multiple print areas there is one tuple per area. """ plan: list[tuple[str, _SheetApiProtocol, str | None]] = [] @@ -206,12 +206,12 @@ def _build_sheet_export_plan( def _extract_print_areas(sheet_api: _SheetApiProtocol) -> list[str]: """ Extract the sheet's print-area ranges as a list of strings. - + Retrieves the PageSetup.PrintArea value from the provided sheet API, splits it by commas while respecting single-quoted sections, and returns each range as a separate string. If the sheet has no print area or the property is inaccessible, an empty list is returned. - + Parameters: sheet_api (_SheetApiProtocol): Excel sheet API object exposing a `PageSetup.PrintArea` attribute. - + Returns: list[str]: List of print-area range strings in the order they appear, or an empty list if none are defined or on access failure. 
""" @@ -230,12 +230,12 @@ def _extract_print_areas(sheet_api: _SheetApiProtocol) -> list[str]: def _split_csv_respecting_quotes(raw: str) -> list[str]: """ Split a comma-separated string into parts while treating single-quoted sections as atomic. - + This function splits raw on commas that are not inside single quotes. Text enclosed in single quotes is preserved (including internal commas). Two consecutive single quotes inside a quoted section are treated as an escaped single-quote pair. Leading and trailing whitespace is trimmed from each part and empty parts are removed. - + Parameters: raw (str): The input CSV-like string that may contain single-quoted segments. - + Returns: list[str]: A list of non-empty tokens obtained from splitting `raw` by unquoted commas, with surrounding whitespace removed and quoted segments preserved. @@ -275,13 +275,13 @@ def _rename_pages_for_print_area( ) -> list[Path]: """ Rename the given image files so each gets a unique numeric prefix based on a base index and a safe sheet name. - + Parameters: paths (list[Path]): Existing image files for a single sheet or print area (may include per-page suffixes). output_dir (Path): Directory where renamed files will reside. base_index (int): Zero-based starting index used to compute the numeric prefix for each output file. safe_name (str): Filesystem-safe base name to use after the numeric prefix. - + Returns: list[Path]: Paths to the renamed files in the same order as input, each named "{index:02d}_{safe_name}.png". """ @@ -299,12 +299,12 @@ def _rename_pages_for_print_area( def _page_index_from_suffix(stem: str) -> int: """ Extracts a zero-based page index from a filename stem ending with a "_pNN" numeric suffix. - + If the stem ends with "_p" followed by digits, returns that number minus one. If the suffix is missing, non-numeric, or less than 1, returns 0. - + Parameters: stem (str): Filename stem to parse. 
- + Returns: int: Zero-based page index derived from the "_pNN" suffix, or 0 when no valid suffix is present. """ @@ -329,9 +329,9 @@ def _export_sheet_pdf( ) -> None: """ Export the given worksheet to a PDF file, optionally applying a temporary print area. - + If `print_area` is provided, it is applied to the sheet's PageSetup.PrintArea before exporting and restored afterwards. The function attempts to call ExportAsFixedFormat with an IgnorePrintAreas keyword; if that call fails due to an unexpected COM signature, it retries with a minimal argument set. - + Args: sheet_api: COM-like worksheet API exposing `PageSetup` and `ExportAsFixedFormat`. pdf_path (Path): Filesystem path to write the PDF to. @@ -346,13 +346,21 @@ def _export_sheet_pdf( if page_setup is not None: original_print_area = getattr(page_setup, "PrintArea", None) page_setup.PrintArea = print_area - except Exception: + except Exception as exc: + logger.debug("Failed to set PrintArea. (%r)", exc) page_setup = None try: sheet_api.ExportAsFixedFormat( 0, str(pdf_path), IgnorePrintAreas=ignore_print_areas ) except TypeError: + if ignore_print_areas and page_setup is None: + try: + page_setup = getattr(sheet_api, "PageSetup", None) + if page_setup is not None: + page_setup.PrintArea = "" + except Exception as exc: + logger.debug("Failed to clear PrintArea for ignore. (%r)", exc) sheet_api.ExportAsFixedFormat(0, str(pdf_path)) finally: if page_setup is not None and print_area is not None: @@ -365,13 +373,13 @@ def _export_sheet_pdf( def _ensure_pdfium(use_subprocess: bool) -> ModuleType | None: """ Ensure the pypdfium2 dependency is available and return the pdfium module for in-process rendering. - + Parameters: use_subprocess (bool): When True, confirm pypdfium2 is installed for subprocess rendering but do not keep the module in-process; when False, import and return the pdfium module for direct use. 
- + Returns: ModuleType | None: The imported `pdfium` module when `use_subprocess` is False, or `None` when `use_subprocess` is True. - + Raises: MissingDependencyError: If pypdfium2 (and required extras) is not installed. """ @@ -391,7 +399,7 @@ def _export_sheet_images_with_app( ) -> list[Path]: """ Export each worksheet of an Excel workbook to PNG images by exporting sheets to per-sheet PDFs and rendering those PDFs. - + Parameters: excel_path (Path): Path to the source Excel workbook. output_dir (Path): Directory where generated PNGs will be written. @@ -399,7 +407,7 @@ def _export_sheet_images_with_app( dpi (int): Dots per inch used when rasterizing PDF pages. use_subprocess (bool): If True, render PDF pages in a subprocess; otherwise render in-process. pdfium (ModuleType | None): In-process pypdfium2 module when rendering in-process, or None when subprocess rendering is used. - + Returns: list[Path]: Paths to generated PNG images in the order corresponding to the workbook's sheets and print-area splits. """ @@ -471,12 +479,12 @@ def _render_sheet_images( ) -> list[Path]: """ Render a sheet PDF to one or more PNG files using either a subprocess or in-process renderer. - + Returns: - paths (list[Path]): Paths to the generated PNG files in output order. - + paths (list[Path]): Paths to the generated PNG files in output order. + Raises: - RenderError: If in-process rendering is requested but the `pypdfium2` module (`pdfium`) is not provided. + RenderError: If in-process rendering is requested but the `pypdfium2` module (`pdfium`) is not provided. """ if use_subprocess: return _render_pdf_pages_subprocess( @@ -506,15 +514,15 @@ def _normalize_multipage_paths( ) -> list[Path]: """ Assign distinct, ordered filenames for multi-page sheet outputs. - + If `paths` contains a single file, the list is returned unchanged. 
If `paths` contains multiple files, each file is given a unique, numbered filename in `output_dir` using `base_index` and `safe_name` so pages are ordered and do not collide. - + Parameters: paths (list[Path]): Existing file paths for a sheet's rendered pages. output_dir (Path): Directory containing or intended to contain the output files. base_index (int): Zero-based starting index used to compute numeric prefixes for filenames. safe_name (str): Filesystem-safe base name included in the generated filenames. - + Returns: list[Path]: Paths to the resulting files in `output_dir`. When multiple input paths are provided, returned paths reflect the new, uniquely prefixed filenames. """ @@ -526,9 +534,9 @@ def _normalize_multipage_paths( def _use_render_subprocess() -> bool: """ Decide whether PDF-to-PNG rendering should be performed in a subprocess. - + Reads the environment variable EXSTRUCT_RENDER_SUBPROCESS (case-insensitive). Subprocess rendering is disabled when the variable is set to "0" or "false"; if the variable is unset or set to any other value, subprocess rendering is enabled. - + Returns: `true` if subprocess rendering is enabled, `false` otherwise. """ @@ -625,4 +633,4 @@ def _render_pdf_pages_worker( queue.put({"error": str(exc)}) -__all__ = ["export_pdf", "export_sheet_images"] \ No newline at end of file +__all__ = ["export_pdf", "export_sheet_images"] diff --git a/tests/backends/test_auto_page_breaks.py b/tests/backends/test_auto_page_breaks.py index 8c0e9f2..f2cce3e 100644 --- a/tests/backends/test_auto_page_breaks.py +++ b/tests/backends/test_auto_page_breaks.py @@ -18,7 +18,7 @@ def test_extract_passes_auto_page_break_flag( ) -> None: """ Verify that extract_workbook is invoked with include_auto_page_breaks set to True. 
- + Creates a fake extractor that captures the include_auto_page_breaks argument, replaces exstruct.engine.extract_workbook with it, runs ExStructEngine.extract against a dummy workbook path configured to export auto page breaks, and asserts the captured flag is True. @@ -40,15 +40,15 @@ def fake_extract( ) -> WorkbookData: """ Test stub for workbook extraction that records the auto page breaks flag. - + This fake extractor captures the value of `include_auto_page_breaks` in the outer `called` mapping and returns a minimal `WorkbookData` with `book_name` set to the provided path's filename and an empty `sheets` mapping. - + Parameters: path (Path): Filesystem path used to derive the returned `WorkbookData.book_name`. include_auto_page_breaks (bool): Flag whose value is written to `called["include_auto_page_breaks"]`. - + Returns: WorkbookData: A minimal workbook data object with `book_name` set to `path.name` and no sheets. """ @@ -103,4 +103,4 @@ class _DummyWorkbook: sheets = [_FailingSheet()] backend = ComBackend(_DummyWorkbook()) - assert backend.extract_auto_page_breaks() == {} \ No newline at end of file + assert backend.extract_auto_page_breaks() == {} diff --git a/tests/backends/test_backends.py b/tests/backends/test_backends.py index 0046896..ebfd947 100644 --- a/tests/backends/test_backends.py +++ b/tests/backends/test_backends.py @@ -81,7 +81,7 @@ def test_openpyxl_backend_extract_formulas_map_returns_none_on_failure( def fake_formulas_map(file_path: Path) -> object: """ Test helper that always raises a RuntimeError to simulate a failure when extracting a formulas map. - + Raises: RuntimeError: with message "boom". """ @@ -128,10 +128,10 @@ def test_com_backend_extract_formulas_map_returns_none_on_failure( def fake_formulas_map(workbook: object) -> object: """ Test stub that simulates a failure by always raising a RuntimeError. - + Parameters: workbook (object): Workbook-like object (ignored); present to match the real function's signature. 
- + Raises: RuntimeError: Always raised with message "boom". """ @@ -174,7 +174,7 @@ class _DummyWorkbook: def test_openpyxl_backend_extract_print_areas(tmp_path: Path) -> None: """ Verifies that OpenpyxlBackend.extract_print_areas reads an openpyxl workbook's print area and returns the corresponding zero-based ranges keyed by sheet name. - + Creates an in-memory workbook with a single sheet named "Sheet1", sets its print area to "A1:B2", saves and loads it via OpenpyxlBackend, then asserts the sheet is present, has at least one area, and that the first area's r1 and c1 are 1 and 0 respectively. """ wb = Workbook() @@ -200,9 +200,10 @@ def test_openpyxl_backend_extract_print_areas_returns_empty_on_error( ) -> None: """ Ensure OpenpyxlBackend.extract_print_areas returns an empty dict when the workbook loader raises an error. - + Verifies that the backend handles errors from the underlying workbook opening function by returning an empty mapping of print areas. """ + def _raise(*_args: object, **_kwargs: object) -> None: raise RuntimeError("boom") @@ -253,10 +254,10 @@ class _Location: def __init__(self, row: int | None = None, col: int | None = None) -> None: """ Initialize the location with row and column values. - + Parameters: - row (int | None): Row index or None. - col (int | None): Column index or None. + row (int | None): Row index or None. + col (int | None): Column index or None. """ self.Row = row self.Column = col @@ -266,7 +267,7 @@ class _BreakItem: def __init__(self, row: int | None = None, col: int | None = None) -> None: """ Initialize the break item with an optional sheet location. - + Parameters: row (int | None): Row index (1-based) for the location, or None if unspecified. col (int | None): Column index (1-based) for the location, or None if unspecified. @@ -278,7 +279,7 @@ class _Breaks: def __init__(self, items: list[_BreakItem]) -> None: """ Initialize the Breaks collection from a list of break items. 
- + Parameters: items (list[_BreakItem]): Sequence of `_BreakItem` instances representing page break entries; ordering corresponds to 1-based access via `Item`. """ @@ -288,10 +289,10 @@ def __init__(self, items: list[_BreakItem]) -> None: def Item(self, index: int) -> _BreakItem: """ Return the break item at the given 1-based position. - + Parameters: index (int): 1-based position of the break to retrieve. - + Returns: _BreakItem: The break item at the specified position. """ @@ -302,7 +303,7 @@ class _RangeRows: def __init__(self, count: int) -> None: """ Initialize the breaks container with a specified item count. - + Parameters: count (int): Number of break items the container should report via its `Count` attribute. """ @@ -313,7 +314,7 @@ class _RangeCols: def __init__(self, count: int) -> None: """ Initialize the breaks container with a specified item count. - + Parameters: count (int): Number of break items the container should report via its `Count` attribute. """ @@ -339,7 +340,7 @@ class _SheetApi: def __init__(self) -> None: """ Initialize a fake sheet API used by COM backend tests with default page and range state. - + Creates default attributes: - DisplayPageBreaks set to False. - PageSetup populated with a default PrintArea. @@ -356,10 +357,10 @@ def __init__(self) -> None: def Range(self, _addr: str) -> _Range: """ Create and return a Range wrapper for the given Excel-style address. - + Parameters: _addr (str): Excel-style address or range string (e.g., "A1", "A1:B2", or "Sheet1!A1:B2"). - + Returns: _Range: An object representing the requested worksheet range. """ @@ -372,7 +373,7 @@ class _Sheet: def __init__(self) -> None: """ Initialize a mock sheet and attach its API. - + Sets the `api` attribute to a new `_SheetApi` instance used by tests to simulate a sheet's COM-like API. """ self.api = _SheetApi() @@ -382,7 +383,7 @@ class _DummyWorkbook: def __init__(self) -> None: """ Initialize a dummy workbook containing a single default sheet. 
- + The instance provides a `sheets` attribute set to a list with one `_Sheet` object. """ self.sheets = [_Sheet()] @@ -399,7 +400,7 @@ class _RestoreErrorSheetApi: def __init__(self) -> None: """ Initialize a mock sheet API with default page, range, and break attributes. - + Creates: - `_display`: boolean flag for DisplayPageBreaks (defaults to False). - `PageSetup`: a default page setup object. @@ -416,7 +417,7 @@ def __init__(self) -> None: def DisplayPageBreaks(self) -> bool: """ Get whether displaying page breaks is enabled on the sheet. - + Returns: `True` if page break display is enabled, `False` otherwise. """ @@ -426,10 +427,10 @@ def DisplayPageBreaks(self) -> bool: def DisplayPageBreaks(self, value: bool) -> None: """ Set the sheet's DisplayPageBreaks flag. - + Parameters: value (bool): True to enable display of automatic page breaks. Passing False will trigger a restore failure. - + Raises: RuntimeError: If `value` is False (restore failed). """ @@ -440,10 +441,10 @@ def DisplayPageBreaks(self, value: bool) -> None: def Range(self, _addr: str) -> _Range: """ Create and return a Range wrapper for the given Excel-style address. - + Parameters: _addr (str): Excel-style address or range string (e.g., "A1", "A1:B2", or "Sheet1!A1:B2"). - + Returns: _Range: An object representing the requested worksheet range. """ @@ -456,7 +457,7 @@ class _RestoreErrorSheet: def __init__(self) -> None: """ Create a sheet object whose underlying API simulates an error when restoring DisplayPageBreaks. - + This constructor assigns an instance of _RestoreErrorSheetApi to the `api` attribute so tests can exercise code paths that handle failures when restoring page-break state. """ self.api = _RestoreErrorSheetApi() @@ -466,7 +467,7 @@ class _RestoreErrorWorkbook: def __init__(self) -> None: """ Create a mock workbook containing a single sheet that raises an error when restoring DisplayPageBreaks. 
- + The instance exposes a `sheets` attribute set to a list with one _RestoreErrorSheet(), which is used to simulate failures during page-break restoration in tests. """ self.sheets = [_RestoreErrorSheet()] @@ -475,4 +476,4 @@ def __init__(self) -> None: def test_com_backend_extract_auto_page_breaks_restore_error() -> None: backend = ComBackend(_RestoreErrorWorkbook()) areas = backend.extract_auto_page_breaks() - assert "Sheet1" in areas \ No newline at end of file + assert "Sheet1" in areas diff --git a/tests/backends/test_print_areas_openpyxl.py b/tests/backends/test_print_areas_openpyxl.py index 31362e3..e4976f6 100644 --- a/tests/backends/test_print_areas_openpyxl.py +++ b/tests/backends/test_print_areas_openpyxl.py @@ -16,7 +16,7 @@ def _make_book_with_print_area(path: Path) -> None: """ Create a simple Excel workbook with a single sheet named "Sheet1", set its print area to "A1:B2", write "x" to cell A1, save it to the given path, and close the file. - + Parameters: path (Path): Filesystem path where the workbook will be saved. """ @@ -69,7 +69,7 @@ class _DefinedNames: def get(self, _name: str) -> _DefinedArea: """ Create a default defined area object. - + Returns: _DefinedArea: A new, empty/default defined-area instance. 
""" @@ -121,4 +121,4 @@ def test_append_print_areas_skips_invalid_ranges() -> None: areas: PrintAreaData = {} _append_print_areas(areas, "Sheet1", "A1:B2,INVALID") assert "Sheet1" in areas - assert len(areas["Sheet1"]) == 1 \ No newline at end of file + assert len(areas["Sheet1"]) == 1 diff --git a/tests/com/test_render_smoke.py b/tests/com/test_render_smoke.py index 6557a98..b61b1ba 100644 --- a/tests/com/test_render_smoke.py +++ b/tests/com/test_render_smoke.py @@ -39,7 +39,7 @@ def test_render_smoke_pdf_and_png(tmp_path: Path) -> None: def test_render_multiple_print_ranges_images(tmp_path: Path) -> None: """ Verify that processing a workbook with multiple print ranges across four sheets produces an images directory containing exactly four PNG files. - + Uses the test asset 'assets/multiple_print_ranges_4sheets.xlsx', runs process_excel with image output enabled, and asserts the generated images directory exists and contains four .png images. """ xlsx = ( @@ -60,4 +60,4 @@ def test_render_multiple_print_ranges_images(tmp_path: Path) -> None: images_dir = out_json.parent / f"{out_json.stem}_images" images = list(images_dir.glob("*.png")) assert images_dir.exists() - assert len(images) == 4 \ No newline at end of file + assert len(images) == 4 diff --git a/tests/core/test_cells_utils.py b/tests/core/test_cells_utils.py index 9a03248..75460c1 100644 --- a/tests/core/test_cells_utils.py +++ b/tests/core/test_cells_utils.py @@ -73,9 +73,10 @@ def test_detect_tables_openpyxl_respects_table_params( def test_normalize_formula_value_prefers_array_text() -> None: """ Verify that _normalize_formula_value prefers an array-like object's text and treats an empty string as no formula. - + Asserts that an object with a `text` attribute is converted to a formula string prefixed with '=' (e.g., "=SUM(A1:A3)"), and that an empty string is normalized to None. 
""" + class _ArrayFormulaLike: text = "SUM(A1:A3)" @@ -150,13 +151,13 @@ class _DummySheet: def range(self, _start: object, _end: object) -> _DummyRange: """ Return a new _DummyRange representing a requested cell range. - + Parameters: - _start (object): Start coordinate or cell reference for the range request (ignored by this dummy implementation). - _end (object): End coordinate or cell reference for the range request (ignored by this dummy implementation). - + _start (object): Start coordinate or cell reference for the range request (ignored by this dummy implementation). + _end (object): End coordinate or cell reference for the range request (ignored by this dummy implementation). + Returns: - _DummyRange: A fresh _DummyRange instance corresponding to the requested range. + _DummyRange: A fresh _DummyRange instance corresponding to the requested range. """ return _DummyRange() @@ -169,4 +170,4 @@ class _DummyWorkbook: assert sheet.formulas_map == { "=A1": [(1, 0)], "=SUM(A1)": [(2, 0)], - } \ No newline at end of file + } diff --git a/tests/core/test_mode_output.py b/tests/core/test_mode_output.py index 202a87a..ac70782 100644 --- a/tests/core/test_mode_output.py +++ b/tests/core/test_mode_output.py @@ -32,7 +32,7 @@ def _make_basic_book(path: Path) -> None: def _ensure_excel() -> None: """ Ensure Excel COM is available for tests and skip the current test if it is not. - + If the SKIP_COM_TESTS environment variable is set, this function skips the test. Otherwise it tries to start a hidden xlwings App and quits it; if starting the App fails, the function skips the test due to unavailable Excel COM. 
""" if os.getenv("SKIP_COM_TESTS"): @@ -195,4 +195,4 @@ def test_CLI_defaults_to_stdout(tmp_path: Path) -> None: ] result = subprocess.run(cmd, capture_output=True, text=True) assert result.returncode == 0 - assert '"book_name": "book.xlsx"' in result.stdout \ No newline at end of file + assert '"book_name": "book.xlsx"' in result.stdout diff --git a/tests/core/test_pipeline.py b/tests/core/test_pipeline.py index d12dde3..5fe17df 100644 --- a/tests/core/test_pipeline.py +++ b/tests/core/test_pipeline.py @@ -225,7 +225,7 @@ def test_resolve_extraction_inputs_warns_on_xls_formulas( def _warn_once(key: str, message: str) -> None: """ Record a warning key in the shared `calls` list while ignoring the message. - + Parameters: key (str): Identifier for the warning; appended to the module-level `calls` list. message (str): Ignored placeholder kept for compatibility with expected callback signature. @@ -402,11 +402,11 @@ def _fake( ) -> object: """ Provide a placeholder colors map for testing that is always empty. - + Parameters: include_default_background (bool): Accepted for signature compatibility; has no effect on the returned value. ignore_colors (set[str] | None): Accepted for signature compatibility; has no effect on the returned value. - + Returns: WorkbookColorsMap: An empty colors map with no sheets. """ @@ -446,7 +446,7 @@ def _fake_com( ) -> None: """ No-op placeholder that simulates a COM backend extraction step without producing any side effects. - + This function accepts a COM backend and related flags but intentionally performs no operations; it is used in tests as a stub implementation. """ _ = _backend @@ -462,11 +462,11 @@ def _fake_openpyxl( ) -> object: """ Return an empty WorkbookColorsMap regardless of inputs. - + Parameters: include_default_background (bool): Ignored; present for signature compatibility. ignore_colors (set[str] | None): Ignored; present for signature compatibility. - + Returns: WorkbookColorsMap: A colors map with no sheets. 
""" @@ -502,7 +502,7 @@ def test_step_extract_auto_page_breaks_com_sets_data( def _fake(_: ComBackend) -> dict[str, list[PrintArea]]: """ Return a stub mapping of sheet names to print areas containing a single 1x1 print area for "Sheet1". - + Returns: dict[str, list[PrintArea]]: Mapping where "Sheet1" maps to a list with one PrintArea covering row 1, column 0 to row 1, column 0. """ @@ -542,12 +542,12 @@ def _fake_colors( ) -> object: """ Return a fake workbook colors map used by tests. - + Parameters: _backend (OpenpyxlBackend): Ignored backend parameter retained for signature compatibility. include_default_background (bool): Whether the default background color would be included (ignored). ignore_colors (set[str] | None): Set of color names to ignore (ignored). - + Returns: object: A preconstructed colors map object used by tests. """ @@ -559,7 +559,7 @@ def _fake_colors( def _fake_formulas(_: OpenpyxlBackend) -> object: """ Return the pre-captured formulas_map object. - + Returns: The pre-captured `formulas_map` object. """ @@ -596,7 +596,7 @@ def test_step_extract_formulas_map_openpyxl_skips_on_failure( def _raise(_: OpenpyxlBackend) -> object: """ Always raises a RuntimeError with the message "boom". - + Raises: RuntimeError: always raised with message "boom". """ @@ -632,7 +632,7 @@ def test_step_extract_formulas_map_com_skips_on_failure( def _raise(_: ComBackend) -> object: """ Always raises a RuntimeError with message "boom". - + Raises: RuntimeError: Always raised by this helper. """ @@ -700,11 +700,11 @@ def test_step_extract_shapes_com_sets_data( def _fake(_: object, *, mode: str) -> dict[str, list[object]]: """ Provide a stub that supplies the module-level `shapes_data` mapping. - + Parameters: _ (object): Placeholder positional argument; ignored. mode (str): Mode selector; ignored. - + Returns: dict[str, list[object]]: Mapping of sheet names to lists of shape objects from `shapes_data`. 
""" @@ -739,10 +739,10 @@ def test_step_extract_charts_com_sets_data( def _fake(_: object, *, mode: str) -> list[object]: """ Return the captured charts list. - + Parameters: mode (str): Ignored; accepted for compatibility with callers. - + Returns: list[object]: The charts list captured from the enclosing scope. """ @@ -753,7 +753,7 @@ class _Sheet: def __init__(self, name: str) -> None: """ Initialize the instance with a display name. - + Parameters: name (str): The name to assign to the instance. """ @@ -788,7 +788,7 @@ def test_step_extract_print_areas_com_skips_when_present( def _raise(_: ComBackend) -> object: """ Raise a RuntimeError indicating this code path must not be invoked. - + This function always raises RuntimeError("should not be called"). """ raise RuntimeError("should not be called") @@ -820,7 +820,7 @@ def test_step_extract_print_areas_com_sets_data( def _fake(_: ComBackend) -> dict[str, list[PrintArea]]: """ Return a stub mapping of sheet names to print areas containing a single 1x1 print area for "Sheet1". - + Returns: dict[str, list[PrintArea]]: Mapping where "Sheet1" maps to a list with one PrintArea covering row 1, column 0 to row 1, column 0. """ @@ -859,11 +859,11 @@ def _fake_com( ) -> object: """ Return a colors map object suitable for use as a COM backend response. - + Parameters: include_default_background (bool): If true, the returned colors map should include the default background color. ignore_colors (set[str] | None): Optional set of color identifiers to exclude from the returned map; `None` means no colors are excluded. - + Returns: object: A colors map representing workbook-level color mappings. """ @@ -880,7 +880,7 @@ def _raise( ) -> object: """ Placeholder backend sentinel that always raises a RuntimeError when invoked. - + Raises: RuntimeError: Always raised with message "should not be called". 
""" @@ -916,13 +916,13 @@ def test_run_com_pipeline_executes_steps(tmp_path: Path) -> None: def _step(_: ExtractionInputs, artifacts: ExtractionArtifacts, __: object) -> None: """ Test pipeline step that simulates shape extraction. - + Sets artifacts.shape_data to a mapping for "Sheet1" containing a single Shape and records invocation by appending "called" to the outer `calls` list. - + Parameters: - _ (ExtractionInputs): Unused extraction inputs placeholder. - artifacts (ExtractionArtifacts): Artifacts object to populate with shape data. - __ (object): Unused context placeholder. + _ (ExtractionInputs): Unused extraction inputs placeholder. + artifacts (ExtractionArtifacts): Artifacts object to populate with shape data. + __ (object): Unused context placeholder. """ calls.append("called") artifacts.shape_data = {"Sheet1": [Shape(id=1, text="", l=0, t=0)]} @@ -954,7 +954,7 @@ class _Sheet: def __init__(self, name: str) -> None: """ Initialize the instance with a display name. - + Parameters: name (str): The name to assign to the instance. """ @@ -964,7 +964,7 @@ class _Sheets: def __init__(self) -> None: """ Initialize the object with a single default sheet named "Sheet1". - + Creates the internal mapping `self._sheets` and populates it with one `_Sheet` instance keyed by "Sheet1". """ self._sheets = {"Sheet1": _Sheet("Sheet1")} @@ -972,13 +972,13 @@ def __init__(self) -> None: def __getitem__(self, name: str) -> _Sheet: """ Access a worksheet by its name. - + Parameters: name (str): The name of the sheet to retrieve. - + Returns: _Sheet: The sheet object associated with `name`. - + Raises: KeyError: If no sheet with the given name exists. """ @@ -990,7 +990,7 @@ class _Workbook: def _pre_step(_: ExtractionInputs, artifacts: ExtractionArtifacts) -> None: """ Populate artifacts with default minimal cell and merged-cell data for a single sheet. - + Parameters: _ (ExtractionInputs): Unused extraction inputs placeholder. 
artifacts (ExtractionArtifacts): Mutable extraction artifacts that will be updated with @@ -1003,10 +1003,10 @@ def _pre_step(_: ExtractionInputs, artifacts: ExtractionArtifacts) -> None: def _fake_plan(_: ExtractionInputs) -> PipelinePlan: """ Create a fixed PipelinePlan for tests that forces COM usage and provides a single pre-COM step. - + Parameters: _ (ExtractionInputs): Ignored input; present to match the PipelinePlan factory signature. - + Returns: PipelinePlan: A plan with `pre_com_steps` set to a list containing `_pre_step`, `com_steps` empty, and `use_com` set to `True`. """ @@ -1015,9 +1015,9 @@ def _fake_plan(_: ExtractionInputs) -> PipelinePlan: def _fake_detect_tables(_: object) -> list[str]: """ Provide a detector that always reports no table ranges. - + The input workbook-like object is ignored. - + Returns: list[str]: An empty list of table range identifiers. """ @@ -1026,13 +1026,14 @@ def _fake_detect_tables(_: object) -> list[str]: def _fake_workbook(_: Path) -> object: """ Provide a context manager that yields a lightweight fake workbook for tests. - + Parameters: _ (Path): Ignored file path parameter retained to match the real backend signature. - + Returns: object: A context manager whose `__enter__` returns a new `_Workbook` instance and whose `__exit__` does not suppress exceptions (returns `None`). 
""" + class _Context: def __enter__(self) -> _Workbook: return _Workbook() @@ -1073,4 +1074,4 @@ def __exit__( result = run_extraction_pipeline(inputs) assert result.state.com_attempted is True assert result.state.com_succeeded is True - assert "Sheet1" in result.workbook.sheets \ No newline at end of file + assert "Sheet1" in result.workbook.sheets diff --git a/tests/core/test_pipeline_fallbacks.py b/tests/core/test_pipeline_fallbacks.py index 322d9bc..9600ef9 100644 --- a/tests/core/test_pipeline_fallbacks.py +++ b/tests/core/test_pipeline_fallbacks.py @@ -53,7 +53,7 @@ def test_pipeline_fallback_com_unavailable( ) -> None: """ Verifies that the extraction pipeline falls back when COM access is unavailable. - + Creates a basic workbook, forces the COM-access entry point to raise, runs the extraction pipeline, and asserts that the pipeline records a fallback due to COM being unavailable (`FallbackReason.COM_UNAVAILABLE`), did not attempt COM (`com_attempted is False`), and that the resulting sheet "Sheet1" exists, contains rows, and has no shapes or charts. """ path = tmp_path / "book.xlsx" @@ -128,4 +128,4 @@ def _raise( sheet = result.workbook.sheets["Sheet1"] assert sheet.shapes == [] assert sheet.charts == [] - assert sheet.rows \ No newline at end of file + assert sheet.rows diff --git a/tests/engine/test_engine.py b/tests/engine/test_engine.py index 3725036..816fd51 100644 --- a/tests/engine/test_engine.py +++ b/tests/engine/test_engine.py @@ -40,14 +40,14 @@ def fake_extract( ) -> WorkbookData: """ Test helper that simulates workbook extraction for unit tests. - + Records the received `mode` and `include_print_areas` into the outer `called` mapping and returns a minimal WorkbookData whose `book_name` is the input path's filename and whose `sheets` is empty. - + Parameters: path (Path): Path to the workbook; its filename is used for the returned WorkbookData.book_name. mode (str): Extraction mode passed through and recorded. 
include_print_areas (bool): Whether print areas were requested; the value is recorded in `called`. - + Returns: WorkbookData: A WorkbookData instance with `book_name` set to path.name and an empty `sheets` mapping. """ @@ -293,4 +293,4 @@ def fake_images(file_path: Path, images_dir: Path, *, dpi: int) -> None: assert calls["pdf_path"].suffix == ".pdf" assert isinstance(calls["images_dir"], Path) assert calls["images_dir"].name.endswith("_images") - assert calls["dpi"] == 144 \ No newline at end of file + assert calls["dpi"] == 144 diff --git a/tests/models/test_models_export.py b/tests/models/test_models_export.py index 2ea52c2..3b922f7 100644 --- a/tests/models/test_models_export.py +++ b/tests/models/test_models_export.py @@ -24,7 +24,7 @@ def _sheet() -> SheetData: """ Create a sample SheetData containing one row, no shapes or charts, and a single table candidate. - + Returns: SheetData: A SheetData instance with one CellRow (r=1, c={"0": "A"}), empty shapes and charts lists, and table_candidates set to ["A1:B2"]. """ @@ -161,4 +161,4 @@ def test_sheet_json_includes_merged_cells_schema() -> None: ) data = json.loads(sheet.to_json()) assert data["merged_cells"]["schema"] == ["r1", "c1", "r2", "c2", "v"] - assert data["merged_cells"]["items"][0] == [1, 0, 1, 1, "merged"] \ No newline at end of file + assert data["merged_cells"]["items"][0] == [1, 0, 1, 1, "merged"] diff --git a/tests/render/test_render_init.py b/tests/render/test_render_init.py index 3285144..394cb2c 100644 --- a/tests/render/test_render_init.py +++ b/tests/render/test_render_init.py @@ -585,7 +585,7 @@ class _PageSetup: def PrintArea(self) -> str: """ Simulate accessing a worksheet's PrintArea and always raise an error to emulate a failure. - + Raises: RuntimeError: Always raised to simulate an error when retrieving the PrintArea. """ @@ -606,7 +606,7 @@ class _WsApi: def __init__(self, name: str) -> None: """ Initialize the FakeSheet with the given Excel sheet name. 
- + Parameters: name (str): The sheet's name to assign to the object's `Name` attribute. """ @@ -616,7 +616,7 @@ class _Worksheets: def __init__(self) -> None: """ Initialize the fake PDF document stub. - + Sets the `Count` attribute to 2 to emulate a document with two pages. """ self.Count = 2 @@ -624,10 +624,10 @@ def __init__(self) -> None: def Item(self, index: int) -> _WsApi: """ Return a worksheet API stub for the sheet at the given index. - + Parameters: index (int): One-based index of the worksheet within the workbook. - + Returns: _WsApi: A worksheet API stub corresponding to the sheet at `index`. """ @@ -651,7 +651,7 @@ def test_export_pdf_propagates_render_error( def _raise() -> xw.App: """ Always raises a RenderError to simulate failure when obtaining an Excel application. - + Raises: RenderError: Always raised with the message "boom". """ @@ -683,7 +683,7 @@ class _SheetApi: def _fake_iter(_: xw.Book) -> list[tuple[int, str, _SheetApi]]: """ Return a single-item list that mimics iterating workbook sheets for tests. - + Returns: A list with one tuple (index, sheet name, sheet API stub): (0, "Sheet1", _SheetApi()). """ @@ -692,10 +692,10 @@ def _fake_iter(_: xw.Book) -> list[tuple[int, str, _SheetApi]]: def _fake_extract(_: _SheetApi) -> list[str]: """ Provide two fake print-area ranges for testing. - + Parameters: _ (_SheetApi): Ignored sheet API placeholder. - + Returns: list[str]: Two print-area ranges: "A1:B2" and "C3:D4". """ @@ -727,7 +727,7 @@ class _BadPageSetup: def PrintArea(self) -> str: """ Represents the worksheet's PrintArea setting as an Excel range string. - + Returns: str: The PrintArea range (e.g., "A1:B2"). """ @@ -737,10 +737,10 @@ def PrintArea(self) -> str: def PrintArea(self, _value: object) -> None: """ Simulated setter for PrintArea that always fails. - + Parameters: _value (object): Ignored; the provided value is not used because the setter always raises. - + Raises: RuntimeError: Always raised with the message "bad". 
""" @@ -754,7 +754,7 @@ def ExportAsFixedFormat( ) -> None: """ Simulate exporting a workbook/sheet to a fixed-format file by writing a minimal fake PDF header to the given path. - + Parameters: _file_format (int): Ignored numeric format indicator. _output_path (str): Filesystem path where the fake export file will be written. @@ -802,9 +802,9 @@ def _fake_render( ) -> list[Path]: """ Simulates rendering a PDF sheet to image files for tests. - + On the first invocation this function returns an empty list to simulate a transient empty render result; on subsequent invocations it returns a single Path inside output_dir named "{sheet_index+1:02d}_{safe_name}.png". - + Parameters: _pdfium: Ignored in the fake implementation (kept for signature compatibility). _pdf_path: Ignored in the fake implementation (kept for signature compatibility). @@ -813,7 +813,7 @@ def _fake_render( safe_name (str): Sanitized sheet name used in the filename. _dpi: Ignored in the fake implementation (kept for signature compatibility). _use_subprocess: Ignored in the fake implementation (kept for signature compatibility). - + Returns: list[Path]: Empty list on the first call, otherwise a list containing one Path pointing to the fake PNG file. """ @@ -861,7 +861,7 @@ class _FlakyPageSetup: def __init__(self) -> None: """ Initialize a PageSetup-like test stub with a default print area and a setter call counter. - + The instance starts with `_print_area` set to "A1" and `_set_calls` set to 0 to track how many times the print area setter has been invoked. """ self._print_area: object = "A1" @@ -871,7 +871,7 @@ def __init__(self) -> None: def PrintArea(self) -> object: """ Retrieve the current PrintArea value from the PageSetup stub. - + Returns: print_area (object): The stored PrintArea value (typically a string) or whatever was set on the stub. 
""" @@ -881,12 +881,12 @@ def PrintArea(self) -> object: def PrintArea(self, value: object) -> None: """ Set the PrintArea value on this stub PageSetup instance. - + Parameters: - value (object): The print area value to assign. - + value (object): The print area value to assign. + Raises: - RuntimeError: If the setter is invoked more than once (simulates a restore failure). + RuntimeError: If the setter is invoked more than once (simulates a restore failure). """ if self._set_calls >= 1: raise RuntimeError("restore failed") @@ -903,7 +903,7 @@ def ExportAsFixedFormat( ) -> None: """ Simulate exporting to a fixed format; this stub always raises an export error. - + Raises: RuntimeError: with message "export failed" when invoked. """ @@ -920,4 +920,4 @@ def ExportAsFixedFormat( pdf_path, ignore_print_areas=False, print_area="A1:B2", - ) \ No newline at end of file + ) diff --git a/tests/utils.py b/tests/utils.py index 8ed8bc0..8a00f0b 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -20,14 +20,14 @@ def parametrize( ) -> Callable[[Callable[P, R]], Callable[P, R]]: """ Return a decorator that parametrizes a test callable with the given argument names and values. - + Parameters: argnames: One or more parameter names (single string or sequence of strings) to inject into the test callable. argvalues: An iterable of values or value-tuples to use for each generated test case. indirect: If True or a sequence of names, treat corresponding parameters as fixtures and resolve them indirectly. ids: Optional iterable of case identifiers or a callable that produces an identifier for each value. scope: Optional fixture scope to apply when parameters are used as fixtures ("session", "package", "module", "class", or "function"). - + Returns: decorator: A decorator that applies the specified parametrization to a callable while preserving its signature. 
""" @@ -40,4 +40,4 @@ def parametrize( ids=ids, scope=scope, ), - ) \ No newline at end of file + ) From d256bb98e868d0b0a4a92282cc884d8dca212261 Mon Sep 17 00:00:00 2001 From: harumiWeb Date: Fri, 23 Jan 2026 11:03:33 +0900 Subject: [PATCH 11/12] =?UTF-8?q?=E4=BF=AE=E6=AD=A3:=20extract=5Fsheet=5Ff?= =?UTF-8?q?ormulas=5Fmap=5Fcom=E3=81=AE=E6=88=BB=E3=82=8A=E5=80=A4?= =?UTF-8?q?=E3=81=AE=E5=9E=8B=E6=B3=A8=E9=87=88=E3=82=92=E6=9B=B4=E6=96=B0?= =?UTF-8?q?=E3=81=97=E3=80=81=E5=88=97=E3=82=A4=E3=83=B3=E3=83=87=E3=83=83?= =?UTF-8?q?=E3=82=AF=E3=82=B9=E3=81=8C0=E3=83=99=E3=83=BC=E3=82=B9?= =?UTF-8?q?=E3=81=A7=E3=81=82=E3=82=8B=E3=81=93=E3=81=A8=E3=82=92=E6=98=8E?= =?UTF-8?q?=E7=A2=BA=E3=81=AB=E3=81=97=E3=81=BE=E3=81=97=E3=81=9F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/exstruct/core/cells.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/exstruct/core/cells.py b/src/exstruct/core/cells.py index d6d9644..1ad915a 100644 --- a/src/exstruct/core/cells.py +++ b/src/exstruct/core/cells.py @@ -156,7 +156,7 @@ def extract_sheet_formulas_map_com(workbook: xw.Book) -> WorkbookFormulasMap: workbook: xlwings Book instance whose sheets will be scanned for formulas. Returns: - WorkbookFormulasMap: maps sheet names to SheetFormulasMap objects. Each SheetFormulasMap.formulas_map maps a normalized formula string (consistent representation, e.g., beginning with "=") to a list of (row, column) tuples representing cell locations using Excel 1-based indices. + WorkbookFormulasMap: maps sheet names to SheetFormulasMap objects. Each SheetFormulasMap.formulas_map maps a normalized formula string (consistent representation, e.g., beginning with "=") to a list of (row, column) tuples where row is 1-based and column is 0-based. 
""" sheets: dict[str, SheetFormulasMap] = {} for sheet in workbook.sheets: From 887b4334cc4e77bdedb838b99b8215bc0a35ffd6 Mon Sep 17 00:00:00 2001 From: harumiWeb Date: Fri, 23 Jan 2026 11:18:55 +0900 Subject: [PATCH 12/12] =?UTF-8?q?=E3=83=AA=E3=83=AA=E3=83=BC=E3=82=B9:=20v?= =?UTF-8?q?0.3.7=20=E3=81=A7=E6=95=B0=E5=BC=8F=E5=8F=96=E5=BE=97=E6=A9=9F?= =?UTF-8?q?=E8=83=BD=E3=82=92=E8=BF=BD=E5=8A=A0=E3=81=97=E3=80=81=E6=A7=8B?= =?UTF-8?q?=E9=80=A0=E5=8C=96=E5=87=BA=E5=8A=9B=E3=82=92=E6=8B=A1=E5=BC=B5?= =?UTF-8?q?=E3=80=82`formulas=5Fmap`=20=E3=81=AE=E6=8A=BD=E5=87=BA?= =?UTF-8?q?=E3=82=92=E8=BF=BD=E5=8A=A0=E3=81=97=E3=80=81=E3=83=91=E3=82=A4?= =?UTF-8?q?=E3=83=97=E3=83=A9=E3=82=A4=E3=83=B3=E3=80=81=E3=83=A2=E3=83=87?= =?UTF-8?q?=E3=83=AB=E3=80=81=E3=83=90=E3=83=83=E3=82=AF=E3=82=A8=E3=83=B3?= =?UTF-8?q?=E3=83=89=E3=81=8C=E3=82=A8=E3=83=B3=E3=83=89=E3=83=84=E3=83=BC?= =?UTF-8?q?=E3=82=A8=E3=83=B3=E3=83=89=E3=81=A7=20`formulas=5Fmap`=20?= =?UTF-8?q?=E3=82=92=E4=BC=9D=E6=92=AD=E3=80=82=E5=8D=B0=E5=88=B7=E3=82=A8?= =?UTF-8?q?=E3=83=AA=E3=82=A2=E3=81=AE=E3=82=A8=E3=82=AF=E3=82=B9=E3=83=9D?= =?UTF-8?q?=E3=83=BC=E3=83=88=E3=81=AE=E5=A0=85=E7=89=A2=E6=80=A7=E3=82=92?= =?UTF-8?q?=E5=90=91=E4=B8=8A=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.ja.md | 7 ++++--- README.md | 1 + docs/README.en.md | 1 + docs/README.ja.md | 1 + docs/release-notes/v0.3.7.md | 19 +++++++++++++++++++ mkdocs.yml | 1 + pyproject.toml | 2 +- schemas/sheet.json | 21 +++++++++++++++++++++ schemas/workbook.json | 21 +++++++++++++++++++++ src/exstruct/render/__init__.py | 4 ++-- uv.lock | 2 +- 11 files changed, 73 insertions(+), 7 deletions(-) create mode 100644 docs/release-notes/v0.3.7.md diff --git a/README.ja.md b/README.ja.md index 87f7c24..23fdeb6 100644 --- a/README.ja.md +++ b/README.ja.md @@ -2,7 +2,7 @@ [![PyPI version](https://badge.fury.io/py/exstruct.svg)](https://pypi.org/project/exstruct/) [![PyPI 
Downloads](https://static.pepy.tech/personalized-badge/exstruct?period=total&units=INTERNATIONAL_SYSTEM&left_color=BLACK&right_color=GREEN&left_text=downloads)](https://pepy.tech/projects/exstruct) ![Licence: BSD-3-Clause](https://img.shields.io/badge/license-BSD--3--Clause-blue?style=flat-square) [![pytest](https://github.com/harumiWeb/exstruct/actions/workflows/pytest.yml/badge.svg)](https://github.com/harumiWeb/exstruct/actions/workflows/pytest.yml) [![Codacy Badge](https://app.codacy.com/project/badge/Grade/e081cb4f634e4175b259eb7c34f54f60)](https://app.codacy.com/gh/harumiWeb/exstruct/dashboard?utm_source=gh&utm_medium=referral&utm_content=&utm_campaign=Badge_grade) [![codecov](https://codecov.io/gh/harumiWeb/exstruct/graph/badge.svg?token=2XI1O8TTA9)](https://codecov.io/gh/harumiWeb/exstruct) -![ExStruct Image](/assets/icon.webp) +![ExStruct Image](docs/assets/icon.webp) ExStruct は Excel ワークブックを読み取り、構造化データ(セル・テーブル候補・図形・チャート・SmartArt・印刷範囲ビュー)をデフォルトで JSON に出力します。必要に応じて YAML/TOON も選択でき、COM/Excel 環境ではリッチ抽出、非 COM 環境ではセル+テーブル候補+印刷範囲へのフォールバックで安全に動作します。LLM/RAG 向けに検出ヒューリスティックや出力モードを調整可能です。 @@ -10,6 +10,7 @@ ExStruct は Excel ワークブックを読み取り、構造化データ(セ - **Excel → 構造化 JSON**: セル、図形、チャート、SmartArt、テーブル候補、セル結合範囲、印刷範囲/自動改ページ範囲(PrintArea/PrintAreaView)をシート単位・範囲単位で出力。 - **出力モード**: `light`(セル+テーブル候補のみ)、`standard`(テキスト付き図形+矢印、チャート、SmartArt、セル結合範囲)、`verbose`(全図形を幅高さ付きで出力、セルのハイパーリンクも出力)。 +- **数式取得**: `formulas_map`(数式文字列 → セル座標)を openpyxl/COM で取得。`verbose` 既定、`include_formulas_map` で制御。 - **フォーマット**: JSON(デフォルトはコンパクト、`--pretty` で整形)、YAML、TOON(任意依存)。 - **テーブル検出のチューニング**: API でヒューリスティックを動的に変更可能。 - **ハイパーリンク抽出**: `verbose` モード(または `include_cell_links=True` 指定)でセルのリンクを `links` に出力。 @@ -160,7 +161,7 @@ exstruct input.xlsx --pdf --image --dpi 144 - 図形のみで作成したフローチャート (下画像が実際のサンプル Excel シート) -![Sample Excel](/assets/demo_sheet.png) +![Sample Excel](docs/assets/demo_sheet.png) サンプル Excel: `sample/sample.xlsx` ### 1. 
Input: Excel Sheet Overview @@ -339,7 +340,7 @@ flowchart TD ### Excel データ -![一般的な申請書Excel](/assets/demo_form.ja.png) +![一般的な申請書Excel](docs/assets/demo_form.ja.png) ### ExStruct JSON diff --git a/README.md b/README.md index 7588c0e..6ebff3a 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,7 @@ ExStruct reads Excel workbooks and outputs structured data (cells, table candida - **Excel → Structured JSON**: cells, shapes, charts, smartart, table candidates, print areas/views, and auto page-break areas per sheet. - **Output modes**: `light` (cells + table candidates + print areas; no COM, shapes/charts empty), `standard` (texted shapes + arrows, charts, smartart, merged cell ranges, print areas), `verbose` (all shapes with width/height, charts with size, merged cell ranges, print areas). Verbose also emits cell hyperlinks and `colors_map`. Size output is flag-controlled. +- **Formula map extraction**: emits `formulas_map` (formula string -> cell coordinates) via openpyxl/COM; enabled by default in `verbose` or via `include_formulas_map`. - **Auto page-break export (COM only)**: capture Excel-computed auto page breaks and write per-area JSON/YAML/TOON when requested (CLI option appears only when COM is available). - **Formats**: JSON (compact by default, `--pretty` available), YAML, TOON (optional dependencies). - **Table detection tuning**: adjust heuristics at runtime via API. diff --git a/docs/README.en.md b/docs/README.en.md index 97e63de..39df415 100644 --- a/docs/README.en.md +++ b/docs/README.en.md @@ -12,6 +12,7 @@ ExStruct reads Excel workbooks and outputs structured data (cells, table candida - **Excel → Structured JSON**: cells, shapes, charts, smartart, table candidates, print areas/views, and auto page-break areas per sheet. 
- **Output modes**: `light` (cells + table candidates + print areas; no COM, shapes/charts empty), `standard` (texted shapes + arrows, charts, smartart, merged cell ranges, print areas), `verbose` (all shapes with width/height, charts with size, merged cell ranges, print areas). Verbose also emits cell hyperlinks, `colors_map`, and `formulas_map`. Size output is flag-controlled. +- **Formula map extraction**: emits `formulas_map` (formula string -> cell coordinates) via openpyxl/COM; enabled by default in `verbose` or via `include_formulas_map`. - **Auto page-break export (COM only)**: capture Excel-computed auto page breaks and write per-area JSON/YAML/TOON when requested (CLI option appears only when COM is available). - **Formats**: JSON (compact by default, `--pretty` available), YAML, TOON (optional dependencies). - **Table detection tuning**: adjust heuristics at runtime via API. diff --git a/docs/README.ja.md b/docs/README.ja.md index e051e04..a2732af 100644 --- a/docs/README.ja.md +++ b/docs/README.ja.md @@ -10,6 +10,7 @@ ExStruct は Excel ワークブックを読み取り、構造化データ(セ - **Excel → 構造化 JSON**: セル、図形、チャート、SmartArt、テーブル候補、セル結合範囲、印刷範囲/自動改ページ範囲(PrintArea/PrintAreaView)をシート単位・範囲単位で出力。 - **出力モード**: `light`(セル+テーブル候補のみ)、`standard`(テキスト付き図形+矢印、チャート、SmartArt、セル結合範囲)、`verbose`(全図形を幅高さ付きで出力、セルのハイパーリンク/`colors_map`/`formulas_map`も出力)。 +- **数式取得**: `formulas_map`(数式文字列 → セル座標)を openpyxl/COM で取得。`verbose` 既定、`include_formulas_map` で制御。 - **フォーマット**: JSON(デフォルトはコンパクト、`--pretty` で整形)、YAML、TOON(任意依存)。 - **テーブル検出のチューニング**: API でヒューリスティックを動的に変更可能。 - **ハイパーリンク抽出**: `verbose` モード(または `include_cell_links=True` 指定)でセルのリンクを `links` に出力。 diff --git a/docs/release-notes/v0.3.7.md b/docs/release-notes/v0.3.7.md new file mode 100644 index 0000000..8672766 --- /dev/null +++ b/docs/release-notes/v0.3.7.md @@ -0,0 +1,19 @@ +# v0.3.7 Release Notes + +This release adds formula extraction to the structured output, expanding the +pipeline, models, and backends while keeping the existing modes and 
fallbacks. + +## Highlights + +- Added `formulas_map` extraction (formula string -> cell coordinates) via + openpyxl for .xlsx/.xlsm and COM for .xls, with `include_formulas_map` option + and verbose default behavior. +- Pipeline, engine, and models now propagate `formulas_map` end-to-end, with + updated samples and documentation. +- Rendering robustness improved for print-area exports (safer page numbering + and error handling during PrintArea restoration). + +## Notes + +- `formulas_map` is emitted in `verbose` by default; use `include_formulas_map` + to enable/disable explicitly. diff --git a/mkdocs.yml b/mkdocs.yml index 33a0b0f..9eb5bd8 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -27,6 +27,7 @@ nav: - CLI Guide: cli.md - Concept / Why ExStruct?: concept.md - Release Notes: + - v0.3.7: release-notes/v0.3.7.md - v0.3.6: release-notes/v0.3.6.md - v0.3.5: release-notes/v0.3.5.md - v0.3.2: release-notes/v0.3.2.md diff --git a/pyproject.toml b/pyproject.toml index 4093734..1e6cca7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "exstruct" -version = "0.3.6" +version = "0.3.7" description = "Excel to structured JSON (tables, shapes, charts) for LLM/RAG pipelines" readme = "README.md" license = { file = "LICENSE" } diff --git a/schemas/sheet.json b/schemas/sheet.json index 0e39edd..4a1eb5d 100644 --- a/schemas/sheet.json +++ b/schemas/sheet.json @@ -718,6 +718,27 @@ "title": "Colors Map", "type": "object" }, + "formulas_map": { + "additionalProperties": { + "items": { + "maxItems": 2, + "minItems": 2, + "prefixItems": [ + { + "type": "integer" + }, + { + "type": "integer" + } + ], + "type": "array" + }, + "type": "array" + }, + "description": "Mapping of formula strings to lists of (row, column) tuples where row is 1-based and column is 0-based.", + "title": "Formulas Map", + "type": "object" + }, "merged_cells": { "anyOf": [ { diff --git a/schemas/workbook.json b/schemas/workbook.json index 576ef67..eb99d41 100644 --- 
a/schemas/workbook.json +++ b/schemas/workbook.json @@ -594,6 +594,27 @@ "title": "Colors Map", "type": "object" }, + "formulas_map": { + "additionalProperties": { + "items": { + "maxItems": 2, + "minItems": 2, + "prefixItems": [ + { + "type": "integer" + }, + { + "type": "integer" + } + ], + "type": "array" + }, + "type": "array" + }, + "description": "Mapping of formula strings to lists of (row, column) tuples where row is 1-based and column is 0-based.", + "title": "Formulas Map", + "type": "object" + }, "merged_cells": { "anyOf": [ { diff --git a/src/exstruct/render/__init__.py b/src/exstruct/render/__init__.py index f70df28..e805b7e 100644 --- a/src/exstruct/render/__init__.py +++ b/src/exstruct/render/__init__.py @@ -354,9 +354,9 @@ def _export_sheet_pdf( 0, str(pdf_path), IgnorePrintAreas=ignore_print_areas ) except TypeError: - if ignore_print_areas and page_setup is None: + if ignore_print_areas: try: - page_setup = getattr(sheet_api, "PageSetup", None) + page_setup = page_setup or getattr(sheet_api, "PageSetup", None) if page_setup is not None: page_setup.PrintArea = "" except Exception as exc: diff --git a/uv.lock b/uv.lock index 0d704c1..491e721 100644 --- a/uv.lock +++ b/uv.lock @@ -298,7 +298,7 @@ wheels = [ [[package]] name = "exstruct" -version = "0.3.6" +version = "0.3.7" source = { editable = "." } dependencies = [ { name = "numpy" },