Skip to content

Commit a31df57

Browse files
committed
Add TextWithBackref
1 parent e202861 commit a31df57

File tree

1 file changed

+157
-10
lines changed

1 file changed

+157
-10
lines changed

jsondoc/extract_rich_text.py

Lines changed: 157 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from typing import Dict, List, Optional, Union
1+
from typing import Dict, List, Union
22

33
from pydantic import BaseModel
44

@@ -19,22 +19,165 @@ class TextWithBackref(BaseModel):
1919
backrefs: list[BackRef]
2020

2121

22-
def extract_rich_text_from_page(
22+
def extract_text_with_backref_from_page(
    page: Page, include_annotations: bool = False
) -> TextWithBackref:
    """
    Extract all rich text content from a JSON-DOC page as a single string
    with backrefs tracking the block origins.

    The page title (when non-empty) comes first and is followed by a blank
    line; the text of each block follows, separated by single spaces. Every
    backref's [begin_idx, end_idx) range indexes directly into the returned
    text, so ``text[begin_idx:end_idx]`` is exactly the originating segment.

    Args:
        page: A JSON-DOC Page object
        include_annotations: Forwarded to the block walker (not used in backref tracking)

    Returns:
        TextWithBackref: Object containing concatenated text and backrefs
    """
    pieces: list[str] = []
    backrefs = []
    cursor = 0  # length of everything appended to ``pieces`` so far
    separator = ""  # emitted *before* the next segment, never after the last

    # Extract title
    title_text = ""
    if page.properties.title and page.properties.title.title:
        title_text = "".join(
            rich_text.plain_text for rich_text in page.properties.title.title
        )

    if title_text:
        pieces.append(title_text)
        cursor = len(title_text)
        # The title is attributed to the page itself via the page's ID
        backrefs.append(BackRef(block_id=page.id, begin_idx=0, end_idx=cursor))
        # A blank line separates the title from the first block
        separator = "\n\n"

    # Process all blocks recursively and collect their text with backrefs
    blocks_with_text = _extract_blocks_with_text(page.children, include_annotations)

    for block_id, block_text in blocks_with_text:
        if not block_text:
            continue

        # Separators go between segments only, so the final text needs no
        # strip() -- stripping after the offsets were recorded would shift or
        # truncate the text and leave the backrefs pointing at wrong positions.
        pieces.append(separator)
        cursor += len(separator)

        begin_idx = cursor
        pieces.append(block_text)
        cursor += len(block_text)
        backrefs.append(
            BackRef(block_id=block_id, begin_idx=begin_idx, end_idx=cursor)
        )

        # Blocks are separated from each other by a single space
        separator = " "

    return TextWithBackref(text="".join(pieces), backrefs=backrefs)
72+
73+
74+
def _extract_blocks_with_text(
    blocks: List[BlockBase], include_annotations: bool = False
) -> List[tuple[str, str]]:
    """
    Walk a list of blocks depth-first and collect (block_id, text) pairs.

    A block contributes a pair only when its own text is non-empty; the pairs
    of its children follow immediately after it.

    Args:
        blocks: List of blocks to process
        include_annotations: Passed down to the recursive calls; otherwise unused here

    Returns:
        List of (block_id, text) tuples
    """
    collected = []

    for current in blocks:
        # Text owned by this block alone (children are handled below)
        own_text = _extract_text_from_single_block(current)
        if own_text:
            collected.append((current.id, own_text))

        # Descend into nested blocks, if the block has any
        nested = getattr(current, "children", None)
        if nested:
            collected += _extract_blocks_with_text(nested, include_annotations)

    return collected
103+
104+
105+
def _extract_text_from_single_block(block: BlockBase) -> str:
    """
    Extract text from a single block without processing its children.

    Collects, in order: the block's rich text (when the block supports it),
    the caption of image/code blocks, and the cell contents of table rows.
    The collected fragments are joined with single spaces.

    Args:
        block: The block to extract text from

    Returns:
        The text content of the block (empty string if nothing was found)
    """
    fragments = []

    # Extract rich text if the block supports it
    if block_supports_rich_text(block):
        try:
            for rich_text in get_rich_text_from_block(block):
                fragments.append(rich_text.plain_text)
        except ValueError:
            # Deliberately best-effort: a block whose rich text cannot be
            # read contributes nothing here instead of aborting extraction.
            pass

    # Extract captions from blocks that support them. ``getattr(...) or ()``
    # also covers an attribute that exists but holds None, which a bare
    # hasattr() check would let through to the loop and crash on.
    if block.type == "image":
        captions = getattr(block.image, "caption", None) or ()
    elif block.type == "code":
        captions = getattr(block.code, "caption", None) or ()
    else:
        captions = ()
    fragments.extend(caption_text.plain_text for caption_text in captions)

    # Handle special blocks like tables: each cell is expected to be a list
    # of rich text items; anything else is skipped.
    if block.type == "table_row":
        for cell in getattr(block.table_row, "cells", None) or ():
            if isinstance(cell, list):
                fragments.extend(
                    item.plain_text for item in cell if hasattr(item, "plain_text")
                )

    return " ".join(fragments)
143+
144+
145+
def _extract_text_from_block(
    block: BlockBase, include_annotations: bool = False
) -> str:
    """
    Flatten a block and all of its descendants into one string.

    The block's own text comes first, followed by the text of each child
    (recursively), with empty pieces dropped and the rest joined by spaces.

    Args:
        block: The block to extract text from
        include_annotations: Passed down to the recursive calls; otherwise unused here

    Returns:
        A string with all text from the block
    """
    # Text of the block itself, evaluated before any child is visited
    own_text = _extract_text_from_single_block(block)

    # Lazily produce the flattened text of every child block, if any
    nested = getattr(block, "children", None) or ()
    child_texts = (
        _extract_text_from_block(child, include_annotations) for child in nested
    )

    return " ".join(piece for piece in (own_text, *child_texts) if piece)
169+
170+
171+
def extract_rich_text_from_page(page: Page, include_annotations: bool = False) -> Dict:
25172
"""
26173
Extract all rich text content from a JSON-DOC page.
27174
28175
Args:
29176
page: A JSON-DOC Page object
30-
include_annotations: If True, includes formatting info (bold, italic, etc.) in the output
177+
include_annotations: If True, includes formatting info in the output
31178
32179
Returns:
33-
A dictionary containing:
34-
- 'title': The page title text
35-
- 'content': A list of text content from all blocks, each item is either:
36-
- A string (if include_annotations=False)
37-
- A dict with 'text' and 'annotations' (if include_annotations=True)
180+
Dictionary containing title and content lists
38181
"""
39182
result = {"title": "", "content": []}
40183

@@ -53,7 +196,11 @@ def extract_rich_text_from_page(
53196
return result
54197

55198

56-
def _process_rich_text_items(rich_text_list, include_annotations, result):
199+
def _process_rich_text_items(
200+
rich_text_list: list[RichTextBase],
201+
include_annotations: bool,
202+
result: list,
203+
) -> None:
57204
"""
58205
Helper function to process a list of rich text items and append them to the result.
59206

0 commit comments

Comments
 (0)