1- from typing import Dict , List , Optional , Union
1+ from typing import Dict , List , Union
22
33from pydantic import BaseModel
44
@@ -19,22 +19,165 @@ class TextWithBackref(BaseModel):
1919 backrefs : list [BackRef ]
2020
2121
def extract_text_with_backref_from_page(
    page: Page, include_annotations: bool = False
) -> TextWithBackref:
    """
    Extract all rich text content from a JSON-DOC page as a single string
    with backrefs tracking the block origins.

    The page title (if non-empty) comes first and is attributed to the page's
    own ID; it is followed by the text of every block returned by
    ``_extract_blocks_with_text``. Separators are inserted only *between*
    segments, so every backref's ``begin_idx``/``end_idx`` indexes directly
    into the returned text.

    Args:
        page: A JSON-DOC Page object
        include_annotations: Forwarded to the block walker (not used in
            backref tracking)

    Returns:
        TextWithBackref: Object containing concatenated text and backrefs
    """
    # Each entry: (block_id, text, separator to emit if another segment follows).
    segments: list[tuple[str, str, str]] = []

    # Extract title
    title_property = page.properties.title
    if title_property and title_property.title:
        title_text = "".join(
            rich_text.plain_text for rich_text in title_property.title
        )
        if title_text:
            # The title is attributed to the page itself and is set apart from
            # the body by a blank line.
            segments.append((page.id, title_text, "\n\n"))

    # Process all blocks recursively and collect their text
    for block_id, block_text in _extract_blocks_with_text(
        page.children, include_annotations
    ):
        if block_text:
            # Blocks are separated from each other by a single space.
            segments.append((block_id, block_text, " "))

    parts = []
    backrefs = []
    offset = 0
    pending_separator = ""
    for block_id, segment_text, separator in segments:
        # Emit the previous segment's separator only now that we know another
        # segment follows. This avoids a trailing separator without calling
        # strip() on the final text, which would shift the recorded offsets
        # whenever the text starts or ends with whitespace.
        parts.append(pending_separator)
        offset += len(pending_separator)

        end_offset = offset + len(segment_text)
        backrefs.append(
            BackRef(block_id=block_id, begin_idx=offset, end_idx=end_offset)
        )
        parts.append(segment_text)
        offset = end_offset
        pending_separator = separator

    return TextWithBackref(text="".join(parts), backrefs=backrefs)
72+
73+
def _extract_blocks_with_text(
    blocks: List[BlockBase], include_annotations: bool = False
) -> List[tuple[str, str]]:
    """
    Walk a block tree depth-first and collect the text of every block.

    A block contributes an entry only when its own text is non-empty; its
    descendants are visited either way and their entries follow the parent's.

    Args:
        blocks: List of blocks to process
        include_annotations: Passed through to recursive calls (not used in
            this implementation)

    Returns:
        List of (block_id, text) tuples in document order
    """
    collected = []

    for current in blocks:
        own_text = _extract_text_from_single_block(current)
        if own_text:
            collected.append((current.id, own_text))

        # Not every block type carries children; treat a missing or empty
        # attribute the same way.
        nested = getattr(current, "children", None)
        if nested:
            collected.extend(
                _extract_blocks_with_text(nested, include_annotations)
            )

    return collected
103+
104+
def _extract_text_from_single_block(block: BlockBase) -> str:
    """
    Extract text from a single block without processing its children.

    Text is gathered, in this order, from the block's rich text (when the
    block supports it), from image/code captions, and from table-row cells.
    The pieces are joined with single spaces.

    Args:
        block: The block to extract text from

    Returns:
        The text content of the block (empty string if there is none)
    """
    fragments = []

    # Extract rich text if the block supports it
    if block_supports_rich_text(block):
        try:
            rich_text_list = get_rich_text_from_block(block)
        except ValueError:
            # Best effort: a ValueError here is treated as "no rich text
            # available" so the rest of the block can still be processed.
            rich_text_list = []
        fragments.extend(rich_text.plain_text for rich_text in rich_text_list)

    # Extract captions from blocks that support them.
    # hasattr() is true even when the attribute is present but None, so fall
    # back to an empty list instead of iterating None.
    if block.type == "image":
        captions = getattr(block.image, "caption", None) or []
    elif block.type == "code":
        captions = getattr(block.code, "caption", None) or []
    else:
        captions = []
    fragments.extend(caption_text.plain_text for caption_text in captions)

    # Handle special blocks like tables: each cell is expected to be a list
    # of rich-text items; anything else is skipped.
    if block.type == "table_row":
        for cell in getattr(block.table_row, "cells", None) or []:
            if isinstance(cell, list):
                fragments.extend(
                    item.plain_text
                    for item in cell
                    if hasattr(item, "plain_text")
                )

    return " ".join(fragments)
143+
144+
def _extract_text_from_block(
    block: BlockBase, include_annotations: bool = False
) -> str:
    """
    Flatten a block and all of its descendants into one string.

    The block's own text comes first, followed by the text of each child
    subtree in order; empty pieces are dropped and the rest are joined with
    single spaces.

    Args:
        block: The block to extract text from
        include_annotations: Passed through to recursive calls (not used in
            this implementation)

    Returns:
        A string with all text from the block
    """
    # Own text first, then each child subtree in order.
    pieces = [_extract_text_from_single_block(block)]

    # A block without a children attribute, or with an empty one, simply
    # contributes no further pieces.
    for child in getattr(block, "children", None) or ():
        pieces.append(_extract_text_from_block(child, include_annotations))

    # Empty pieces are filtered here, once, for both own and child text.
    return " ".join(piece for piece in pieces if piece)
169+
170+
171+ def extract_rich_text_from_page (page : Page , include_annotations : bool = False ) -> Dict :
25172 """
26173 Extract all rich text content from a JSON-DOC page.
27174
28175 Args:
29176 page: A JSON-DOC Page object
30- include_annotations: If True, includes formatting info (bold, italic, etc.) in the output
177+ include_annotations: If True, includes formatting info in the output
31178
32179 Returns:
33- A dictionary containing:
34- - 'title': The page title text
35- - 'content': A list of text content from all blocks, each item is either:
36- - A string (if include_annotations=False)
37- - A dict with 'text' and 'annotations' (if include_annotations=True)
180+ Dictionary containing title and content lists
38181 """
39182 result = {"title" : "" , "content" : []}
40183
@@ -53,7 +196,11 @@ def extract_rich_text_from_page(
53196 return result
54197
55198
56- def _process_rich_text_items (rich_text_list , include_annotations , result ):
199+ def _process_rich_text_items (
200+ rich_text_list : list [RichTextBase ],
201+ include_annotations : bool ,
202+ result : list ,
203+ ) -> None :
57204 """
58205 Helper function to process a list of rich text items and append them to the result.
59206
0 commit comments