def parse_source_coordinates(source: str) -> tuple[int, float, float, float, float]:
    """
    Parse a Content Understanding ``source`` coordinate string.

    The expected format is ``D(pageNumber,x1,y1,x2,y2,x3,y3,x4,y4)`` where the
    four points describe a quadrilateral (upper-left, upper-right, lower-right,
    lower-left).  A four-value form ``D(page,left,top,width,height)`` is also
    accepted.

    Args:
        source: The source string from the Content Understanding JSON.

    Returns:
        Tuple of (page_number, left_x, top_y, right_x, bottom_y).

    Raises:
        ValueError: If the string does not match the ``D(...)`` pattern or the
            coordinate count is neither 4 nor 8.
    """
    parsed = re.match(r'D\((\d+),([^)]+)\)', source)
    if parsed is None:
        raise ValueError(f"Invalid source format: {source}")

    page = int(parsed.group(1))
    values = [float(token) for token in parsed.group(2).split(',')]

    if len(values) == 8:
        # Quadrilateral: take the extreme of each relevant pair of corner
        # coordinates to reduce it to an axis-aligned bounding box.
        x1, y1, x2, y2, x3, y3, x4, y4 = values
        return page, min(x1, x4), min(y1, y2), max(x2, x3), max(y3, y4)

    if len(values) == 4:
        # Axis-aligned box given as left, top, width, height.
        left, top, width, height = values
        return page, left, top, left + width, top + height

    raise ValueError(f"Unexpected coordinate count in source: {source}")
def group_lines_by_vertical_position(elements: list[LineElement],
                                     y_tolerance: float = 0.15) -> list[list[LineElement]]:
    """
    Cluster elements that sit on the same visual line of the page.

    Elements are sorted top-to-bottom and greedily bucketed: an element joins
    the current bucket when its Y coordinate is within ``y_tolerance`` of the
    bucket's first (anchor) element; otherwise it starts a new bucket.

    Args:
        elements: Positioned line elements to cluster.
        y_tolerance: Maximum Y distance from the bucket anchor for an element
            to count as the same line (in inches for PDFs).

    Returns:
        Buckets of elements, ordered top-to-bottom; each bucket holds the
        elements of one visual line.
    """
    if not elements:
        return []

    # Top-to-bottom order so buckets can be built in a single pass.
    ordered = sorted(elements, key=lambda item: item.y_position)

    buckets: list[list[LineElement]] = [[ordered[0]]]
    anchor_y = ordered[0].y_position

    for item in ordered[1:]:
        if abs(item.y_position - anchor_y) <= y_tolerance:
            # Close enough to the anchor: same visual line.
            buckets[-1].append(item)
        else:
            # Too far below the anchor: start a new visual line.
            buckets.append([item])
            anchor_y = item.y_position

    return buckets
def reflow_document(json_data: dict,
                    target_page: Optional[int] = None,
                    separator: str = " | ") -> str:
    """
    Reflow an entire document or a specific page with line numbers inline.

    Args:
        json_data: The full JSON response from Content Understanding.
        target_page: If specified, only process this page number (1-indexed).
        separator: String placed between a line number and its content.

    Returns:
        Reflowed markdown string.

    Raises:
        ValueError: If the JSON has no contents or the document has no pages.
    """
    contents = json_data.get('result', {}).get('contents', [])

    if not contents:
        raise ValueError("No contents found in JSON data")

    # Only the first content entry (the document itself) is processed.
    content = contents[0]

    if content.get('kind') != 'document':
        print(f"Warning: Content kind is '{content.get('kind')}', expected 'document'")

    pages = content.get('pages', [])

    if not pages:
        raise ValueError("No pages found in document content")

    output_parts = []

    for page in pages:
        page_number = page.get('pageNumber', 0)

        if target_page is not None and page_number != target_page:
            continue

        page_output = reflow_page_with_line_numbers(page, separator)

        if page_output:
            if target_page is None:
                # FIX: the original appended f"\n" — an f-string with no
                # placeholder that never used page_number (apparently a page
                # marker whose interpolation was lost). Emit a real, labeled
                # marker so multi-page output is navigable.
                output_parts.append(f"\n<!-- PageNumber: {page_number} -->")
            output_parts.append(page_output)
            output_parts.append("")  # Blank line between pages.

    return '\n'.join(output_parts)
reflow_markdown_with_line_numbers.py document.json --output reflowed.md --separator " | " +""" + ) + + parser.add_argument('input_json', type=str, + help='Path to the Content Understanding JSON output file') + parser.add_argument('--output', '-o', type=str, default=None, + help='Output file path (default: print to stdout)') + parser.add_argument('--page', '-p', type=int, default=None, + help='Process only this page number (1-indexed)') + parser.add_argument('--separator', '-s', type=str, default=' | ', + help='Separator between line number and content (default: " | ")') + + args = parser.parse_args() + + # Read input JSON + input_path = Path(args.input_json) + if not input_path.exists(): + print(f"Error: Input file not found: {input_path}") + return 1 + + with open(input_path, 'r', encoding='utf-8') as f: + json_data = json.load(f) + + # Process document + try: + result = reflow_document(json_data, args.page, args.separator) + except ValueError as e: + print(f"Error: {e}") + return 1 + + # Output result + if args.output: + output_path = Path(args.output) + output_path.parent.mkdir(parents=True, exist_ok=True) + with open(output_path, 'w', encoding='utf-8') as f: + f.write(result) + print(f"Output written to: {output_path}") + else: + print(result) + + return 0 + + +if __name__ == '__main__': + exit(main()) diff --git a/data/legal_examples/Trascript Example.pdf b/data/legal_examples/Trascript Example.pdf new file mode 100644 index 00000000..6036de6b Binary files /dev/null and b/data/legal_examples/Trascript Example.pdf differ diff --git a/notebooks/legal_transcript_reflow.ipynb b/notebooks/legal_transcript_reflow.ipynb new file mode 100644 index 00000000..685dab24 --- /dev/null +++ b/notebooks/legal_transcript_reflow.ipynb @@ -0,0 +1,1070 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "84ae1183", + "metadata": {}, + "source": [ + "# Legal Transcript Line Number Reflow\n", + "\n", + "This notebook demonstrates how to process legal documents (depositions, 
court transcripts, trial records) with Azure Content Understanding and reflow the output to include inline line numbers.\n", + "\n", + "## The Challenge\n", + "\n", + "Legal transcripts have a standardized format with **line numbers in the left margin** (typically 1-25 per page). These line numbers are critical for:\n", + "- Citing specific testimony in legal briefs\n", + "- Cross-referencing during depositions and trials\n", + "- Creating accurate legal summaries\n", + "\n", + "By default, Content Understanding's markdown output groups these margin line numbers separately from the main text content. This notebook shows how to **reflow the output** to include line numbers inline with each text line.\n", + "\n", + "## Workflow\n", + "1. **Load PDF** - Read the local legal transcript file\n", + "2. **Content Extraction** - Use Azure Content Understanding to extract text with position data\n", + "3. **Reflow** - Match line numbers with text using bounding box coordinates\n", + "4. **Output** - Generate markdown with inline line numbers (e.g., `1 | witness testimony...`)" + ] + }, + { + "cell_type": "markdown", + "id": "9fae8a24", + "metadata": {}, + "source": [ + "## Prerequisites\n", + "1. Ensure your Azure AI service is configured by following the [configuration steps](../README.md#configure-azure-ai-service-resource).\n", + "2. Install the required packages to run this sample." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1756b078", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -r ../requirements.txt" + ] + }, + { + "cell_type": "markdown", + "id": "6c13e310", + "metadata": {}, + "source": [ + "## Create Azure AI Content Understanding Client" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "6480fbb8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Client created successfully\n", + " Endpoint: https://mmi-usw3-eft-foundry.services.ai.azure.com/\n", + " Credential: Subscription Key\n", + " API Version: 2025-11-01\n" + ] + } + ], + "source": [ + "from datetime import datetime\n", + "import logging\n", + "import os\n", + "import sys\n", + "from typing import Any, Optional\n", + "from dotenv import find_dotenv, load_dotenv\n", + "\n", + "# Add the parent directory to the Python path to import the helper modules\n", + "sys.path.append(os.path.join(os.path.dirname(os.getcwd()), 'python'))\n", + "from content_understanding_client import AzureContentUnderstandingClient\n", + "from extension.sample_helper import save_json_to_file \n", + "from azure.identity import DefaultAzureCredential\n", + "\n", + "load_dotenv(find_dotenv())\n", + "logging.basicConfig(level=logging.INFO)\n", + "\n", + "# For authentication, you can use either token-based auth or subscription key\n", + "AZURE_AI_ENDPOINT = os.getenv(\"AZURE_AI_ENDPOINT\")\n", + "AZURE_AI_API_KEY = os.getenv(\"AZURE_AI_API_KEY\")\n", + "API_VERSION = \"2025-11-01\"\n", + "\n", + "# Create token provider for Azure AD authentication\n", + "def token_provider():\n", + " credential = DefaultAzureCredential()\n", + " token = credential.get_token(\"https://cognitiveservices.azure.com/.default\")\n", + " return token.token\n", + "\n", + "# Create the Content Understanding client\n", + "try:\n", + " client = AzureContentUnderstandingClient(\n", + " endpoint=AZURE_AI_ENDPOINT,\n", + " 
api_version=API_VERSION,\n", + " subscription_key=AZURE_AI_API_KEY,\n", + " token_provider=token_provider if not AZURE_AI_API_KEY else None,\n", + " x_ms_useragent=\"azure-ai-content-understanding-python-sample-legal-reflow\"\n", + " )\n", + " credential_type = \"Subscription Key\" if AZURE_AI_API_KEY else \"Azure AD Token\"\n", + " print(f\"✅ Client created successfully\")\n", + " print(f\" Endpoint: {AZURE_AI_ENDPOINT}\")\n", + " print(f\" Credential: {credential_type}\")\n", + " print(f\" API Version: {API_VERSION}\")\n", + "except Exception as e:\n", + " print(f\"❌ Failed to create client: {e}\")\n", + " raise" + ] + }, + { + "cell_type": "markdown", + "id": "4bb5f26a", + "metadata": {}, + "source": [ + "## Configure Model Deployments\n", + "\n", + "> **💡 Note:** This step is only required **once per Azure Content Understanding resource**." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "f4941027", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "📋 Configuring default model deployments...\n", + "✅ Default model deployments configured successfully\n" + ] + } + ], + "source": [ + "# Get model deployment names from environment variables\n", + "GPT_4_1_DEPLOYMENT = os.getenv(\"GPT_4_1_DEPLOYMENT\")\n", + "GPT_4_1_MINI_DEPLOYMENT = os.getenv(\"GPT_4_1_MINI_DEPLOYMENT\")\n", + "TEXT_EMBEDDING_3_LARGE_DEPLOYMENT = os.getenv(\"TEXT_EMBEDDING_3_LARGE_DEPLOYMENT\")\n", + "\n", + "# Check if required deployments are configured\n", + "missing_deployments = []\n", + "if not GPT_4_1_DEPLOYMENT:\n", + " missing_deployments.append(\"GPT_4_1_DEPLOYMENT\")\n", + "if not GPT_4_1_MINI_DEPLOYMENT:\n", + " missing_deployments.append(\"GPT_4_1_MINI_DEPLOYMENT\")\n", + "if not TEXT_EMBEDDING_3_LARGE_DEPLOYMENT:\n", + " missing_deployments.append(\"TEXT_EMBEDDING_3_LARGE_DEPLOYMENT\")\n", + "\n", + "if missing_deployments:\n", + " print(f\"⚠️ Warning: Missing model deployment configuration(s): 
{missing_deployments}\")\n", + " print(\" Add these to your .env file and restart the kernel.\")\n", + "else:\n", + " print(f\"📋 Configuring default model deployments...\")\n", + " try:\n", + " result = client.update_defaults({\n", + " \"gpt-4.1\": GPT_4_1_DEPLOYMENT,\n", + " \"gpt-4.1-mini\": GPT_4_1_MINI_DEPLOYMENT,\n", + " \"text-embedding-3-large\": TEXT_EMBEDDING_3_LARGE_DEPLOYMENT\n", + " })\n", + " print(f\"✅ Default model deployments configured successfully\")\n", + " except Exception as e:\n", + " print(f\"❌ Failed to configure defaults: {e}\")\n", + " raise" + ] + }, + { + "cell_type": "markdown", + "id": "b8544696", + "metadata": {}, + "source": [ + "## Analyze Legal Transcript\n", + "\n", + "We'll use a publicly available deposition transcript from the Internet Archive. This is a real legal document with the standard line-numbered format used in depositions.\n", + "\n", + "**Sample Document:** [Farr Deposition Transcript](https://archive.org/details/799436-farr-deposition-transcript) (15 pages, Public Domain)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "18932e9f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "🔍 Analyzing legal transcript from local file...\n", + " Document: c:\\src\\azure-ai-content-understanding-python\\data\\legal_examples\\Trascript Example.pdf\n", + " Analyzer: prebuilt-layout\n", + " File size: 1,666,047 bytes\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:content_understanding_client:Analyzing binary file c:\\src\\azure-ai-content-understanding-python\\data\\legal_examples\\Trascript Example.pdf with analyzer: prebuilt-layout\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⏳ Waiting for analysis to complete...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:content_understanding_client:Request 9e72763b-ecdf-4b8d-8f9b-a6733d30b6b1 in progress ...\n", + 
"INFO:content_understanding_client:Request 9e72763b-ecdf-4b8d-8f9b-a6733d30b6b1 in progress ...\n", + "INFO:content_understanding_client:Request 9e72763b-ecdf-4b8d-8f9b-a6733d30b6b1 in progress ...\n", + "INFO:content_understanding_client:Request result is ready after 7.11 seconds.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Analysis completed!\n", + "\n", + "📄 Document Information:\n", + " Pages: 1 - 52\n", + " Total pages: 52\n", + "💾 Analysis result saved to: test_output\\legal_transcript_analysis_20260122_162606.json\n", + "\n", + "💾 Full analysis saved to: test_output\\legal_transcript_analysis_20260122_162606.json\n" + ] + } + ], + "source": [ + "# Analyze legal transcript from local file\n", + "# Using the transcript example from the data/legal_examples folder\n", + "document_path = os.path.join(os.path.dirname(os.getcwd()), 'data', 'legal_examples', 'Trascript Example.pdf')\n", + "analyzer_id = 'prebuilt-layout'\n", + "\n", + "print(f\"🔍 Analyzing legal transcript from local file...\")\n", + "print(f\" Document: {document_path}\")\n", + "print(f\" Analyzer: {analyzer_id}\")\n", + "\n", + "# Verify file exists\n", + "if not os.path.exists(document_path):\n", + " raise FileNotFoundError(f\"Document not found: {document_path}\")\n", + "\n", + "file_size = os.path.getsize(document_path)\n", + "print(f\" File size: {file_size:,} bytes\")\n", + "\n", + "# Analyze the document using binary file path\n", + "response = client.begin_analyze_binary(\n", + " analyzer_id=analyzer_id,\n", + " file_location=document_path\n", + ")\n", + "\n", + "print(f\"⏳ Waiting for analysis to complete...\")\n", + "result = client.poll_result(response)\n", + "print(f\"✅ Analysis completed!\")\n", + "\n", + "# Get document info\n", + "contents = result.get(\"result\", {}).get(\"contents\", [])\n", + "if contents:\n", + " content = contents[0]\n", + " if content.get(\"kind\") == \"document\":\n", + " print(f\"\\n📄 Document Information:\")\n", + " 
print(f\" Pages: {content.get('startPageNumber')} - {content.get('endPageNumber')}\")\n", + " print(f\" Total pages: {content.get('endPageNumber') - content.get('startPageNumber') + 1}\")\n", + "\n", + "# Save the full result for processing\n", + "saved_json_path = save_json_to_file(result, filename_prefix=\"legal_transcript_analysis\")\n", + "print(f\"\\n💾 Full analysis saved to: {saved_json_path}\")" + ] + }, + { + "cell_type": "markdown", + "id": "c45dcbdb", + "metadata": {}, + "source": [ + "## View Default Markdown Output\n", + "\n", + "Let's first look at Content Understanding's default markdown output. Notice how the **line numbers are grouped separately** at the bottom of each page's content rather than inline with the text." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "b5b60915", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "📄 Default Markdown Output (first 2000 chars):\n", + "============================================================\n", + "# (B&W) PROTECTED BY MINNESOTA TOBACCO LITIGATION PROTECTIVE ORDER\n", + "\n", + "SUPERIOR COURT OF NEW JERSEY\n", + "MERCER COUNTY-LAW DIVISION,\n", + "DOCKET NO. 
L-90-2940\n", + "\n", + ":\n", + "\n", + ":\n", + "\n", + "IN RE:\n", + "IN THE MATTER OF\n", + "SUSAN MICHAUD\n", + "\n", + ":\n", + "\n", + "DEPOSITION OF:\n", + "\n", + ":\n", + "\n", + "Susan Michaud\n", + "\n", + ":\n", + "\n", + ":\n", + "\n", + "Transcript of proceedings taken on July 13, 1990,\n", + "at 1 pm, at the office of Mason, Griffin & Pierson, 101 Poor\n", + "Farm Road, Princeton, NJ 08540.\n", + "\n", + "682499390\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "# (B&W) PROTECTED BY MINNESOTA TOBACCO LITIGATION PROTECTIVE ORDER\n", + "\n", + "1\n", + "2\n", + "3\n", + "4\n", + "5\n", + "6\n", + "7\n", + "8\n", + "9\n", + "10\n", + "11\n", + "12\n", + "13\n", + "14\n", + "15\n", + "16\n", + "17\n", + "18\n", + "19\n", + "20\n", + "21\n", + "22\n", + "23\n", + "24\n", + "25\n", + "\n", + "2\n", + "\n", + "APPEARANCES\n", + "\n", + "On behalf of\n", + "Susan Michaud:\n", + "\n", + "MASON, GRIFFIN & PIERSON\n", + "BY: Stephanie J. Briody, Esq.\n", + "101 Poor Farm Road\n", + "Princton, NJ 08540\n", + "\n", + "On behalf of Dr. Alfred\n", + "Cook, Dr. Charles Howard &\n", + "Princeton Radiology Assoc.\n", + "\n", + "JACKSON & VAURIO\n", + "BY: John Zen Jackson, Esq.\n", + "1000 Herrontown Road\n", + "Princeton, NJ 08540\n", + "\n", + "682499391\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "## (B&W) PROTECTED BY MINNESOTA TOBACCO LITIGATION PROTECTIVE ORDER\n", + "\n", + "3\n", + "\n", + "1\n", + "2\n", + "3\n", + "4\n", + "5\n", + "6\n", + "7\n", + "8\n", + "9\n", + "10\n", + "11\n", + "12\n", + "13\n", + "14\n", + "15\n", + "16\n", + "17\n", + "18\n", + "19\n", + "20\n", + "21\n", + "22\n", + "23\n", + "24\n", + "25\n", + "\n", + "INDEX\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
WITNESS:DIRECTCROSSREDIRECTRECROSS
Susan Michaud4404444
\n", + "\n", + "\n", + "EXHIBITS:\n", + "Diagram (P-1)\n", + "\n", + "EVIDENCE\n", + "\n", + "IDENTIFICATION\n", + "\n", + "23\n", + "\n", + "682499392\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "### (B&W) PROTECTED BY MINNESOTA TOBACCO LITIGATION PROTECTIVE ORDER\n", + "\n", + "4\n", + "\n", + "Susan Michaud, M-I-C-H-A-U-D, sworn by the Notary Public,\n", + "testified as follows.\n", + "\n", + "DIRECT EXAMINATION BY\n", + "\n", + "MS. BRIODY:\n", + "\n", + "Q.\n", + "Susan, how old are you at the present time?\n", + "\n", + "A.\n", + "Just turned thirty-eight.\n", + "\n", + "Q.\n", + "And are you married?\n", + "\n", + "A. Yes, I am.\n", + "\n", + "Q.\n", + "And for how many years have you been married?\n", + "\n", + "============================================================\n", + "\n", + "... (Total length: 66709 characters)\n" + ] + } + ], + "source": [ + "# Show the default markdown output (first 2000 characters)\n", + "markdown = content.get(\"markdown\", \"\")\n", + "\n", + "print(\"📄 Default Markdown Output (first 2000 chars):\")\n", + "print(\"=\" * 60)\n", + "print(markdown[:2000])\n", + "print(\"=\" * 60)\n", + "print(f\"\\n... (Total length: {len(markdown)} characters)\")" + ] + }, + { + "cell_type": "markdown", + "id": "eb101887", + "metadata": {}, + "source": [ + "## How Reflow Works\n", + "\n", + "The reflow algorithm uses **bounding box coordinates** from the JSON output to match line numbers with their corresponding text:\n", + "\n", + "### Step 1: Parse Coordinates\n", + "Every element in CU's JSON has a `source` field with position data:\n", + "```\n", + "\"source\": \"D(1,1.0309,1.1277,1.131,1.1277,1.131,1.2711,1.0309,1.2711)\"\n", + " D(page, x1,y1, x2,y2, x3,y3, x4,y4)\n", + "```\n", + "\n", + "### Step 2: Group by Vertical Position\n", + "Elements with similar Y values (within ~0.15 inches) are on the same horizontal line.\n", + "\n", + "### Step 3: Sort Left-to-Right\n", + "Within each group, sort by X coordinate. 
Line numbers (X ≈ 1.0\") come before text content (X ≈ 1.3\"+).\n", + "\n", + "### Step 4: Combine\n", + "Pair line numbers with their corresponding text and output as `N | text content`." + ] + }, + { + "cell_type": "markdown", + "id": "13abdb4f", + "metadata": {}, + "source": [ + "## Reflow Functions\n", + "\n", + "Here are the core functions for reflowing Content Understanding output to include inline line numbers:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "4a421c01", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Reflow functions loaded successfully!\n" + ] + } + ], + "source": [ + "import re\n", + "from dataclasses import dataclass\n", + "from typing import Optional\n", + "\n", + "\n", + "@dataclass\n", + "class LineElement:\n", + " \"\"\"Represents a line or element from the document with its position.\"\"\"\n", + " content: str\n", + " y_position: float # Top Y coordinate\n", + " x_position: float # Left X coordinate\n", + " page_number: int\n", + " is_line_number: bool = False\n", + "\n", + "\n", + "def parse_source_coordinates(source: str) -> tuple[int, float, float, float, float]:\n", + " \"\"\"\n", + " Parse the source coordinate string from Content Understanding.\n", + " \n", + " The source format is: D(pageNumber,x1,y1,x2,y2,x3,y3,x4,y4)\n", + " where the points represent a quadrilateral (upper-left, upper-right, lower-right, lower-left)\n", + " \n", + " Returns:\n", + " Tuple of (page_number, left_x, top_y, right_x, bottom_y)\n", + " \"\"\"\n", + " match = re.match(r'D\\((\\d+),([^)]+)\\)', source)\n", + " if not match:\n", + " raise ValueError(f\"Invalid source format: {source}\")\n", + " \n", + " page_number = int(match.group(1))\n", + " coords = [float(x) for x in match.group(2).split(',')]\n", + " \n", + " if len(coords) == 8:\n", + " # Bounding polygon: x1,y1,x2,y2,x3,y3,x4,y4\n", + " x1, y1, x2, y2, x3, y3, x4, y4 = coords\n", + " left_x = min(x1, x4)\n", + " top_y 
= min(y1, y2)\n", + " elif len(coords) == 4:\n", + " # Axis-aligned bounding box: left, top, width, height\n", + " left_x, top_y, width, height = coords\n", + " else:\n", + " raise ValueError(f\"Unexpected coordinate count: {source}\")\n", + " \n", + " return page_number, left_x, top_y, 0, 0 # We only need left_x and top_y\n", + "\n", + "\n", + "def is_line_number(content: str) -> bool:\n", + " \"\"\"Check if content is a line number (1-99).\"\"\"\n", + " return content.strip().isdigit() and 1 <= int(content.strip()) <= 99\n", + "\n", + "\n", + "def is_noise_element(content: str) -> bool:\n", + " \"\"\"Check if content is noise (bullets, single dots) that should be filtered.\"\"\"\n", + " content = content.strip()\n", + " return content in ['·', '•', '∙'] or (len(content) == 1 and not content.isalnum())\n", + "\n", + "\n", + "def extract_lines_from_page(page_data: dict) -> list[LineElement]:\n", + " \"\"\"Extract all lines from a page with position information.\"\"\"\n", + " elements = []\n", + " page_number = page_data.get('pageNumber', 1)\n", + " \n", + " for line in page_data.get('lines', []):\n", + " content = line.get('content', '').strip()\n", + " source = line.get('source', '')\n", + " \n", + " if not source or not content or is_noise_element(content):\n", + " continue\n", + " \n", + " try:\n", + " parsed_page, left_x, top_y, _, _ = parse_source_coordinates(source)\n", + " element = LineElement(\n", + " content=content,\n", + " y_position=top_y,\n", + " x_position=left_x,\n", + " page_number=parsed_page,\n", + " is_line_number=is_line_number(content)\n", + " )\n", + " elements.append(element)\n", + " except ValueError:\n", + " continue\n", + " \n", + " return elements\n", + "\n", + "\n", + "def group_lines_by_vertical_position(elements: list[LineElement], \n", + " y_tolerance: float = 0.15) -> list[list[LineElement]]:\n", + " \"\"\"Group elements that appear on the same horizontal line.\"\"\"\n", + " if not elements:\n", + " return []\n", + " \n", + " 
sorted_elements = sorted(elements, key=lambda e: e.y_position)\n", + " groups = []\n", + " current_group = [sorted_elements[0]]\n", + " current_y = sorted_elements[0].y_position\n", + " \n", + " for element in sorted_elements[1:]:\n", + " if abs(element.y_position - current_y) <= y_tolerance:\n", + " current_group.append(element)\n", + " else:\n", + " groups.append(current_group)\n", + " current_group = [element]\n", + " current_y = element.y_position\n", + " \n", + " if current_group:\n", + " groups.append(current_group)\n", + " \n", + " return groups\n", + "\n", + "\n", + "def reflow_page_with_line_numbers(page_data: dict, separator: str = \" | \") -> str:\n", + " \"\"\"Reflow a single page's content to include line numbers inline.\"\"\"\n", + " elements = extract_lines_from_page(page_data)\n", + " if not elements:\n", + " return \"\"\n", + " \n", + " line_groups = group_lines_by_vertical_position(elements)\n", + " output_lines = []\n", + " \n", + " for group in line_groups:\n", + " # Sort by X position (left to right)\n", + " group.sort(key=lambda e: e.x_position)\n", + " \n", + " line_numbers = [e for e in group if e.is_line_number]\n", + " content_elements = [e for e in group if not e.is_line_number]\n", + " \n", + " if not content_elements:\n", + " continue\n", + " \n", + " combined_content = ' '.join(e.content for e in content_elements)\n", + " \n", + " if line_numbers:\n", + " line_num = line_numbers[0].content\n", + " output_lines.append(f\"{line_num}{separator}{combined_content}\")\n", + " else:\n", + " output_lines.append(combined_content)\n", + " \n", + " return '\\n'.join(output_lines)\n", + "\n", + "\n", + "def reflow_document(json_data: dict, target_page: Optional[int] = None, \n", + " separator: str = \" | \") -> str:\n", + " \"\"\"Reflow an entire document or specific page with line numbers inline.\"\"\"\n", + " contents = json_data.get('result', {}).get('contents', [])\n", + " if not contents:\n", + " raise ValueError(\"No contents found in JSON 
data\")\n", + " \n", + " content = contents[0]\n", + " pages = content.get('pages', [])\n", + " if not pages:\n", + " raise ValueError(\"No pages found in document content\")\n", + " \n", + " output_parts = []\n", + " \n", + " for page in pages:\n", + " page_number = page.get('pageNumber', 0)\n", + " if target_page is not None and page_number != target_page:\n", + " continue\n", + " \n", + " page_output = reflow_page_with_line_numbers(page, separator)\n", + " if page_output:\n", + " if target_page is None:\n", + " output_parts.append(f\"\\n\\n\")\n", + " output_parts.append(page_output)\n", + " \n", + " return '\\n'.join(output_parts)\n", + "\n", + "print(\"✅ Reflow functions loaded successfully!\")" + ] + }, + { + "cell_type": "markdown", + "id": "2d8d139d", + "metadata": {}, + "source": [ + "## Reflow a Single Page\n", + "\n", + "Let's reflow page 3 of the transcript to see the line numbers inline with the text:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "7ed1b4ab", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "📄 Reflowed Output for Page 3:\n", + "============================================================\n", + "(B&W) PROTECTED BY MINNESOTA TOBACCO LITIGATION PROTECTIVE ORDER\n", + "1 | INDEX\n", + "2 | WITNESS: DIRECT CROSS REDIRECT RECROSS\n", + "3 | Susan Michaud\n", + "6 | EXHIBITS: EVIDENCE IDENTIFICATION\n", + "7 | Diagram (P-1)\n", + "682499392\n", + "http://legacy.library.ucsf.e6u/tid/fuq07a00/pdf.industrydocuments.ucsf.edu/docs/khhl0001\n", + "============================================================\n" + ] + } + ], + "source": [ + "# Reflow a single page (page 3)\n", + "page_to_reflow = 3\n", + "\n", + "print(f\"📄 Reflowed Output for Page {page_to_reflow}:\")\n", + "print(\"=\" * 60)\n", + "reflowed_page = reflow_document(result, target_page=page_to_reflow)\n", + "print(reflowed_page)\n", + "print(\"=\" * 60)" + ] + }, + { + "cell_type": "markdown", + "id": "a3a635c8", + 
"metadata": {}, + "source": [ + "## Reflow Entire Document\n", + "\n", + "Now let's reflow the entire document and save it to a file:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "cc9a0906", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "📄 Reflowing entire document...\n", + "✅ Reflowed document saved to: c:\\src\\azure-ai-content-understanding-python\\notebooks\\test_output\\legal_transcript_reflowed.md\n", + " Total characters: 65678\n", + "\n", + "📄 Preview (first 3000 characters):\n", + "============================================================\n", + "\n", + "\n", + "\n", + "(B&W) PROTECTED BY MINNESOTA TOBACCO LITIGATION PROTECTIVE ORDER\n", + "SUPERIOR COURT OF NEW JERSEY\n", + "MERCER COUNTY-LAW DIVISION,\n", + "DOCKET NO. L-90-2940\n", + "IN RE: IN THE MATTER OF\n", + "SUSAN MICHAUD\n", + "DEPOSITION OF:\n", + "Susan Michaud\n", + "Transcript of proceedings taken on July 13, 1990,\n", + "at 1 pm, at the office of Mason, Griffin & Pierson, 101 Poor\n", + "Farm Road, Princeton, NJ 08540.\n", + "682499390\n", + "http://legacy.library.ucsf.e6u/tid/fuq07a00/pdfv.industrydocuments.ucsf.edu/docs/khhl0001\n", + "\n", + "\n", + "\n", + "(B&W) PROTECTED BY MINNESOTA TOBACCO LITIGATION PROTECTIVE ORDER\n", + "2 | APPEARANCES\n", + "3 | On behalf of\n", + "Susan Michaud: MASON, GRIFFIN & PIERSON\n", + "4 | BY: Stephanie J. Briody, Esq.\n", + "101 Poor Farm Road\n", + "5 | Princton, NJ 08540\n", + "6 | On behalf of Dr. Alfred\n", + "Cook, Dr. Charles Howard &\n", + "7 | Princeton Radiology Assoc. 
JACKSON & VAURIO\n", + "BY: John Zen Jackson, Esq.\n", + "8 | 1000 Herrontown Road\n", + "Princeton, NJ 08540\n", + "682499391\n", + "http://legacy.library.ucsf.e6u/tid/fuq07a00/pdfv.industrydocumėnts.ucsf.edu/docs/khhl0001\n", + "\n", + "\n", + "\n", + "(B&W) PROTECTED BY MINNESOTA TOBACCO LITIGATION PROTECTIVE ORDER\n", + "1 | INDEX\n", + "2 | WITNESS: DIRECT CROSS REDIRECT RECROSS\n", + "3 | Susan Michaud\n", + "6 | EXHIBITS: EVIDENCE IDENTIFICATION\n", + "7 | Diagram (P-1)\n", + "682499392\n", + "http://legacy.library.ucsf.e6u/tid/fuq07a00/pdf.industrydocuments.ucsf.edu/docs/khhl0001\n", + "\n", + "\n", + "\n", + "(B&W) PROTECTED BY MINNESOTA TOBACCO LITIGATION PROTECTIVE ORDER\n", + "1 | Susan Michaud, M-I-C-H-A-U-D, sworn by the Notary Public,\n", + "2 | testified as follows.\n", + "3 | DIRECT EXAMINATION BY\n", + "4 | MS. BRIODY:\n", + "5 | Q. Susan, how old are you at the present time?\n", + "6 | A. Just turned thirty-eight.\n", + "7 | Q. And are you married?\n", + "8 | A. Yes, I am.\n", + "9 | Q. And for how many years have you been married?\n", + "10 | A. Nineteen.\n", + "11 | Q. What year were you married?\n", + "12 | A. '71.\n", + "13 | Q. And to whom are you married?\n", + "14 | A. Thomas Michaud.\n", + "15 | Q. Do you have any children?\n", + "16 | A. Yes, I have one.\n", + "17 | Q. Is it a boy or a girl?\n", + "18 | A. A fourteen year old boy, almost fifteen.\n", + "19 | Q. What's his name?\n", + "20 | A. Matthew.\n", + "21 | Q. Did you go to high school in Princeton?\n", + "22 | A. Yes.\n", + "23 | Q. And what is your educational background?\n", + "24 | A. I have about thirty college credits beyond high school\n", + "25 | and that's all.\n", + "682499393\n", + "http://legacy.library.ucsf.edu/tid/fuq07a00/pdfv.industrydocuments.ucsf.edu/docs/khhl0001\n", + "\n", + "\n", + "\n", + "(B&W) PROTECTED BY MINNESOTA TOBACCO LITIGATION PROTECTIVE ORDER\n", + "1 | Q. Where did you get those credits?\n", + "2 | A. 
At Mercer County Community College.\n", + "3 | Q. Did you grow up in the Princeton area?\n", + "4 | A. Yes.\n", + "5 | Q. Where did you go to middle school or junior high\n", + "6 | school?\n", + "7 | A. Princeton community--Community Park.\n", + "8 | Q. It's called Community Park?\n", + "9 | A. It's called Community Park.\n", + "10 | Q. In Princeton?\n", + "11 | A. Yes.\n", + "12 | Q. For whom do you work?\n", + "13 | A. Nassau Federal Savings and Loan.\n", + "14 | Q. And what kind of work do you do for them?\n", + "15 | A. I am the director of their Human Resource Department.\n", + "16 | Q. And for how long have you worked for the bank?\n", + "17 | A. In September it will be thr\n", + "============================================================\n" + ] + } + ], + "source": [ + "# Reflow the entire document\n", + "print(\"📄 Reflowing entire document...\")\n", + "reflowed_document = reflow_document(result)\n", + "\n", + "# Save to file\n", + "output_path = os.path.join(os.getcwd(), 'test_output', 'legal_transcript_reflowed.md')\n", + "os.makedirs(os.path.dirname(output_path), exist_ok=True)\n", + "\n", + "with open(output_path, 'w', encoding='utf-8') as f:\n", + " f.write(reflowed_document)\n", + "\n", + "print(f\"✅ Reflowed document saved to: {output_path}\")\n", + "print(f\" Total characters: {len(reflowed_document)}\")\n", + "\n", + "# Show first 3000 characters\n", + "print(\"\\n📄 Preview (first 3000 characters):\")\n", + "print(\"=\" * 60)\n", + "print(reflowed_document[:3000])\n", + "print(\"=\" * 60)" + ] + }, + { + "cell_type": "markdown", + "id": "28ae13e7", + "metadata": {}, + "source": [ + "## Compare: Before vs After\n", + "\n", + "Let's compare the default output with the reflowed output for a specific page:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2d5dd7e7", + "metadata": {}, + "outputs": [], + "source": [ + "# Compare default vs reflowed for page 3\n", + "page_number = 3\n", + "\n", + "# Get the original markdown 
(extract just page 3 content - approximation)\n", + "original_lines = markdown.split('\\n')\n", + "\n", + "print(\"📊 COMPARISON: Default vs Reflowed Output\")\n", + "print(\"\\n\" + \"=\" * 30 + \" DEFAULT OUTPUT \" + \"=\" * 30)\n", + "print(\"(Line numbers grouped separately at page bottom)\")\n", + "print(\"-\" * 76)\n", + "\n", + "# Show a sample of the default output\n", + "sample_start = 200\n", + "sample_end = 800\n", + "print(markdown[sample_start:sample_end])\n", + "print(\"...\")\n", + "\n", + "print(\"\\n\" + \"=\" * 30 + \" REFLOWED OUTPUT \" + \"=\" * 29)\n", + "print(\"(Line numbers inline with text)\")\n", + "print(\"-\" * 76)\n", + "\n", + "# Show the reflowed output for page 3\n", + "reflowed_page = reflow_document(result, target_page=page_number)\n", + "print(reflowed_page[:800])\n", + "print(\"...\")" + ] + }, + { + "cell_type": "markdown", + "id": "a0fc1e27", + "metadata": {}, + "source": [ + "## Using the Standalone Script\n", + "\n", + "For batch processing or command-line usage, you can use the standalone script located at `python/reflow_markdown_with_line_numbers.py`:\n", + "\n", + "```bash\n", + "# Process a specific page\n", + "python python/reflow_markdown_with_line_numbers.py analysis.json --page 3\n", + "\n", + "# Process all pages and save to file\n", + "python python/reflow_markdown_with_line_numbers.py analysis.json --output reflowed.md\n", + "\n", + "# Custom separator\n", + "python python/reflow_markdown_with_line_numbers.py analysis.json --separator \" | \" --output reflowed.md\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "580894a1", + "metadata": {}, + "outputs": [], + "source": [ + "# Example: Run the standalone script on our saved JSON\n", + "import subprocess\n", + "\n", + "script_path = os.path.join(os.path.dirname(os.getcwd()), 'python', 'reflow_markdown_with_line_numbers.py')\n", + "output_file = os.path.join(os.getcwd(), 'test_output', 'legal_transcript_reflowed_script.md')\n", + "\n", 
+ "print(f\"🔧 Running standalone reflow script...\")\n", + "print(f\" Input: {saved_json_path}\")\n", + "print(f\" Output: {output_file}\")\n", + "\n", + "result_code = subprocess.run(\n", + " ['python', script_path, saved_json_path, '--output', output_file],\n", + " capture_output=True,\n", + " text=True\n", + ")\n", + "\n", + "if result_code.returncode == 0:\n", + " print(f\"✅ Script completed successfully!\")\n", + " print(result_code.stdout)\n", + "else:\n", + " print(f\"❌ Script failed:\")\n", + " print(result_code.stderr)" + ] + }, + { + "cell_type": "markdown", + "id": "0ac0ae60", + "metadata": {}, + "source": [ + "## Summary\n", + "\n", + "This notebook demonstrated how to:\n", + "\n", + "1. **Extract content** from legal transcripts using Azure Content Understanding's `prebuilt-layout` analyzer\n", + "2. **Understand the JSON structure** including the `source` field with bounding box coordinates\n", + "3. **Reflow the output** to include line numbers inline with text by:\n", + " - Parsing bounding box coordinates to determine element positions\n", + " - Grouping elements by vertical position (Y coordinate)\n", + " - Matching line numbers with their corresponding text content\n", + "4. 
**Use the standalone script** for batch processing\n", + "\n", + "### Use Cases\n", + "\n", + "This technique is valuable for:\n", + "- **Legal document processing** - Depositions, trial transcripts, court records\n", + "- **Academic citations** - Line-numbered source materials\n", + "- **Content indexing** - Building searchable databases with line-level citations\n", + "- **AI-powered legal research** - RAG applications that need line-accurate references\n", + "\n", + "### Learn More\n", + "\n", + "- [Content Understanding Document Elements](https://learn.microsoft.com/en-us/azure/ai-services/content-understanding/document/elements)\n", + "- [Document Markdown Representation](https://learn.microsoft.com/en-us/azure/ai-services/content-understanding/document/markdown)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/move_training_data_across_analyzers.ipynb b/notebooks/move_training_data_across_analyzers.ipynb new file mode 100644 index 00000000..428d9dbb --- /dev/null +++ b/notebooks/move_training_data_across_analyzers.ipynb @@ -0,0 +1,1299 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e3ff63c1", + "metadata": {}, + "source": [ + "# Move Training Data Across Analyzers\n", + "\n", + "This notebook demonstrates how to reuse training data from an existing analyzer when creating a new analyzer in the same Azure AI Content Understanding resource.\n", + "\n", + "## Overview\n", + "\n", + "When you have an analyzer with training data and want to create a new analyzer using the same labeled examples, you can reference the existing blob storage location without 
duplicating or moving the data.\n", + "\n", + "### Benefits\n", + "- **No data duplication**: Reuse existing training data without copying\n", + "- **Same resource**: Both analyzers access the same blob storage\n", + "- **Field portability**: Maintain stable `fieldId`s across analyzers\n", + "- **Rapid iteration**: Test schema variations quickly\n", + "\n", + "### Prerequisites\n", + "1. An existing analyzer with training data already configured\n", + "2. Azure AI service configured by following the [configuration steps](../README.md#configure-azure-ai-service-resource)\n", + "3. Required packages installed" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2f76b866", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -r ../requirements.txt" + ] + }, + { + "cell_type": "markdown", + "id": "a0032373", + "metadata": {}, + "source": [ + "## Create Azure AI Content Understanding Client\n", + "\n", + "> The [AzureContentUnderstandingClient](../python/content_understanding_client.py) is a utility class providing functions to interact with the Content Understanding API. Before the official release of the Content Understanding SDK, this acts as a lightweight SDK.\n", + "\n", + "> ⚠️ **Important**: Update the code below to match your Azure authentication method. Look for the `# IMPORTANT` comments and modify those sections accordingly.\n", + "\n", + "> ⚠️ **Note**: Using a subscription key works, but using a token provider with Azure Active Directory (AAD) is safer and highly recommended for production environments." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bcea7936", + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "import json\n", + "import os\n", + "import sys\n", + "import uuid\n", + "from pathlib import Path\n", + "from dotenv import find_dotenv, load_dotenv\n", + "from azure.identity import DefaultAzureCredential, get_bearer_token_provider\n", + "\n", + "load_dotenv(find_dotenv())\n", + "logging.basicConfig(level=logging.INFO)\n", + "\n", + "# For authentication, you can use either token-based authentication or a subscription key; only one method is required.\n", + "AZURE_AI_ENDPOINT = os.getenv(\"AZURE_AI_ENDPOINT\")\n", + "# IMPORTANT: Replace with your actual subscription key or set it in the \".env\" file if not using token authentication.\n", + "AZURE_AI_API_KEY = os.getenv(\"AZURE_AI_API_KEY\")\n", + "AZURE_AI_API_VERSION = os.getenv(\"AZURE_AI_API_VERSION\", \"2025-05-01-preview\")\n", + "\n", + "# Add the parent directory to the path to use shared modules\n", + "parent_dir = Path(Path.cwd()).parent\n", + "sys.path.append(str(parent_dir))\n", + "from python.content_understanding_client import AzureContentUnderstandingClient\n", + "\n", + "credential = DefaultAzureCredential()\n", + "token_provider = get_bearer_token_provider(credential, \"https://cognitiveservices.azure.com/.default\")\n", + "\n", + "client = AzureContentUnderstandingClient(\n", + " endpoint=AZURE_AI_ENDPOINT,\n", + " api_version=AZURE_AI_API_VERSION,\n", + " # IMPORTANT: Comment out token_provider if using subscription key\n", + " token_provider=token_provider,\n", + " # IMPORTANT: Uncomment this if using subscription key\n", + " # subscription_key=AZURE_AI_API_KEY,\n", + " x_ms_useragent=\"azure-ai-content-understanding-python/move_training_data\",\n", + ")\n", + "\n", + "print(\"✅ Content Understanding client initialized successfully!\")" + ] + }, + { + "cell_type": "markdown", + "id": "92e5f27f", + "metadata": {}, + "source": [ + "## Step 
1: List Available Analyzers\n", + "\n", + "First, let's see what analyzers are available in your resource. We'll look for analyzers that have training data configured." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fcbc218a", + "metadata": {}, + "outputs": [], + "source": [ + "# Get all analyzers in your resource\n", + "all_analyzers = client.get_all_analyzers()\n", + "analyzers_list = all_analyzers.get('value', [])\n", + "\n", + "print(f\"Found {len(analyzers_list)} analyzer(s) in your resource\\n\")\n", + "\n", + "# Display analyzer names and IDs\n", + "if analyzers_list:\n", + " print(\"Available analyzers:\")\n", + " for idx, analyzer in enumerate(analyzers_list, 1):\n", + " analyzer_id = analyzer.get('analyzerId', 'N/A')\n", + " analyzer_name = analyzer.get('name', 'N/A')\n", + " print(f\"{idx}. ID: {analyzer_id}\")\n", + " print(f\" Name: {analyzer_name}\")\n", + " print()\n", + "else:\n", + " print(\"No analyzers found. Please create an analyzer with training data first.\")\n", + " print(\"See: notebooks/analyzer_training.ipynb for guidance.\")" + ] + }, + { + "cell_type": "markdown", + "id": "8e6ae2ac", + "metadata": {}, + "source": [ + "## Step 2: Select Source Analyzer\n", + "\n", + "Specify the ID of the analyzer whose training data you want to reuse.\n", + "\n", + "Set `SOURCE_ANALYZER_ID` to an existing analyzer ID from the list above" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9772b0f5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Source Analyzer ID: invoiceLabeledData\n" + ] + } + ], + "source": [ + "# OPTION 1: Specify an existing analyzer ID that has training data\n", + "\n", + "# ⚠️ REQUIRED: Replace \"MyAnalyzer\" with your actual analyzer ID from the list above\n", + "# You can find available analyzer IDs in the output of the previous cell\n", + "SOURCE_ANALYZER_ID = \"MyAnalyzer\" # ← CHANGE THIS!\n", + "\n", + "# Uncomment to use the 
first analyzer from the list\n", + "# if analyzers_list:\n", + "# SOURCE_ANALYZER_ID = analyzers_list[0].get('id')\n", + "# print(f\"Using first analyzer: {SOURCE_ANALYZER_ID}\")\n", + "\n", + "print(f\"Source Analyzer ID: {SOURCE_ANALYZER_ID}\")" + ] + }, + { + "cell_type": "markdown", + "id": "d9b1bc93", + "metadata": {}, + "source": [ + "## Step 3: Retrieve Source Analyzer Details\n", + "\n", + "Now we'll fetch the complete definition of the source analyzer, including its training data configuration." + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "b2c9ae0c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Source Analyzer: invoiceLabeledData\n", + "Name: N/A\n", + "Description: \n", + "\n", + "Full analyzer definition:\n", + "{\n", + " \"analyzerId\": \"invoiceLabeledData\",\n", + " \"description\": \"\",\n", + " \"tags\": {\n", + " \"projectId\": \"d7afeaa4-fe05-4df7-bd7c-46f3a94a96cb\",\n", + " \"templateId\": \"document-2025-05-01\"\n", + " },\n", + " \"createdAt\": \"2025-10-22T22:03:08Z\",\n", + " \"lastModifiedAt\": \"2025-10-22T22:03:11Z\",\n", + " \"baseAnalyzerId\": \"prebuilt-documentAnalyzer\",\n", + " \"config\": {\n", + " \"returnDetails\": true,\n", + " \"enableOcr\": true,\n", + " \"enableLayout\": true,\n", + " \"enableFormula\": false,\n", + " \"disableContentFiltering\": false,\n", + " \"tableFormat\": \"html\",\n", + " \"estimateFieldSourceAndConfidence\": false\n", + " },\n", + " \"fieldSchema\": {\n", + " \"fields\": {\n", + " \"CompanyName\": {\n", + " \"type\": \"string\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Name of the pharmaceutical company involved in the rebate program\"\n", + " },\n", + " \"ProductDetails\": {\n", + " \"type\": \"array\",\n", + " \"description\": \"List of products with rebate and unit details\",\n", + " \"items\": {\n", + " \"type\": \"object\",\n", + " \"description\": \"Details of a single product\",\n", + " 
\"properties\": {\n", + " \"ProductPackageCode\": {\n", + " \"type\": \"string\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Code representing the product or package\"\n", + " },\n", + " \"ProductName\": {\n", + " \"type\": \"string\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Name of the product\"\n", + " },\n", + " \"FfsimcoRecordId\": {\n", + " \"type\": \"string\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Record ID for FFSIMCO\"\n", + " },\n", + " \"RebatePerUnit\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Rebate amount per unit of the product\"\n", + " },\n", + " \"AdjustedRebatePerUnit\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Adjusted rebate amount per unit\"\n", + " },\n", + " \"UnitsInvoiced\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Number of units invoiced\"\n", + " },\n", + " \"UnitsPaid\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Number of units for which payment was made\"\n", + " },\n", + " \"RebateAmountInvoiced\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Total rebate amount invoiced\"\n", + " },\n", + " \"RebateAmountPaid\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Total rebate amount paid\"\n", + " }\n", + " }\n", + " }\n", + " },\n", + " \"TotalPaid\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Total payment amount \"\n", + " }\n", + " }\n", + " },\n", + " \"trainingData\": {\n", + " \"containerUrl\": \"https://staistudiote203841201294.blob.core.windows.net/7c123b64-9378-4fa7-a807-081efa839c00-cu\",\n", + " \"kind\": \"blob\",\n", + " \"prefix\": \"labelingProjects/d7afeaa4-fe05-4df7-bd7c-46f3a94a96cb/train\"\n", + " },\n", + " \"warnings\": 
[],\n", + " \"status\": \"ready\",\n", + " \"processingLocation\": \"geography\",\n", + " \"mode\": \"standard\"\n", + "}\n" + ] + } + ], + "source": [ + "# Get detailed information about the source analyzer\n", + "source_analyzer = client.get_analyzer_detail_by_id(SOURCE_ANALYZER_ID)\n", + "\n", + "print(f\"Source Analyzer: {SOURCE_ANALYZER_ID}\")\n", + "print(f\"Name: {source_analyzer.get('name', 'N/A')}\")\n", + "print(f\"Description: {source_analyzer.get('description', 'N/A')}\")\n", + "print(\"\\nFull analyzer definition:\")\n", + "print(json.dumps(source_analyzer, indent=2))" + ] + }, + { + "cell_type": "markdown", + "id": "3eb0b65d", + "metadata": {}, + "source": [ + "## Step 4: Extract Training Data Configuration\n", + "\n", + "Extract the training data configuration from the source analyzer. This includes:\n", + "- **trainingData**: The blob container location with labeled examples\n", + "- **fieldSchema**: The field definitions\n", + "- **tags**: Project and template metadata (important for Azure AI Foundry project association)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "7c57655f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "📦 Training Data Configuration:\n", + "{\n", + " \"containerUrl\": \"https://staistudiote203841201294.blob.core.windows.net/7c123b64-9378-4fa7-a807-081efa839c00-cu\",\n", + " \"kind\": \"blob\",\n", + " \"prefix\": \"labelingProjects/d7afeaa4-fe05-4df7-bd7c-46f3a94a96cb/train\"\n", + "}\n", + "\n", + "✅ Found training data at: https://staistudiote203841201294.blob.core.windows.net/7c123b64-9378-4fa7-a807-081efa839c00-cu\n", + " Path prefix: labelingProjects/d7afeaa4-fe05-4df7-bd7c-46f3a94a96cb/train\n", + "\n", + "📚 Knowledge Sources Configuration:\n", + "No knowledge sources configured (this is normal for standard mode)\n", + "\n", + "📋 Field Schema:\n", + "{\n", + " \"fields\": {\n", + " \"CompanyName\": {\n", + " \"type\": \"string\",\n", + " 
\"method\": \"extract\",\n", + " \"description\": \"Name of the pharmaceutical company involved in the rebate program\"\n", + " },\n", + " \"ProductDetails\": {\n", + " \"type\": \"array\",\n", + " \"description\": \"List of products with rebate and unit details\",\n", + " \"items\": {\n", + " \"type\": \"object\",\n", + " \"description\": \"Details of a single product\",\n", + " \"properties\": {\n", + " \"ProductPackageCode\": {\n", + " \"type\": \"string\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Code representing the product or package\"\n", + " },\n", + " \"ProductName\": {\n", + " \"type\": \"string\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Name of the product\"\n", + " },\n", + " \"FfsimcoRecordId\": {\n", + " \"type\": \"string\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Record ID for FFSIMCO\"\n", + " },\n", + " \"RebatePerUnit\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Rebate amount per unit of the product\"\n", + " },\n", + " \"AdjustedRebatePerUnit\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Adjusted rebate amount per unit\"\n", + " },\n", + " \"UnitsInvoiced\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Number of units invoiced\"\n", + " },\n", + " \"UnitsPaid\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Number of units for which payment was made\"\n", + " },\n", + " \"RebateAmountInvoiced\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Total rebate amount invoiced\"\n", + " },\n", + " \"RebateAmountPaid\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Total rebate amount paid\"\n", + " }\n", + " }\n", + " }\n", + " },\n", + " \"TotalPaid\": {\n", + " \"type\": \"number\",\n", + " \"method\": 
\"extract\",\n", + " \"description\": \"Total payment amount \"\n", + " }\n", + " }\n", + "}\n", + "\n", + "🏷️ Tags (Project & Template Metadata):\n", + "{\n", + " \"projectId\": \"d7afeaa4-fe05-4df7-bd7c-46f3a94a96cb\",\n", + " \"templateId\": \"document-2025-05-01\"\n", + "}\n", + "\n", + "✅ Found Project ID: d7afeaa4-fe05-4df7-bd7c-46f3a94a96cb\n", + "✅ Found Template ID: document-2025-05-01\n", + "\n", + "💡 These tags will be copied to ensure the new analyzer appears in the same Azure AI Foundry project.\n" + ] + } + ], + "source": [ + "# Extract training data configuration\n", + "training_data_config = source_analyzer.get('trainingData')\n", + "knowledge_sources_config = source_analyzer.get('knowledgeSources')\n", + "field_schema = source_analyzer.get('fieldSchema', {})\n", + "tags = source_analyzer.get('tags', {})\n", + "\n", + "print(\"📦 Training Data Configuration:\")\n", + "if training_data_config:\n", + " print(json.dumps(training_data_config, indent=2))\n", + " container_url = training_data_config.get('containerUrl', 'N/A')\n", + " prefix = training_data_config.get('prefix', '')\n", + " print(f\"\\n✅ Found training data at: {container_url}\")\n", + " print(f\" Path prefix: {prefix}\")\n", + "else:\n", + " print(\"⚠️ No training data found in this analyzer.\")\n", + " print(\" Please select an analyzer that has training data configured.\")\n", + "\n", + "print(\"\\n📚 Knowledge Sources Configuration:\")\n", + "if knowledge_sources_config:\n", + " print(json.dumps(knowledge_sources_config, indent=2))\n", + "else:\n", + " print(\"No knowledge sources configured (this is normal for standard mode)\")\n", + "\n", + "print(\"\\n📋 Field Schema:\")\n", + "print(json.dumps(field_schema, indent=2))\n", + "\n", + "print(\"\\n🏷️ Tags (Project & Template Metadata):\")\n", + "if tags:\n", + " print(json.dumps(tags, indent=2))\n", + " project_id = tags.get('projectId')\n", + " template_id = tags.get('templateId')\n", + " if project_id:\n", + " print(f\"\\n✅ Found Project 
ID: {project_id}\")\n", + " if template_id:\n", + " print(f\"✅ Found Template ID: {template_id}\")\n", + " print(\"\\n💡 These tags will be copied to ensure the new analyzer appears in the same Azure AI Foundry project.\")\n", + "else:\n", + " print(\"No tags found (the new analyzer may not be associated with a Foundry project)\")" + ] + }, + { + "cell_type": "markdown", + "id": "e7770461", + "metadata": {}, + "source": [ + "## Step 5: Create New Analyzer with Existing Training Data\n", + "\n", + "Now we'll create a new analyzer that references the same training data. This new analyzer will:\n", + "- Use the same blob storage container and path\n", + "- Start with the same field schema (you can modify this)\n", + "- Have its own unique ID\n", + "- **Include the same tags** (projectId and templateId) to ensure it appears in the correct Azure AI Foundry project\n", + "\n", + "### Key Points:\n", + "- **Same resource**: Both analyzers are in the same Azure AI resource\n", + "- **No data duplication**: The training data stays in one place\n", + "- **Same project**: Tags ensure the analyzer appears in the same Foundry project\n", + "- **Independent lifecycle**: Each analyzer can be updated or deleted independently" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "98b0c9c3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Including tags from source analyzer (ensures correct project association in Foundry)\n", + " Project ID: d7afeaa4-fe05-4df7-bd7c-46f3a94a96cb\n", + " Template ID: document-2025-05-01\n", + "\n", + "Creating new analyzer: cloned-analyzer-c073f24d-5659-42ed-8ac8-b083bde79a9b\n", + "\n", + "New analyzer payload (ordered to match API structure):\n", + "{\n", + " \"description\": \"Created from invoiceLabeledData with reused training data\",\n", + " \"tags\": {\n", + " \"projectId\": \"d7afeaa4-fe05-4df7-bd7c-46f3a94a96cb\",\n", + " \"templateId\": \"document-2025-05-01\"\n", + " },\n", 
+ " \"baseAnalyzerId\": \"prebuilt-documentAnalyzer\",\n", + " \"config\": {\n", + " \"returnDetails\": true,\n", + " \"enableOcr\": true,\n", + " \"enableLayout\": true,\n", + " \"enableFormula\": false,\n", + " \"disableContentFiltering\": false,\n", + " \"tableFormat\": \"html\",\n", + " \"estimateFieldSourceAndConfidence\": false\n", + " },\n", + " \"fieldSchema\": {\n", + " \"fields\": {\n", + " \"CompanyName\": {\n", + " \"type\": \"string\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Name of the pharmaceutical company involved in the rebate program\"\n", + " },\n", + " \"ProductDetails\": {\n", + " \"type\": \"array\",\n", + " \"description\": \"List of products with rebate and unit details\",\n", + " \"items\": {\n", + " \"type\": \"object\",\n", + " \"description\": \"Details of a single product\",\n", + " \"properties\": {\n", + " \"ProductPackageCode\": {\n", + " \"type\": \"string\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Code representing the product or package\"\n", + " },\n", + " \"ProductName\": {\n", + " \"type\": \"string\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Name of the product\"\n", + " },\n", + " \"FfsimcoRecordId\": {\n", + " \"type\": \"string\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Record ID for FFSIMCO\"\n", + " },\n", + " \"RebatePerUnit\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Rebate amount per unit of the product\"\n", + " },\n", + " \"AdjustedRebatePerUnit\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Adjusted rebate amount per unit\"\n", + " },\n", + " \"UnitsInvoiced\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Number of units invoiced\"\n", + " },\n", + " \"UnitsPaid\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Number of units for which payment was 
made\"\n", + " },\n", + " \"RebateAmountInvoiced\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Total rebate amount invoiced\"\n", + " },\n", + " \"RebateAmountPaid\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Total rebate amount paid\"\n", + " }\n", + " }\n", + " }\n", + " },\n", + " \"TotalPaid\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Total payment amount \"\n", + " }\n", + " }\n", + " },\n", + " \"mode\": \"standard\"\n", + "}\n", + "\n", + "📦 Training data will be configured separately:\n", + " Container URL: https://staistudiote203841201294.blob.core.windows.net/7c123b64-9378-4fa7-a807-081efa839c00-cu\n", + " Prefix: labelingProjects/d7afeaa4-fe05-4df7-bd7c-46f3a94a96cb/train\n" + ] + } + ], + "source": [ + "# Verify we have training data before proceeding\n", + "if not training_data_config:\n", + " raise ValueError(\n", + " \"Cannot proceed: Source analyzer does not have training data. \"\n", + " \"Please select an analyzer with training data or create one using the optional cell above.\"\n", + " )\n", + "\n", + "# Create a new analyzer ID\n", + "# Analyzer names must be 1-64 characters and only contain letters, numbers, dots, underscores, or hyphens\n", + "NEW_ANALYZER_ID = \"cloned-analyzer-\" + str(uuid.uuid4())\n", + "\n", + "# Build the new analyzer payload in the correct order matching the API structure\n", + "# Note: Read-only fields like createdAt, lastModifiedAt, status, etc. are omitted as they're set by the service\n", + "new_analyzer_payload = {}\n", + "\n", + "# 1. Analyzer ID (not needed as it's passed separately, but kept for reference)\n", + "# new_analyzer_payload[\"analyzerId\"] = NEW_ANALYZER_ID\n", + "\n", + "# 2. Description\n", + "new_analyzer_payload[\"description\"] = f\"Created from {SOURCE_ANALYZER_ID} with reused training data\"\n", + "\n", + "# 3. 
Tags (projectId and templateId) - IMPORTANT for Foundry project association\n", + "if tags:\n", + " new_analyzer_payload[\"tags\"] = tags\n", + " print(\"✅ Including tags from source analyzer (ensures correct project association in Foundry)\")\n", + " print(f\" Project ID: {tags.get('projectId', 'N/A')}\")\n", + " print(f\" Template ID: {tags.get('templateId', 'N/A')}\")\n", + "else:\n", + " print(\"⚠️ No tags found in source analyzer - new analyzer may not appear in Foundry project\")\n", + "\n", + "# 4. Base Analyzer ID (if present)\n", + "if 'baseAnalyzerId' in source_analyzer:\n", + " new_analyzer_payload['baseAnalyzerId'] = source_analyzer['baseAnalyzerId']\n", + "\n", + "# 5. Config settings\n", + "if 'config' in source_analyzer:\n", + " new_analyzer_payload['config'] = source_analyzer['config']\n", + "\n", + "# 6. Field Schema\n", + "new_analyzer_payload[\"fieldSchema\"] = field_schema\n", + "\n", + "# 7. Training Data - Will be passed separately to begin_create_analyzer()\n", + "# Note: We extract the container URL and prefix to pass as separate parameters\n", + "training_container_sas_url = training_data_config.get('containerUrl', '')\n", + "training_container_prefix = training_data_config.get('prefix', '')\n", + "\n", + "# 8. Knowledge Sources (if present - typically for Pro mode)\n", + "# Extract these separately if they exist\n", + "pro_mode_container_sas_url = \"\"\n", + "pro_mode_container_prefix = \"\"\n", + "if knowledge_sources_config and isinstance(knowledge_sources_config, list) and len(knowledge_sources_config) > 0:\n", + " # Get the first knowledge source (typically there's only one)\n", + " first_knowledge_source = knowledge_sources_config[0]\n", + " pro_mode_container_sas_url = first_knowledge_source.get('containerUrl', '')\n", + " pro_mode_container_prefix = first_knowledge_source.get('prefix', '')\n", + "\n", + "# 9. 
Mode (if present)\n", + "if 'mode' in source_analyzer:\n", + " new_analyzer_payload['mode'] = source_analyzer['mode']\n", + "\n", + "print(f\"\\nCreating new analyzer: {NEW_ANALYZER_ID}\")\n", + "print(\"\\nNew analyzer payload (ordered to match API structure):\")\n", + "print(json.dumps(new_analyzer_payload, indent=2))\n", + "\n", + "print(\"\\n📦 Training data will be configured separately:\")\n", + "print(f\" Container URL: {training_container_sas_url}\")\n", + "print(f\" Prefix: {training_container_prefix}\")\n", + "\n", + "if pro_mode_container_sas_url:\n", + " print(\"\\n📚 Pro mode reference docs will be configured separately:\")\n", + " print(f\" Container URL: {pro_mode_container_sas_url}\")\n", + " print(f\" Prefix: {pro_mode_container_prefix}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "385a0867", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:python.content_understanding_client:Analyzer cloned-analyzer-c073f24d-5659-42ed-8ac8-b083bde79a9b create request accepted.\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + 
"INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + 
"INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + 
"INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + 
"INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + 
"INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + 
"INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + 
"INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request result is ready after 152.25 seconds.\n", + "INFO:python.content_understanding_client:Request result is ready after 152.25 seconds.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Successfully created new analyzer: cloned-analyzer-c073f24d-5659-42ed-8ac8-b083bde79a9b\n", + "\n", + "Creation result:\n", + "{\n", + " \"id\": \"a22ddf12-3156-4a9a-9675-7b85789a8686\",\n", + " \"status\": \"Succeeded\",\n", + " \"result\": {\n", + " \"analyzerId\": \"cloned-analyzer-c073f24d-5659-42ed-8ac8-b083bde79a9b\",\n", + " \"description\": \"Created from invoiceLabeledData with reused training data\",\n", + " \"tags\": {\n", + " \"projectId\": \"d7afeaa4-fe05-4df7-bd7c-46f3a94a96cb\",\n", + " \"templateId\": \"document-2025-05-01\"\n", + " },\n", + " \"createdAt\": \"2025-10-22T22:44:56Z\",\n", + " \"lastModifiedAt\": 
\"2025-10-22T22:47:27Z\",\n", + " \"baseAnalyzerId\": \"prebuilt-documentAnalyzer\",\n", + " \"config\": {\n", + " \"returnDetails\": true,\n", + " \"enableOcr\": true,\n", + " \"enableLayout\": true,\n", + " \"enableFormula\": false,\n", + " \"disableContentFiltering\": false,\n", + " \"tableFormat\": \"html\",\n", + " \"estimateFieldSourceAndConfidence\": false\n", + " },\n", + " \"fieldSchema\": {\n", + " \"fields\": {\n", + " \"CompanyName\": {\n", + " \"type\": \"string\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Name of the pharmaceutical company involved in the rebate program\"\n", + " },\n", + " \"ProductDetails\": {\n", + " \"type\": \"array\",\n", + " \"description\": \"List of products with rebate and unit details\",\n", + " \"items\": {\n", + " \"type\": \"object\",\n", + " \"description\": \"Details of a single product\",\n", + " \"properties\": {\n", + " \"ProductPackageCode\": {\n", + " \"type\": \"string\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Code representing the product or package\"\n", + " },\n", + " \"ProductName\": {\n", + " \"type\": \"string\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Name of the product\"\n", + " },\n", + " \"FfsimcoRecordId\": {\n", + " \"type\": \"string\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Record ID for FFSIMCO\"\n", + " },\n", + " \"RebatePerUnit\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Rebate amount per unit of the product\"\n", + " },\n", + " \"AdjustedRebatePerUnit\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Adjusted rebate amount per unit\"\n", + " },\n", + " \"UnitsInvoiced\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Number of units invoiced\"\n", + " },\n", + " \"UnitsPaid\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Number of 
units for which payment was made\"\n", + " },\n", + " \"RebateAmountInvoiced\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Total rebate amount invoiced\"\n", + " },\n", + " \"RebateAmountPaid\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Total rebate amount paid\"\n", + " }\n", + " }\n", + " }\n", + " },\n", + " \"TotalPaid\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Total payment amount \"\n", + " }\n", + " }\n", + " },\n", + " \"trainingData\": {\n", + " \"containerUrl\": \"https://staistudiote203841201294.blob.core.windows.net/7c123b64-9378-4fa7-a807-081efa839c00-cu\",\n", + " \"kind\": \"blob\",\n", + " \"prefix\": \"labelingProjects/d7afeaa4-fe05-4df7-bd7c-46f3a94a96cb/train/\"\n", + " },\n", + " \"warnings\": [],\n", + " \"status\": \"ready\",\n", + " \"processingLocation\": \"geography\",\n", + " \"mode\": \"standard\"\n", + " }\n", + "}\n" + ] + } + ], + "source": [ + "# Create the new analyzer\n", + "# Pass training data and knowledge sources as separate parameters\n", + "response = client.begin_create_analyzer(\n", + " NEW_ANALYZER_ID,\n", + " analyzer_template=new_analyzer_payload,\n", + " training_storage_container_sas_url=training_container_sas_url,\n", + " training_storage_container_path_prefix=training_container_prefix,\n", + ")\n", + "\n", + "result = client.poll_result(response)\n", + "\n", + "if result and result.get('status') == 'Succeeded':\n", + " print(f\"✅ Successfully created new analyzer: {NEW_ANALYZER_ID}\")\n", + " print(\"\\nCreation result:\")\n", + " print(json.dumps(result, indent=2))\n", + "else:\n", + " print(\"⚠️ Analyzer creation encountered an issue.\")\n", + " print(json.dumps(result, indent=2))" + ] + }, + { + "cell_type": "markdown", + "id": "63295659", + "metadata": {}, + "source": [ + "## Step 6: Verify the New Analyzer\n", + "\n", + "Let's confirm the new analyzer was created 
correctly and is using the same training data." + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "685ff06f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "New Analyzer: cloned-analyzer-c073f24d-5659-42ed-8ac8-b083bde79a9b\n", + "Name: N/A\n", + "Description: Created from invoiceLabeledData with reused training data\n", + "\n", + "Training Data Configuration:\n", + "{\n", + " \"containerUrl\": \"https://staistudiote203841201294.blob.core.windows.net/7c123b64-9378-4fa7-a807-081efa839c00-cu\",\n", + " \"kind\": \"blob\",\n", + " \"prefix\": \"labelingProjects/d7afeaa4-fe05-4df7-bd7c-46f3a94a96cb/train/\"\n", + "}\n", + "\n", + "✅ Verification successful: Both analyzers reference the same training data location!\n" + ] + } + ], + "source": [ + "# Get details of the newly created analyzer\n", + "new_analyzer = client.get_analyzer_detail_by_id(NEW_ANALYZER_ID)\n", + "\n", + "print(f\"New Analyzer: {NEW_ANALYZER_ID}\")\n", + "print(f\"Name: {new_analyzer.get('name', 'N/A')}\")\n", + "print(f\"Description: {new_analyzer.get('description', 'N/A')}\")\n", + "print(\"\\nTraining Data Configuration:\")\n", + "print(json.dumps(new_analyzer.get('trainingData', {}), indent=2))\n", + "\n", + "# Verify the training data location matches\n", + "new_training_data = new_analyzer.get('trainingData', {})\n", + "original_container = training_data_config.get('containerUrl', '')\n", + "new_container = new_training_data.get('containerUrl', '')\n", + "\n", + "if original_container == new_container:\n", + " print(\"\\n✅ Verification successful: Both analyzers reference the same training data location!\")\n", + "else:\n", + " print(\"\\n⚠️ Warning: Training data locations don't match.\")\n", + " print(f\"Original: {original_container}\")\n", + " print(f\"New: {new_container}\")" + ] + }, + { + "cell_type": "markdown", + "id": "fe3352c9", + "metadata": {}, + "source": [ + "## Step 7: Test Both Analyzers\n", + "\n", + "Now 
let's test both analyzers with a sample file to verify they both work correctly with the shared training data." + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "cc934efd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Testing with file: ../data/receipt.png\n" + ] + } + ], + "source": [ + "# Specify a test file - adjust this path based on your analyzer type\n", + "# For receipt analyzers:\n", + "test_file = \"../data/receipt.png\"\n", + "\n", + "# For invoice analyzers:\n", + "# test_file = \"../data/invoice.pdf\"\n", + "\n", + "# For custom documents:\n", + "# test_file = \"../data/your-document.pdf\"\n", + "\n", + "# Verify the file exists\n", + "if not Path(test_file).exists():\n", + " print(f\"⚠️ Test file not found: {test_file}\")\n", + " print(\"Please adjust the test_file path to match your use case.\")\n", + "else:\n", + " print(f\"Testing with file: {test_file}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "273dd85c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "📝 Analyzing with SOURCE analyzer: invoiceLabeledData\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:python.content_understanding_client:Analyzing file ../data/receipt.png with analyzer: invoiceLabeledData\n", + "INFO:python.content_understanding_client:Request 80b00372-a498-4564-9ff1-1e6901778a2d in progress ...\n", + "INFO:python.content_understanding_client:Request 80b00372-a498-4564-9ff1-1e6901778a2d in progress ...\n", + "INFO:python.content_understanding_client:Request 80b00372-a498-4564-9ff1-1e6901778a2d in progress ...\n", + "INFO:python.content_understanding_client:Request 80b00372-a498-4564-9ff1-1e6901778a2d in progress ...\n", + "INFO:python.content_understanding_client:Request result is ready after 4.71 seconds.\n", + "INFO:python.content_understanding_client:Request result is ready after 4.71 
seconds.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Source Analyzer Results:\n", + "Extracted 3 field(s)\n", + " - CompanyName: {'type': 'string', 'valueString': 'Contoso'}\n", + " - ProductDetails: {'type': 'array'}\n", + " - TotalPaid: {'type': 'number', 'valueNumber': 2516.28}\n" + ] + } + ], + "source": [ + "# Test the original analyzer\n", + "if Path(test_file).exists():\n", + " print(f\"\\n📝 Analyzing with SOURCE analyzer: {SOURCE_ANALYZER_ID}\")\n", + " response_source = client.begin_analyze(SOURCE_ANALYZER_ID, file_location=test_file)\n", + " result_source = client.poll_result(response_source)\n", + " \n", + " print(\"\\nSource Analyzer Results:\")\n", + " # Print a summary of extracted fields\n", + " if result_source.get('status') == 'Succeeded':\n", + " result_data = result_source.get('result', {})\n", + " fields = result_data.get('contents', [{}])[0].get('fields', {})\n", + " print(f\"Extracted {len(fields)} field(s)\")\n", + " for field_name, field_value in fields.items():\n", + " print(f\" - {field_name}: {field_value}\")\n", + " else:\n", + " print(json.dumps(result_source, indent=2))" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "e9654313", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "📝 Analyzing with NEW analyzer: cloned-analyzer-c073f24d-5659-42ed-8ac8-b083bde79a9b\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:python.content_understanding_client:Analyzing file ../data/receipt.png with analyzer: cloned-analyzer-c073f24d-5659-42ed-8ac8-b083bde79a9b\n", + "INFO:python.content_understanding_client:Request 5d982b83-4b1c-4e99-b045-48e36cb5a7e3 in progress ...\n", + "INFO:python.content_understanding_client:Request 5d982b83-4b1c-4e99-b045-48e36cb5a7e3 in progress ...\n", + "INFO:python.content_understanding_client:Request 5d982b83-4b1c-4e99-b045-48e36cb5a7e3 in progress ...\n", + 
"INFO:python.content_understanding_client:Request 5d982b83-4b1c-4e99-b045-48e36cb5a7e3 in progress ...\n", + "INFO:python.content_understanding_client:Request result is ready after 4.72 seconds.\n", + "INFO:python.content_understanding_client:Request result is ready after 4.72 seconds.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "New Analyzer Results:\n", + "Extracted 3 field(s)\n", + " - CompanyName: {'type': 'string', 'valueString': 'Contoso'}\n", + " - ProductDetails: {'type': 'array'}\n", + " - TotalPaid: {'type': 'number', 'valueNumber': 2516.28}\n", + "\n", + "✅ Both analyzers successfully processed the file using the shared training data!\n" + ] + } + ], + "source": [ + "# Test the new analyzer\n", + "if Path(test_file).exists():\n", + " print(f\"\\n📝 Analyzing with NEW analyzer: {NEW_ANALYZER_ID}\")\n", + " response_new = client.begin_analyze(NEW_ANALYZER_ID, file_location=test_file)\n", + " result_new = client.poll_result(response_new)\n", + " \n", + " print(\"\\nNew Analyzer Results:\")\n", + " # Print a summary of extracted fields\n", + " if result_new.get('status') == 'Succeeded':\n", + " result_data = result_new.get('result', {})\n", + " fields = result_data.get('contents', [{}])[0].get('fields', {})\n", + " print(f\"Extracted {len(fields)} field(s)\")\n", + " for field_name, field_value in fields.items():\n", + " print(f\" - {field_name}: {field_value}\")\n", + " else:\n", + " print(json.dumps(result_new, indent=2))\n", + " \n", + " print(\"\\n✅ Both analyzers successfully processed the file using the shared training data!\")" + ] + }, + { + "cell_type": "markdown", + "id": "f913b6dd", + "metadata": {}, + "source": [ + "## Step 8: Compare Results (Optional)\n", + "\n", + "Let's compare the full results from both analyzers side by side." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c6467b3f", + "metadata": {}, + "outputs": [], + "source": [ + "if Path(test_file).exists():\n", + " print(\"=\" * 80)\n", + " print(\"SOURCE ANALYZER FULL RESULTS\")\n", + " print(\"=\" * 80)\n", + " print(json.dumps(result_source, indent=2))\n", + " \n", + " print(\"\\n\" + \"=\" * 80)\n", + " print(\"NEW ANALYZER FULL RESULTS\")\n", + " print(\"=\" * 80)\n", + " print(json.dumps(result_new, indent=2))" + ] + }, + { + "cell_type": "markdown", + "id": "5f65f05c", + "metadata": {}, + "source": [ + "## Step 9: Cleanup (Optional)\n", + "\n", + "If you want to clean up the test analyzers, you can delete them. In production, you typically keep analyzers for reuse.\n", + "\n", + "⚠️ **Warning**: This will permanently delete the analyzer. The training data in blob storage will remain unaffected." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "00cde3ff", + "metadata": {}, + "outputs": [], + "source": [ + "# Uncomment to delete the new analyzer\n", + "# print(f\"Deleting new analyzer: {NEW_ANALYZER_ID}\")\n", + "# client.delete_analyzer(NEW_ANALYZER_ID)\n", + "# print(\"✅ New analyzer deleted\")\n", + "\n", + "# Uncomment to also delete the source analyzer (be careful!)\n", + "# print(f\"Deleting source analyzer: {SOURCE_ANALYZER_ID}\")\n", + "# client.delete_analyzer(SOURCE_ANALYZER_ID)\n", + "# print(\"✅ Source analyzer deleted\")" + ] + }, + { + "cell_type": "markdown", + "id": "d952dfef", + "metadata": {}, + "source": [ + "## Summary\n", + "\n", + "🎉 **Congratulations!** You have successfully:\n", + "\n", + "✅ Retrieved an existing analyzer with training data \n", + "✅ Extracted the training data configuration \n", + "✅ Created a new analyzer referencing the same training data \n", + "✅ Verified both analyzers work correctly \n", + "✅ Tested both analyzers with a sample file \n", + "\n", + "### Key Takeaways\n", + "\n", + "- **No data duplication**: Both analyzers reference 
the same blob storage location\n", + "- **Same resource**: Both analyzers use the same authentication and access permissions\n", + "- **Field portability**: You can maintain stable `fieldId`s across different analyzer versions\n", + "- **Rapid iteration**: Test schema changes quickly without re-uploading training data\n", + "\n", + "### Best Practices\n", + "\n", + "1. **Stable field IDs**: Keep `fieldId`s consistent across analyzers for easier migration\n", + "2. **Version control**: Maintain analyzer schemas in source control\n", + "3. **Documentation**: Document which blob paths contain which training datasets\n", + "4. **Testing**: Always test a new analyzer before deleting the original\n", + "5. **Naming conventions**: Use descriptive analyzer IDs that indicate purpose and version\n", + "\n", + "### Next Steps\n", + "\n", + "- Modify the field schema in the new analyzer to test different configurations\n", + "- Add additional training data to improve both analyzers\n", + "- Use this pattern to create A/B testing scenarios\n", + "- Explore other notebooks:\n", + " - [analyzer_training.ipynb](./analyzer_training.ipynb) - Create analyzers with training data\n", + " - [field_extraction.ipynb](./field_extraction.ipynb) - Extract fields from documents\n", + " - [management.ipynb](./management.ipynb) - Manage analyzer lifecycle" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/python/di_to_cu_migration_tool/README.md b/python/di_to_cu_migration_tool/README.md index e473ad0a..737a4ba5 100644 --- a/python/di_to_cu_migration_tool/README.md +++ 
b/python/di_to_cu_migration_tool/README.md @@ -1,13 +1,13 @@ # Document Intelligence to Content Understanding Migration Tool (Python) -Welcome! This tool helps convert your Document Intelligence (DI) datasets to the Content Understanding (CU) **Preview.2** 2025-05-01-preview format, as used in AI Foundry. The following DI versions are supported: +Welcome! This tool helps convert your Document Intelligence (DI) datasets to the Content Understanding (CU) **GA** 2025-11-01 format, as used in AI Foundry. The following DI versions are supported: - Custom Extraction Model DI 3.1 GA (2023-07-31) to DI 4.0 GA (2024-11-30) (Document Intelligence Studio) → DI-version = neural - Document Field Extraction Model 4.0 Preview (2024-07-31-preview) (AI Foundry / AI Services / Vision + Document / Document Field Extraction) → DI-version = generative To identify the version of your Document Intelligence dataset, please consult the sample documents in this folder to match your format. You can also verify the version by reviewing your DI project's user experience. For instance, Custom Extraction DI 3.1/4.0 GA appears in Document Intelligence Studio (https://documentintelligence.ai.azure.com/studio), whereas Document Field Extraction DI 4.0 Preview is only available on Azure AI Foundry's preview service (https://ai.azure.com/explore/aiservices/vision/document/extraction). -For migrating from these DI versions to Content Understanding Preview.2, this tool first converts the DI dataset into a CU-compatible format. After conversion, you can create a Content Understanding Analyzer trained on your converted CU dataset. Additionally, you have the option to test its quality against any sample documents. +For migrating from these DI versions to Content Understanding GA (2025-11-01), this tool first converts the DI dataset into a CU-compatible format. After conversion, you can create a Content Understanding Analyzer trained on your converted CU dataset. 
Additionally, you have the option to test its quality against any sample documents. ## Details About the Tools @@ -27,8 +27,26 @@ Here is a detailed breakdown of the three CLI tools and their functionality: * **call_analyze.py** * This CLI tool verifies that the migration completed successfully and assesses the quality of the created analyzer. + ## Setup +## Prerequisites + +⚠️ **IMPORTANT: Before using this migration tool**, ensure your Azure AI Foundry resource is properly configured for Content Understanding: + +1. **Configure Default Model Deployments**: You must set default model deployments for Content Understanding in your Foundry resource before creating or running analyzers. + + To do this, walk through the prerequisites here: + - [REST API Quickstart Guide](https://learn.microsoft.com/en-us/azure/ai-services/content-understanding/quickstart/use-rest-api?tabs=portal%2Cdocument) + + For more details about defaults, check out this documentation: + - [Models and Deployments Documentation](https://learn.microsoft.com/en-us/azure/ai-services/content-understanding/concepts/models-deployments) + +2. **Verify you can create and use a basic Content Understanding analyzer** in your Azure AI Foundry resource before attempting migration. This ensures all prerequisites are met. + +3. Complete all setup steps outlined in the REST API documentation above, including authentication and model deployment configuration. + +### Tool Setup Please follow these steps to set up the tool: 1. Install dependencies by running: @@ -43,7 +61,7 @@ Please follow these steps to set up the tool: - **SUBSCRIPTION_KEY:** Update to your Azure AI Service API Key or Subscription ID to authenticate the API requests. 
- Locate your API Key here: ![Azure AI Service Endpoints With Keys](assets/endpoint-with-keys.png) - If using Azure Active Directory (AAD), please refer to your Subscription ID: ![Azure AI Service Subscription ID](assets/subscription-id.png) - - **API_VERSION:** This is preset to the CU Preview.2 version; no changes are needed. + - **API_VERSION:** This is preset to the CU GA version (2025-11-01); no changes are needed. ## How to Locate Your Document Field Extraction Dataset for Migration @@ -73,8 +91,12 @@ To obtain SAS URLs for a file or folder for any container URL arguments, please 3. Configure permissions and expiry for your SAS URL as follows: - For the **DI source dataset**, please select permissions: _**Read & List**_ +Example: `https://<your-storage-account>.blob.core.windows.net/<di-container>?sv=<version>&spr=https&st=<start>&se=<expiry>&sr=c&sp=rl&sig=<signature>` (placeholder only — never commit a real SAS token) + - For the **CU target dataset**, please select permissions: _**Read, Add, Create, & Write**_ +Example: `https://<your-storage-account>.blob.core.windows.net/<cu-container>?sv=<version>&spr=https&st=<start>&se=<expiry>&sr=c&sp=racwl&sig=<signature>` (placeholder only — never commit a real SAS token) + After configuring, click **Generate SAS Token and URL** and copy the URL shown under **Blob SAS URL**. ![Generate SAS Pop-Up](assets/generate-sas-pop-up.png) @@ -155,7 +177,7 @@ Below are common issues you might encounter when creating an analyzer or running - **400 Bad Request** errors: Please validate the following: - The endpoint URL is valid. Example: - `https://yourEndpoint/contentunderstanding/analyzers/yourAnalyzerID?api-version=2025-05-01-preview` + `https://yourEndpoint/contentunderstanding/analyzers/yourAnalyzerID?api-version=2025-11-01` - Your converted CU dataset respects the naming constraints below. 
If needed, please manually correct the `analyzer.json` fields: - Field names start with a letter or underscore - Field name length must be between 1 and 64 characters @@ -174,7 +196,7 @@ Below are common issues you might encounter when creating an analyzer or running - **400 Bad Request**: This implies that you might have an incorrect endpoint or SAS URL. Please ensure that your endpoint is valid and that you are using the correct SAS URL for the document: - `https://yourendpoint/contentunderstanding/analyzers/yourAnalyzerID:analyze?api-version=2025-05-01-preview` + `https://yourendpoint/contentunderstanding/analyzers/yourAnalyzerID:analyze?api-version=2025-11-01` Confirm you are using the correct SAS URL for the document. - **401 Unauthorized**: @@ -189,4 +211,4 @@ Below are common issues you might encounter when creating an analyzer or running 2. Signature field types (e.g., in previous DI versions) are not yet supported in Content Understanding. These will be ignored during migration when creating the analyzer. 3. The content of your training documents is retained in the CU model's metadata, under storage specifically. You can find more details at: https://learn.microsoft.com/en-us/legal/cognitive-services/content-understanding/transparency-note?toc=%2Fazure%2Fai-services%2Fcontent-understanding%2Ftoc.json&bc=%2Fazure%2Fai-services%2Fcontent-understanding%2Fbreadcrumb%2Ftoc.json -4. All conversions are for Content Understanding preview.2 version only. \ No newline at end of file +4. All conversions are for Content Understanding GA (2025-11-01) version. 
\ No newline at end of file diff --git a/python/di_to_cu_migration_tool/constants.py b/python/di_to_cu_migration_tool/constants.py index 09dc9721..73f9e0ce 100644 --- a/python/di_to_cu_migration_tool/constants.py +++ b/python/di_to_cu_migration_tool/constants.py @@ -1,6 +1,6 @@ # Supported DI versions DI_VERSIONS = ["generative", "neural"] -CU_API_VERSION = "2025-05-01-preview" +CU_API_VERSION = "2025-11-01" # constants MAX_FIELD_COUNT = 100 diff --git a/python/di_to_cu_migration_tool/cu_converter_generative.py b/python/di_to_cu_migration_tool/cu_converter_generative.py index f27938d1..f384dc78 100644 --- a/python/di_to_cu_migration_tool/cu_converter_generative.py +++ b/python/di_to_cu_migration_tool/cu_converter_generative.py @@ -48,7 +48,7 @@ def format_angle(angle: float) -> float: formatted_num = f"{rounded_angle:.7f}".rstrip('0') # Remove trailing zeros return float(formatted_num) -def convert_fields_to_analyzer(fields_json_path: Path, analyzer_prefix: Optional[str], target_dir: Path, field_definitions: FieldDefinitions) -> dict: +def convert_fields_to_analyzer(fields_json_path: Path, analyzer_prefix: Optional[str], target_dir: Path, field_definitions: FieldDefinitions, target_container_sas_url: str = None, target_blob_folder: str = None) -> dict: """ Convert DI 4.0 preview Custom Document fields.json to analyzer.json format. Args: @@ -79,7 +79,11 @@ def convert_fields_to_analyzer(fields_json_path: Path, analyzer_prefix: Optional # build analyzer.json appropriately analyzer_data = { "analyzerId": analyzer_id, - "baseAnalyzerId": "prebuilt-documentAnalyzer", + "baseAnalyzerId": "prebuilt-document", + "models": { + "completion": "gpt-4.1", + "embedding": "text-embedding-3-large" + }, "config": { "returnDetails": True, # Add the following line as a temp workaround before service issue is fixed. 
@@ -121,6 +125,17 @@ def convert_fields_to_analyzer(fields_json_path: Path, analyzer_prefix: Optional else: analyzer_json_path = fields_json_path.parent / 'analyzer.json' + # Add knowledgeSources section if container info is provided + if target_container_sas_url and target_blob_folder: + analyzer_data["knowledgeSources"] = [ + { + "kind": "labeledData", + "containerUrl": target_container_sas_url, + "prefix": target_blob_folder, + "fileListPath": "" + } + ] + # Ensure target directory exists analyzer_json_path.parent.mkdir(parents=True, exist_ok=True) diff --git a/python/di_to_cu_migration_tool/cu_converter_neural.py b/python/di_to_cu_migration_tool/cu_converter_neural.py index d825f10e..64d4d33b 100644 --- a/python/di_to_cu_migration_tool/cu_converter_neural.py +++ b/python/di_to_cu_migration_tool/cu_converter_neural.py @@ -37,7 +37,7 @@ def convert_bounding_regions_to_source(page_number: int, polygon: list) -> str: source = f"D({page_number},{polygon_str})" return source -def convert_fields_to_analyzer_neural(fields_json_path: Path, analyzer_prefix: Optional[str], target_dir: Optional[Path], field_definitions: FieldDefinitions) -> Tuple[dict, dict]: +def convert_fields_to_analyzer_neural(fields_json_path: Path, analyzer_prefix: Optional[str], target_dir: Optional[Path], field_definitions: FieldDefinitions, target_container_sas_url: str = None, target_blob_folder: str = None) -> Tuple[dict, dict]: """ Convert DI 3.1/4.0GA Custom Neural fields.json to analyzer.json format. Args: @@ -67,7 +67,11 @@ def convert_fields_to_analyzer_neural(fields_json_path: Path, analyzer_prefix: O # Build analyzer.json content analyzer_data = { "analyzerId": analyzer_prefix, - "baseAnalyzerId": "prebuilt-documentAnalyzer", + "baseAnalyzerId": "prebuilt-document", + "models": { + "completion": "gpt-4.1", + "embedding": "text-embedding-3-large" + }, "config": { "returnDetails": True, # Add the following line as a temp workaround before service issue is fixed. 
@@ -132,6 +136,17 @@ def convert_fields_to_analyzer_neural(fields_json_path: Path, analyzer_prefix: O else: analyzer_json_path = fields_json_path.parent / 'analyzer.json' + # Add knowledgeSources section if container info is provided + if target_container_sas_url and target_blob_folder: + analyzer_data["knowledgeSources"] = [ + { + "kind": "labeledData", + "containerUrl": target_container_sas_url, + "prefix": target_blob_folder, + "fileListPath": "" + } + ] + # Ensure target directory exists analyzer_json_path.parent.mkdir(parents=True, exist_ok=True) diff --git a/python/di_to_cu_migration_tool/di_to_cu_converter.py b/python/di_to_cu_migration_tool/di_to_cu_converter.py index 5de14d91..c84111b8 100644 --- a/python/di_to_cu_migration_tool/di_to_cu_converter.py +++ b/python/di_to_cu_migration_tool/di_to_cu_converter.py @@ -8,7 +8,7 @@ import shutil import tempfile import typer -from typing import Tuple +from typing import Optional, Tuple # imports from external packages (in requirements.txt) from rich import print # For colored output @@ -161,7 +161,7 @@ def main( print(f"[yellow]WARNING: The following signatures were removed from the dataset: {removed_signatures}[/yellow]\n") print("Second: Running DI to CU dataset conversion...") - analyzer_data, ocr_files = running_cu_conversion(temp_dir, temp_target_dir, DI_version, analyzer_prefix, removed_signatures) + analyzer_data, ocr_files = running_cu_conversion(temp_dir, temp_target_dir, DI_version, analyzer_prefix, removed_signatures, target_container_sas_url, target_blob_folder) # Run OCR on the pdf files run_cu_layout_ocr(ocr_files, temp_target_dir, subscription_key) @@ -232,15 +232,17 @@ def running_field_type_conversion(temp_source_dir: Path, temp_dir: Path, DI_vers return removed_signatures -def running_cu_conversion(temp_dir: Path, temp_target_dir: Path, DI_version: str, analyzer_prefix: str, removed_signatures: list) -> Tuple[dict, list]: +def running_cu_conversion(temp_dir: Path, temp_target_dir: Path, 
DI_version: str, analyzer_prefix: Optional[str], removed_signatures: list, target_container_sas_url: str, target_blob_folder: str) -> Tuple[dict, list]: """ - Function to run the DI to CU conversion + Function to run the CU conversion Args: temp_dir (Path): The path to the source directory temp_target_dir (Path): The path to the target directory DI_version (str): The version of DI being used analyzer_prefix (str): The prefix for the analyzer name removed_signatures (list): The list of removed signatures that will not be used in the CU converter + target_container_sas_url (str): The target container SAS URL for training data + target_blob_folder (str): The target blob folder prefix for training data """ # Creating a FieldDefinitons object to handle the converison of definitions in the fields.json field_definitions = FieldDefinitions() @@ -251,9 +253,9 @@ def running_cu_conversion(temp_dir: Path, temp_target_dir: Path, DI_version: str assert fields_path.exists(), "fields.json is needed. Fields.json is missing from the given dataset." 
if DI_version == "generative": - analyzer_data = cu_converter_generative.convert_fields_to_analyzer(fields_path, analyzer_prefix, temp_target_dir, field_definitions) + analyzer_data = cu_converter_generative.convert_fields_to_analyzer(fields_path, analyzer_prefix, temp_target_dir, field_definitions, target_container_sas_url, target_blob_folder) elif DI_version == "neural": - analyzer_data, fields_dict = cu_converter_neural.convert_fields_to_analyzer_neural(fields_path, analyzer_prefix, temp_target_dir, field_definitions) + analyzer_data, fields_dict = cu_converter_neural.convert_fields_to_analyzer_neural(fields_path, analyzer_prefix, temp_target_dir, field_definitions, target_container_sas_url, target_blob_folder) ocr_files = [] # List to store paths to pdf files to get OCR results from later for file in files: diff --git a/python/di_to_cu_migration_tool/get_ocr.py b/python/di_to_cu_migration_tool/get_ocr.py index a1b849bf..32c0584f 100644 --- a/python/di_to_cu_migration_tool/get_ocr.py +++ b/python/di_to_cu_migration_tool/get_ocr.py @@ -70,7 +70,11 @@ def build_analyzer(credential, current_token, host, api_version, subscriptionKey request_body = { "analyzerId": analyzer_id, "description": "Sample analyzer", - "baseAnalyzerId": "prebuilt-documentAnalyzer", + "baseAnalyzerId": "prebuilt-document", + "models": { + "completion": "gpt-4.1", + "embedding": "text-embedding-3-large" + }, "config": { "returnDetails": True, "enableOcr": True, @@ -82,8 +86,7 @@ def build_analyzer(credential, current_token, host, api_version, subscriptionKey "fieldSchema": {}, "warnings": [], "status": "ready", - "processingLocation": "geography", - "mode": "standard" + "processingLocation": "geography" } endpoint = f"{host}/contentunderstanding/analyzers/{analyzer_id}?api-version={api_version}" print("[yellow]Creating sample analyzer to attain CU Layout results...[/yellow]") @@ -138,9 +141,8 @@ def run_cu_layout_ocr(input_files: list, output_dir_string: str, subscription_ke output_dir = 
Path(output_dir_string) output_dir.mkdir(parents=True, exist_ok=True) - # Need to create analyzer with empty schema - analyzer_id = build_analyzer(credential, current_token, host, api_version, subscription_key) - url = f"{host}/contentunderstanding/analyzers/{analyzer_id}:analyze?api-version={api_version}" + # Use prebuilt-read analyzer directly - no need to create a custom analyzer + url = f"{host}/contentunderstanding/analyzers/prebuilt-read:analyze?api-version={api_version}" for file in input_files: try: @@ -150,7 +152,7 @@ def run_cu_layout_ocr(input_files: list, output_dir_string: str, subscription_ke current_token = get_token(credential, current_token) headers = { "Authorization": f"Bearer {current_token.token}", - "Apim-Subscription-id": f"{subscription_key}", + "Ocp-Apim-Subscription-Key": f"{subscription_key}", "Content-Type": "application/pdf", } diff --git a/python/di_to_cu_migration_tool/sample_documents/analyzer_result.json b/python/di_to_cu_migration_tool/sample_documents/analyzer_result.json index bfa151fd..f1507dcd 100644 --- a/python/di_to_cu_migration_tool/sample_documents/analyzer_result.json +++ b/python/di_to_cu_migration_tool/sample_documents/analyzer_result.json @@ -3,7 +3,7 @@ "status": "Succeeded", "result": { "analyzerId": "mySampleAnalyzer", - "apiVersion": "2025-05-01-preview", + "apiVersion": "2025-11-01", "createdAt": "2025-05-30T15:47:15Z", "warnings": [], "contents": [