From ec3de7c81d2889e674b091f46ce34e4f132593e8 Mon Sep 17 00:00:00 2001 From: Joe Filcik Date: Thu, 23 Oct 2025 04:24:14 -0400 Subject: [PATCH 1/8] Example of how to move training data --- .../move_training_data_across_analyzers.ipynb | 3541 +++++++++++++++++ 1 file changed, 3541 insertions(+) create mode 100644 notebooks/move_training_data_across_analyzers.ipynb diff --git a/notebooks/move_training_data_across_analyzers.ipynb b/notebooks/move_training_data_across_analyzers.ipynb new file mode 100644 index 0000000..4117155 --- /dev/null +++ b/notebooks/move_training_data_across_analyzers.ipynb @@ -0,0 +1,3541 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e3ff63c1", + "metadata": {}, + "source": [ + "# Move Training Data Across Analyzers\n", + "\n", + "This notebook demonstrates how to reuse training data from an existing analyzer when creating a new analyzer in the same Azure AI Content Understanding resource.\n", + "\n", + "## Overview\n", + "\n", + "When you have an analyzer with training data and want to create a new analyzer using the same labeled examples, you can reference the existing blob storage location without duplicating or moving the data.\n", + "\n", + "### Benefits\n", + "- **No data duplication**: Reuse existing training data without copying\n", + "- **Same resource**: Both analyzers access the same blob storage\n", + "- **Field portability**: Maintain stable `fieldId`s across analyzers\n", + "- **Rapid iteration**: Test schema variations quickly\n", + "\n", + "### Prerequisites\n", + "1. An existing analyzer with training data already configured\n", + "2. Azure AI service configured by following the [configuration steps](../README.md#configure-azure-ai-service-resource)\n", + "3. 
Required packages installed" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "2f76b866", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Defaulting to user installation because normal site-packages is not writeable\n", + "Requirement already satisfied: aiohttp in /home/vscode/.local/lib/python3.11/site-packages (from -r ../requirements.txt (line 1)) (3.12.15)\n", + "Requirement already satisfied: azure-identity in /home/vscode/.local/lib/python3.11/site-packages (from -r ../requirements.txt (line 2)) (1.25.0)\n", + "Requirement already satisfied: azure-storage-blob in /home/vscode/.local/lib/python3.11/site-packages (from -r ../requirements.txt (line 3)) (12.26.0)\n", + "Requirement already satisfied: python-dotenv in /home/vscode/.local/lib/python3.11/site-packages (from -r ../requirements.txt (line 4)) (1.1.1)\n", + "Requirement already satisfied: requests in /home/vscode/.local/lib/python3.11/site-packages (from -r ../requirements.txt (line 5)) (2.32.5)\n", + "Requirement already satisfied: Pillow in /home/vscode/.local/lib/python3.11/site-packages (from -r ../requirements.txt (line 6)) (11.3.0)\n", + "Requirement already satisfied: aiohappyeyeballs>=2.5.0 in /home/vscode/.local/lib/python3.11/site-packages (from aiohttp->-r ../requirements.txt (line 1)) (2.6.1)\n", + "Requirement already satisfied: aiosignal>=1.4.0 in /home/vscode/.local/lib/python3.11/site-packages (from aiohttp->-r ../requirements.txt (line 1)) (1.4.0)\n", + "Requirement already satisfied: attrs>=17.3.0 in /home/vscode/.local/lib/python3.11/site-packages (from aiohttp->-r ../requirements.txt (line 1)) (25.3.0)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /home/vscode/.local/lib/python3.11/site-packages (from aiohttp->-r ../requirements.txt (line 1)) (1.7.0)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /home/vscode/.local/lib/python3.11/site-packages (from aiohttp->-r ../requirements.txt (line 
1)) (6.6.4)\n", + "Requirement already satisfied: propcache>=0.2.0 in /home/vscode/.local/lib/python3.11/site-packages (from aiohttp->-r ../requirements.txt (line 1)) (0.3.2)\n", + "Requirement already satisfied: yarl<2.0,>=1.17.0 in /home/vscode/.local/lib/python3.11/site-packages (from aiohttp->-r ../requirements.txt (line 1)) (1.20.1)\n", + "Requirement already satisfied: aiohttp in /home/vscode/.local/lib/python3.11/site-packages (from -r ../requirements.txt (line 1)) (3.12.15)\n", + "Requirement already satisfied: azure-identity in /home/vscode/.local/lib/python3.11/site-packages (from -r ../requirements.txt (line 2)) (1.25.0)\n", + "Requirement already satisfied: azure-storage-blob in /home/vscode/.local/lib/python3.11/site-packages (from -r ../requirements.txt (line 3)) (12.26.0)\n", + "Requirement already satisfied: python-dotenv in /home/vscode/.local/lib/python3.11/site-packages (from -r ../requirements.txt (line 4)) (1.1.1)\n", + "Requirement already satisfied: requests in /home/vscode/.local/lib/python3.11/site-packages (from -r ../requirements.txt (line 5)) (2.32.5)\n", + "Requirement already satisfied: Pillow in /home/vscode/.local/lib/python3.11/site-packages (from -r ../requirements.txt (line 6)) (11.3.0)\n", + "Requirement already satisfied: aiohappyeyeballs>=2.5.0 in /home/vscode/.local/lib/python3.11/site-packages (from aiohttp->-r ../requirements.txt (line 1)) (2.6.1)\n", + "Requirement already satisfied: aiosignal>=1.4.0 in /home/vscode/.local/lib/python3.11/site-packages (from aiohttp->-r ../requirements.txt (line 1)) (1.4.0)\n", + "Requirement already satisfied: attrs>=17.3.0 in /home/vscode/.local/lib/python3.11/site-packages (from aiohttp->-r ../requirements.txt (line 1)) (25.3.0)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /home/vscode/.local/lib/python3.11/site-packages (from aiohttp->-r ../requirements.txt (line 1)) (1.7.0)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in 
/home/vscode/.local/lib/python3.11/site-packages (from aiohttp->-r ../requirements.txt (line 1)) (6.6.4)\n", + "Requirement already satisfied: propcache>=0.2.0 in /home/vscode/.local/lib/python3.11/site-packages (from aiohttp->-r ../requirements.txt (line 1)) (0.3.2)\n", + "Requirement already satisfied: yarl<2.0,>=1.17.0 in /home/vscode/.local/lib/python3.11/site-packages (from aiohttp->-r ../requirements.txt (line 1)) (1.20.1)\n", + "Requirement already satisfied: azure-core>=1.31.0 in /home/vscode/.local/lib/python3.11/site-packages (from azure-identity->-r ../requirements.txt (line 2)) (1.35.1)\n", + "Requirement already satisfied: cryptography>=2.5 in /home/vscode/.local/lib/python3.11/site-packages (from azure-identity->-r ../requirements.txt (line 2)) (46.0.2)\n", + "Requirement already satisfied: msal>=1.30.0 in /home/vscode/.local/lib/python3.11/site-packages (from azure-identity->-r ../requirements.txt (line 2)) (1.34.0)\n", + "Requirement already satisfied: msal-extensions>=1.2.0 in /home/vscode/.local/lib/python3.11/site-packages (from azure-identity->-r ../requirements.txt (line 2)) (1.3.1)\n", + "Requirement already satisfied: typing-extensions>=4.0.0 in /home/vscode/.local/lib/python3.11/site-packages (from azure-identity->-r ../requirements.txt (line 2)) (4.15.0)\n", + "Requirement already satisfied: isodate>=0.6.1 in /home/vscode/.local/lib/python3.11/site-packages (from azure-storage-blob->-r ../requirements.txt (line 3)) (0.7.2)\n", + "Requirement already satisfied: charset_normalizer<4,>=2 in /home/vscode/.local/lib/python3.11/site-packages (from requests->-r ../requirements.txt (line 5)) (3.4.3)\n", + "Requirement already satisfied: idna<4,>=2.5 in /home/vscode/.local/lib/python3.11/site-packages (from requests->-r ../requirements.txt (line 5)) (3.10)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /home/vscode/.local/lib/python3.11/site-packages (from requests->-r ../requirements.txt (line 5)) (2.5.0)\n", + "Requirement already 
satisfied: certifi>=2017.4.17 in /home/vscode/.local/lib/python3.11/site-packages (from requests->-r ../requirements.txt (line 5)) (2025.8.3)\n", + "Requirement already satisfied: azure-core>=1.31.0 in /home/vscode/.local/lib/python3.11/site-packages (from azure-identity->-r ../requirements.txt (line 2)) (1.35.1)\n", + "Requirement already satisfied: cryptography>=2.5 in /home/vscode/.local/lib/python3.11/site-packages (from azure-identity->-r ../requirements.txt (line 2)) (46.0.2)\n", + "Requirement already satisfied: msal>=1.30.0 in /home/vscode/.local/lib/python3.11/site-packages (from azure-identity->-r ../requirements.txt (line 2)) (1.34.0)\n", + "Requirement already satisfied: msal-extensions>=1.2.0 in /home/vscode/.local/lib/python3.11/site-packages (from azure-identity->-r ../requirements.txt (line 2)) (1.3.1)\n", + "Requirement already satisfied: typing-extensions>=4.0.0 in /home/vscode/.local/lib/python3.11/site-packages (from azure-identity->-r ../requirements.txt (line 2)) (4.15.0)\n", + "Requirement already satisfied: isodate>=0.6.1 in /home/vscode/.local/lib/python3.11/site-packages (from azure-storage-blob->-r ../requirements.txt (line 3)) (0.7.2)\n", + "Requirement already satisfied: charset_normalizer<4,>=2 in /home/vscode/.local/lib/python3.11/site-packages (from requests->-r ../requirements.txt (line 5)) (3.4.3)\n", + "Requirement already satisfied: idna<4,>=2.5 in /home/vscode/.local/lib/python3.11/site-packages (from requests->-r ../requirements.txt (line 5)) (3.10)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /home/vscode/.local/lib/python3.11/site-packages (from requests->-r ../requirements.txt (line 5)) (2.5.0)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /home/vscode/.local/lib/python3.11/site-packages (from requests->-r ../requirements.txt (line 5)) (2025.8.3)\n", + "Requirement already satisfied: six>=1.11.0 in /home/vscode/.local/lib/python3.11/site-packages (from azure-core>=1.31.0->azure-identity->-r 
../requirements.txt (line 2)) (1.17.0)\n", + "Requirement already satisfied: six>=1.11.0 in /home/vscode/.local/lib/python3.11/site-packages (from azure-core>=1.31.0->azure-identity->-r ../requirements.txt (line 2)) (1.17.0)\n", + "Requirement already satisfied: cffi>=2.0.0 in /home/vscode/.local/lib/python3.11/site-packages (from cryptography>=2.5->azure-identity->-r ../requirements.txt (line 2)) (2.0.0)\n", + "Requirement already satisfied: PyJWT<3,>=1.0.0 in /home/vscode/.local/lib/python3.11/site-packages (from PyJWT[crypto]<3,>=1.0.0->msal>=1.30.0->azure-identity->-r ../requirements.txt (line 2)) (2.10.1)\n", + "Requirement already satisfied: cffi>=2.0.0 in /home/vscode/.local/lib/python3.11/site-packages (from cryptography>=2.5->azure-identity->-r ../requirements.txt (line 2)) (2.0.0)\n", + "Requirement already satisfied: PyJWT<3,>=1.0.0 in /home/vscode/.local/lib/python3.11/site-packages (from PyJWT[crypto]<3,>=1.0.0->msal>=1.30.0->azure-identity->-r ../requirements.txt (line 2)) (2.10.1)\n", + "Requirement already satisfied: pycparser in /home/vscode/.local/lib/python3.11/site-packages (from cffi>=2.0.0->cryptography>=2.5->azure-identity->-r ../requirements.txt (line 2)) (2.23)\n", + "Requirement already satisfied: pycparser in /home/vscode/.local/lib/python3.11/site-packages (from cffi>=2.0.0->cryptography>=2.5->azure-identity->-r ../requirements.txt (line 2)) (2.23)\n", + "\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.2\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n", + "\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.0\u001b[0m\u001b[39;49m -> 
\u001b[0m\u001b[32;49m25.2\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n", + "Note: you may need to restart the kernel to use updated packages.\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "%pip install -r ../requirements.txt" + ] + }, + { + "cell_type": "markdown", + "id": "a0032373", + "metadata": {}, + "source": [ + "## Create Azure AI Content Understanding Client\n", + "\n", + "> The [AzureContentUnderstandingClient](../python/content_understanding_client.py) is a utility class providing functions to interact with the Content Understanding API. Before the official release of the Content Understanding SDK, this acts as a lightweight SDK.\n", + "\n", + "> ⚠️ **Important**: Update the code below to match your Azure authentication method. Look for the `# IMPORTANT` comments and modify those sections accordingly.\n", + "\n", + "> ⚠️ **Note**: Using a subscription key works, but using a token provider with Azure Active Directory (AAD) is safer and highly recommended for production environments." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "bcea7936", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:azure.identity._credentials.environment:No environment configuration found.\n", + "INFO:azure.identity._credentials.managed_identity:ManagedIdentityCredential will use IMDS\n", + "INFO:azure.core.pipeline.policies.http_logging_policy:Request URL: 'http://169.254.169.254/metadata/identity/oauth2/token?api-version=REDACTED&resource=REDACTED'\n", + "Request method: 'GET'\n", + "Request headers:\n", + " 'User-Agent': 'azsdk-python-identity/1.25.0 Python/3.11.13 (Linux-6.8.0-1030-azure-x86_64-with-glibc2.41)'\n", + "No body was attached to the request\n", + "INFO:azure.identity._credentials.managed_identity:ManagedIdentityCredential will use IMDS\n", + "INFO:azure.core.pipeline.policies.http_logging_policy:Request URL: 'http://169.254.169.254/metadata/identity/oauth2/token?api-version=REDACTED&resource=REDACTED'\n", + "Request method: 'GET'\n", + "Request headers:\n", + " 'User-Agent': 'azsdk-python-identity/1.25.0 Python/3.11.13 (Linux-6.8.0-1030-azure-x86_64-with-glibc2.41)'\n", + "No body was attached to the request\n", + "INFO:azure.core.pipeline.policies.http_logging_policy:Response status: 400\n", + "Response headers:\n", + " 'Content-Type': 'application/json; charset=utf-8'\n", + " 'Server': 'IMDS/150.870.65.1854'\n", + " 'x-ms-request-id': '7683a8fc-6110-4d17-ba92-e7986c8af8e0'\n", + " 'Date': 'Wed, 22 Oct 2025 22:06:40 GMT'\n", + " 'Content-Length': '88'\n", + "INFO:azure.core.pipeline.policies.http_logging_policy:Request URL: 'http://169.254.169.254/metadata/identity/oauth2/token?api-version=REDACTED&resource=REDACTED'\n", + "Request method: 'GET'\n", + "Request headers:\n", + " 'Metadata': 'REDACTED'\n", + " 'User-Agent': 'azsdk-python-identity/1.25.0 Python/3.11.13 (Linux-6.8.0-1030-azure-x86_64-with-glibc2.41)'\n", + "No body was attached to the request\n", + 
"INFO:azure.core.pipeline.policies.http_logging_policy:Response status: 400\n", + "Response headers:\n", + " 'Content-Type': 'application/json; charset=utf-8'\n", + " 'Server': 'IMDS/150.870.65.1854'\n", + " 'x-ms-request-id': '31ec0b5d-182f-4981-8624-34083dd1c063'\n", + " 'Date': 'Wed, 22 Oct 2025 22:06:40 GMT'\n", + " 'Content-Length': '68'\n", + "INFO:azure.core.pipeline.policies.http_logging_policy:Response status: 400\n", + "Response headers:\n", + " 'Content-Type': 'application/json; charset=utf-8'\n", + " 'Server': 'IMDS/150.870.65.1854'\n", + " 'x-ms-request-id': '7683a8fc-6110-4d17-ba92-e7986c8af8e0'\n", + " 'Date': 'Wed, 22 Oct 2025 22:06:40 GMT'\n", + " 'Content-Length': '88'\n", + "INFO:azure.core.pipeline.policies.http_logging_policy:Request URL: 'http://169.254.169.254/metadata/identity/oauth2/token?api-version=REDACTED&resource=REDACTED'\n", + "Request method: 'GET'\n", + "Request headers:\n", + " 'Metadata': 'REDACTED'\n", + " 'User-Agent': 'azsdk-python-identity/1.25.0 Python/3.11.13 (Linux-6.8.0-1030-azure-x86_64-with-glibc2.41)'\n", + "No body was attached to the request\n", + "INFO:azure.core.pipeline.policies.http_logging_policy:Response status: 400\n", + "Response headers:\n", + " 'Content-Type': 'application/json; charset=utf-8'\n", + " 'Server': 'IMDS/150.870.65.1854'\n", + " 'x-ms-request-id': '31ec0b5d-182f-4981-8624-34083dd1c063'\n", + " 'Date': 'Wed, 22 Oct 2025 22:06:40 GMT'\n", + " 'Content-Length': '68'\n", + "INFO:azure.identity._credentials.chained:DefaultAzureCredential acquired a token from AzureDeveloperCliCredential\n", + "INFO:azure.identity._credentials.chained:DefaultAzureCredential acquired a token from AzureDeveloperCliCredential\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Content Understanding client initialized successfully!\n" + ] + } + ], + "source": [ + "import logging\n", + "import json\n", + "import os\n", + "import sys\n", + "import uuid\n", + "from pathlib import Path\n", + "from 
dotenv import find_dotenv, load_dotenv\n", + "from azure.identity import DefaultAzureCredential, get_bearer_token_provider\n", + "\n", + "load_dotenv(find_dotenv())\n", + "logging.basicConfig(level=logging.INFO)\n", + "\n", + "# For authentication, you can use either token-based authentication or a subscription key; only one method is required.\n", + "AZURE_AI_ENDPOINT = os.getenv(\"AZURE_AI_ENDPOINT\")\n", + "# IMPORTANT: Replace with your actual subscription key or set it in the \".env\" file if not using token authentication.\n", + "AZURE_AI_API_KEY = os.getenv(\"AZURE_AI_API_KEY\")\n", + "AZURE_AI_API_VERSION = os.getenv(\"AZURE_AI_API_VERSION\", \"2025-05-01-preview\")\n", + "\n", + "# Add the parent directory to the path to use shared modules\n", + "parent_dir = Path(Path.cwd()).parent\n", + "sys.path.append(str(parent_dir))\n", + "from python.content_understanding_client import AzureContentUnderstandingClient\n", + "\n", + "credential = DefaultAzureCredential()\n", + "token_provider = get_bearer_token_provider(credential, \"https://cognitiveservices.azure.com/.default\")\n", + "\n", + "client = AzureContentUnderstandingClient(\n", + " endpoint=AZURE_AI_ENDPOINT,\n", + " api_version=AZURE_AI_API_VERSION,\n", + " # IMPORTANT: Comment out token_provider if using subscription key\n", + " token_provider=token_provider,\n", + " # IMPORTANT: Uncomment this if using subscription key\n", + " # subscription_key=AZURE_AI_API_KEY,\n", + " x_ms_useragent=\"azure-ai-content-understanding-python/move_training_data\",\n", + ")\n", + "\n", + "print(\"✅ Content Understanding client initialized successfully!\")" + ] + }, + { + "cell_type": "markdown", + "id": "92e5f27f", + "metadata": {}, + "source": [ + "## Step 1: List Available Analyzers\n", + "\n", + "First, let's see what analyzers are available in your resource. We'll look for analyzers that have training data configured." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "fcbc218a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found 675 analyzer(s) in your resource\n", + "\n", + "Available analyzers:\n", + "1. ID: prebuilt-audioAnalyzer\n", + " Name: N/A\n", + "\n", + "2. ID: prebuilt-callCenter\n", + " Name: N/A\n", + "\n", + "3. ID: prebuilt-contract\n", + " Name: N/A\n", + "\n", + "4. ID: prebuilt-documentAnalyzer\n", + " Name: N/A\n", + "\n", + "5. ID: prebuilt-imageAnalyzer\n", + " Name: N/A\n", + "\n", + "6. ID: prebuilt-invoice\n", + " Name: N/A\n", + "\n", + "7. ID: prebuilt-videoAnalyzer\n", + " Name: N/A\n", + "\n", + "8. ID: 123\n", + " Name: N/A\n", + "\n", + "9. ID: Test-description\n", + " Name: N/A\n", + "\n", + "10. ID: Test\n", + " Name: N/A\n", + "\n", + "11. ID: abc\n", + " Name: N/A\n", + "\n", + "12. ID: audio-250808\n", + " Name: N/A\n", + "\n", + "13. ID: auto-highlight-analyzer-1753389013\n", + " Name: N/A\n", + "\n", + "14. ID: auto-highlight-analyzer-1753393121\n", + " Name: N/A\n", + "\n", + "15. ID: auto-highlight-analyzer-1753727044\n", + " Name: N/A\n", + "\n", + "16. ID: auto-highlight-analyzer-1753728638\n", + " Name: N/A\n", + "\n", + "17. ID: auto-highlight-analyzer-1753822646\n", + " Name: N/A\n", + "\n", + "18. ID: auto-highlight-analyzer-1753823934\n", + " Name: N/A\n", + "\n", + "19. ID: auto-highlight-analyzer-1753826664\n", + " Name: N/A\n", + "\n", + "20. ID: auto-highlight-analyzer-1753829625\n", + " Name: N/A\n", + "\n", + "21. ID: auto-highlight-analyzer-1754935354\n", + " Name: N/A\n", + "\n", + "22. ID: auto-labeling-model-1748319168608-457\n", + " Name: N/A\n", + "\n", + "23. ID: auto-labeling-model-1748343190922-522\n", + " Name: N/A\n", + "\n", + "24. ID: auto-labeling-model-1748343844913-193\n", + " Name: N/A\n", + "\n", + "25. ID: auto-labeling-model-1748364582299-194\n", + " Name: N/A\n", + "\n", + "26. 
ID: auto-labeling-model-1748364610998-174\n", + " Name: N/A\n", + "\n", + "27. ID: auto-labeling-model-1748364627905-392\n", + " Name: N/A\n", + "\n", + "28. ID: auto-labeling-model-1748364882995-331\n", + " Name: N/A\n", + "\n", + "29. ID: auto-labeling-model-1748365809345-194\n", + " Name: N/A\n", + "\n", + "30. ID: auto-labeling-model-1748365844597-722\n", + " Name: N/A\n", + "\n", + "31. ID: auto-labeling-model-1748369310664-291\n", + " Name: N/A\n", + "\n", + "32. ID: auto-labeling-model-1748382666104-108\n", + " Name: N/A\n", + "\n", + "33. ID: auto-labeling-model-1748398666237-678\n", + " Name: N/A\n", + "\n", + "34. ID: auto-labeling-model-1748406169100-153\n", + " Name: N/A\n", + "\n", + "35. ID: auto-labeling-model-1748487450682-652\n", + " Name: N/A\n", + "\n", + "36. ID: auto-labeling-model-1748490709500-887\n", + " Name: N/A\n", + "\n", + "37. ID: auto-labeling-model-1748524957609-245\n", + " Name: N/A\n", + "\n", + "38. ID: auto-labeling-model-1748525150770-437\n", + " Name: N/A\n", + "\n", + "39. ID: auto-labeling-model-1748527146405-802\n", + " Name: N/A\n", + "\n", + "40. ID: auto-labeling-model-1748532349641-24\n", + " Name: N/A\n", + "\n", + "41. ID: auto-labeling-model-1748652707721-341\n", + " Name: N/A\n", + "\n", + "42. ID: auto-labeling-model-1748652848103-155\n", + " Name: N/A\n", + "\n", + "43. ID: auto-labeling-model-1748839949920-863\n", + " Name: N/A\n", + "\n", + "44. ID: auto-labeling-model-1748845791989-716\n", + " Name: N/A\n", + "\n", + "45. ID: auto-labeling-model-1748845807869-415\n", + " Name: N/A\n", + "\n", + "46. ID: auto-labeling-model-1748907891703-517\n", + " Name: N/A\n", + "\n", + "47. ID: auto-labeling-model-1748908692967-569\n", + " Name: N/A\n", + "\n", + "48. ID: auto-labeling-model-1748914058095-616\n", + " Name: N/A\n", + "\n", + "49. ID: auto-labeling-model-1748936065478-291\n", + " Name: N/A\n", + "\n", + "50. ID: auto-labeling-model-1748936271674-552\n", + " Name: N/A\n", + "\n", + "51. 
ID: auto-labeling-model-1748936490686-646\n", + " Name: N/A\n", + "\n", + "52. ID: auto-labeling-model-1748937447139-653\n", + " Name: N/A\n", + "\n", + "53. ID: auto-labeling-model-1748940860399-529\n", + " Name: N/A\n", + "\n", + "54. ID: auto-labeling-model-1748941320548-161\n", + " Name: N/A\n", + "\n", + "55. ID: auto-labeling-model-1748941816737-4\n", + " Name: N/A\n", + "\n", + "56. ID: auto-labeling-model-1748942668260-584\n", + " Name: N/A\n", + "\n", + "57. ID: auto-labeling-model-1748942752946-240\n", + " Name: N/A\n", + "\n", + "58. ID: auto-labeling-model-1748943751138-585\n", + " Name: N/A\n", + "\n", + "59. ID: auto-labeling-model-1748943869439-730\n", + " Name: N/A\n", + "\n", + "60. ID: auto-labeling-model-1748944505181-366\n", + " Name: N/A\n", + "\n", + "61. ID: auto-labeling-model-1748945194482-115\n", + " Name: N/A\n", + "\n", + "62. ID: auto-labeling-model-1749003326198-992\n", + " Name: N/A\n", + "\n", + "63. ID: auto-labeling-model-1749023590022-874\n", + " Name: N/A\n", + "\n", + "64. ID: auto-labeling-model-1749023636121-927\n", + " Name: N/A\n", + "\n", + "65. ID: auto-labeling-model-1749023850993-339\n", + " Name: N/A\n", + "\n", + "66. ID: auto-labeling-model-1749023887009-843\n", + " Name: N/A\n", + "\n", + "67. ID: auto-labeling-model-1749023901480-881\n", + " Name: N/A\n", + "\n", + "68. ID: auto-labeling-model-1749023933378-529\n", + " Name: N/A\n", + "\n", + "69. ID: auto-labeling-model-1749024617342-607\n", + " Name: N/A\n", + "\n", + "70. ID: auto-labeling-model-1749024650401-862\n", + " Name: N/A\n", + "\n", + "71. ID: auto-labeling-model-1749095665011-257\n", + " Name: N/A\n", + "\n", + "72. ID: auto-labeling-model-1749096929213-707\n", + " Name: N/A\n", + "\n", + "73. ID: auto-labeling-model-1749104361550-221\n", + " Name: N/A\n", + "\n", + "74. ID: auto-labeling-model-1749104922387-882\n", + " Name: N/A\n", + "\n", + "75. ID: auto-labeling-model-1749105026574-367\n", + " Name: N/A\n", + "\n", + "76. 
ID: auto-labeling-model-1749251965833-403\n", + " Name: N/A\n", + "\n", + "77. ID: auto-labeling-model-1749254053334-357\n", + " Name: N/A\n", + "\n", + "78. ID: auto-labeling-model-1749311286700-369\n", + " Name: N/A\n", + "\n", + "79. ID: auto-labeling-model-1749509842310-370\n", + " Name: N/A\n", + "\n", + "80. ID: auto-labeling-model-1749520600099-409\n", + " Name: N/A\n", + "\n", + "81. ID: auto-labeling-model-1749522784982-438\n", + " Name: N/A\n", + "\n", + "82. ID: auto-labeling-model-1749535466854-401\n", + " Name: N/A\n", + "\n", + "83. ID: auto-labeling-model-1749581796990-277\n", + " Name: N/A\n", + "\n", + "84. ID: auto-labeling-model-1749581836897-138\n", + " Name: N/A\n", + "\n", + "85. ID: auto-labeling-model-1749584140873-572\n", + " Name: N/A\n", + "\n", + "86. ID: auto-labeling-model-1749585959231-24\n", + " Name: N/A\n", + "\n", + "87. ID: auto-labeling-model-1749604604536-674\n", + " Name: N/A\n", + "\n", + "88. ID: auto-labeling-model-1749620902726-984\n", + " Name: N/A\n", + "\n", + "89. ID: auto-labeling-model-1749626687259-809\n", + " Name: N/A\n", + "\n", + "90. ID: auto-labeling-model-1749627602312-979\n", + " Name: N/A\n", + "\n", + "91. ID: auto-labeling-model-1749630601186-689\n", + " Name: N/A\n", + "\n", + "92. ID: auto-labeling-model-1749631339251-319\n", + " Name: N/A\n", + "\n", + "93. ID: auto-labeling-model-1749631742974-733\n", + " Name: N/A\n", + "\n", + "94. ID: auto-labeling-model-1749631891328-309\n", + " Name: N/A\n", + "\n", + "95. ID: auto-labeling-model-1749696702275-545\n", + " Name: N/A\n", + "\n", + "96. ID: auto-labeling-model-1749758278394-240\n", + " Name: N/A\n", + "\n", + "97. ID: auto-labeling-model-1749758517784-660\n", + " Name: N/A\n", + "\n", + "98. ID: auto-labeling-model-1749758533104-929\n", + " Name: N/A\n", + "\n", + "99. ID: auto-labeling-model-1749758555087-116\n", + " Name: N/A\n", + "\n", + "100. ID: auto-labeling-model-1749759432793-891\n", + " Name: N/A\n", + "\n", + "101. 
ID: auto-labeling-model-1749768746704-802\n", + " Name: N/A\n", + "\n", + "102. ID: auto-labeling-model-1749775305589-256\n", + " Name: N/A\n", + "\n", + "103. ID: auto-labeling-model-1749802761164-406\n", + " Name: N/A\n", + "\n", + "104. ID: auto-labeling-model-1749956497322-594\n", + " Name: N/A\n", + "\n", + "105. ID: auto-labeling-model-1749960177654-514\n", + " Name: N/A\n", + "\n", + "106. ID: auto-labeling-model-1749961833034-154\n", + " Name: N/A\n", + "\n", + "107. ID: auto-labeling-model-1749962138214-21\n", + " Name: N/A\n", + "\n", + "108. ID: auto-labeling-model-1750045513862-445\n", + " Name: N/A\n", + "\n", + "109. ID: auto-labeling-model-1750108497453-922\n", + " Name: N/A\n", + "\n", + "110. ID: auto-labeling-model-1750123214932-968\n", + " Name: N/A\n", + "\n", + "111. ID: auto-labeling-model-1750128770286-412\n", + " Name: N/A\n", + "\n", + "112. ID: auto-labeling-model-1750128888980-243\n", + " Name: N/A\n", + "\n", + "113. ID: auto-labeling-model-1750141234245-231\n", + " Name: N/A\n", + "\n", + "114. ID: auto-labeling-model-1750145695285-480\n", + " Name: N/A\n", + "\n", + "115. ID: auto-labeling-model-1750211643719-379\n", + " Name: N/A\n", + "\n", + "116. ID: auto-labeling-model-1750233198991-694\n", + " Name: N/A\n", + "\n", + "117. ID: auto-labeling-model-1750241272780-2\n", + " Name: N/A\n", + "\n", + "118. ID: auto-labeling-model-1750279157596-35\n", + " Name: N/A\n", + "\n", + "119. ID: auto-labeling-model-1750291999953-91\n", + " Name: N/A\n", + "\n", + "120. ID: auto-labeling-model-1750292632586-625\n", + " Name: N/A\n", + "\n", + "121. ID: auto-labeling-model-1750312049582-59\n", + " Name: N/A\n", + "\n", + "122. ID: auto-labeling-model-1750312573420-578\n", + " Name: N/A\n", + "\n", + "123. ID: auto-labeling-model-1750376726735-970\n", + " Name: N/A\n", + "\n", + "124. ID: auto-labeling-model-1750377427038-364\n", + " Name: N/A\n", + "\n", + "125. ID: auto-labeling-model-1750385575232-897\n", + " Name: N/A\n", + "\n", + "126. 
ID: auto-labeling-model-1750403576185-741\n", + " Name: N/A\n", + "\n", + "127. ID: auto-labeling-model-1750404809435-451\n", + " Name: N/A\n", + "\n", + "128. ID: auto-labeling-model-1750405070052-89\n", + " Name: N/A\n", + "\n", + "129. ID: auto-labeling-model-1750405091355-763\n", + " Name: N/A\n", + "\n", + "130. ID: auto-labeling-model-1750417420016-430\n", + " Name: N/A\n", + "\n", + "131. ID: auto-labeling-model-1750659725597-788\n", + " Name: N/A\n", + "\n", + "132. ID: auto-labeling-model-1750659733517-772\n", + " Name: N/A\n", + "\n", + "133. ID: auto-labeling-model-1750659761722-251\n", + " Name: N/A\n", + "\n", + "134. ID: auto-labeling-model-1750659784566-101\n", + " Name: N/A\n", + "\n", + "135. ID: auto-labeling-model-1750659903607-108\n", + " Name: N/A\n", + "\n", + "136. ID: auto-labeling-model-1750659933637-141\n", + " Name: N/A\n", + "\n", + "137. ID: auto-labeling-model-1750659945217-945\n", + " Name: N/A\n", + "\n", + "138. ID: auto-labeling-model-1750660650963-739\n", + " Name: N/A\n", + "\n", + "139. ID: auto-labeling-model-1750660824597-923\n", + " Name: N/A\n", + "\n", + "140. ID: auto-labeling-model-1750663207559-512\n", + " Name: N/A\n", + "\n", + "141. ID: auto-labeling-model-1750663259510-796\n", + " Name: N/A\n", + "\n", + "142. ID: auto-labeling-model-1750663303432-581\n", + " Name: N/A\n", + "\n", + "143. ID: auto-labeling-model-1750663377213-340\n", + " Name: N/A\n", + "\n", + "144. ID: auto-labeling-model-1750663393108-597\n", + " Name: N/A\n", + "\n", + "145. ID: auto-labeling-model-1750664456347-683\n", + " Name: N/A\n", + "\n", + "146. ID: auto-labeling-model-1750664605893-618\n", + " Name: N/A\n", + "\n", + "147. ID: auto-labeling-model-1750665355708-8\n", + " Name: N/A\n", + "\n", + "148. ID: auto-labeling-model-1750673318125-535\n", + " Name: N/A\n", + "\n", + "149. ID: auto-labeling-model-1750673331433-642\n", + " Name: N/A\n", + "\n", + "150. ID: auto-labeling-model-1750709349430-630\n", + " Name: N/A\n", + "\n", + "151. 
ID: auto-labeling-model-1750719511542-531\n", + " Name: N/A\n", + "\n", + "152. ID: auto-labeling-model-1750744047556-446\n", + " Name: N/A\n", + "\n", + "153. ID: auto-labeling-model-1750755510472-120\n", + " Name: N/A\n", + "\n", + "154. ID: auto-labeling-model-1750784814399-27\n", + " Name: N/A\n", + "\n", + "155. ID: auto-labeling-model-1750788356545-200\n", + " Name: N/A\n", + "\n", + "156. ID: auto-labeling-model-1750789921864-730\n", + " Name: N/A\n", + "\n", + "157. ID: auto-labeling-model-1750836585070-913\n", + " Name: N/A\n", + "\n", + "158. ID: auto-labeling-model-1750842588854-962\n", + " Name: N/A\n", + "\n", + "159. ID: auto-labeling-model-1750842831795-314\n", + " Name: N/A\n", + "\n", + "160. ID: auto-labeling-model-1750842897183-394\n", + " Name: N/A\n", + "\n", + "161. ID: auto-labeling-model-1750842978258-136\n", + " Name: N/A\n", + "\n", + "162. ID: auto-labeling-model-1750843282949-512\n", + " Name: N/A\n", + "\n", + "163. ID: auto-labeling-model-1750843704909-216\n", + " Name: N/A\n", + "\n", + "164. ID: auto-labeling-model-1750843908445-174\n", + " Name: N/A\n", + "\n", + "165. ID: auto-labeling-model-1750844014408-330\n", + " Name: N/A\n", + "\n", + "166. ID: auto-labeling-model-1750844234138-988\n", + " Name: N/A\n", + "\n", + "167. ID: auto-labeling-model-1750844709672-320\n", + " Name: N/A\n", + "\n", + "168. ID: auto-labeling-model-1750845307517-940\n", + " Name: N/A\n", + "\n", + "169. ID: auto-labeling-model-1750846220484-837\n", + " Name: N/A\n", + "\n", + "170. ID: auto-labeling-model-1750846255005-395\n", + " Name: N/A\n", + "\n", + "171. ID: auto-labeling-model-1750847433984-311\n", + " Name: N/A\n", + "\n", + "172. ID: auto-labeling-model-1750853034834-460\n", + " Name: N/A\n", + "\n", + "173. ID: auto-labeling-model-1750919114419-408\n", + " Name: N/A\n", + "\n", + "174. ID: auto-labeling-model-1750920179010-279\n", + " Name: N/A\n", + "\n", + "175. ID: auto-labeling-model-1750920218343-518\n", + " Name: N/A\n", + "\n", + "176. 
ID: auto-labeling-model-1750920298701-557\n", + " Name: N/A\n", + "\n", + "177. ID: auto-labeling-model-1750920352617-62\n", + " Name: N/A\n", + "\n", + "178. ID: auto-labeling-model-1751052501474-178\n", + " Name: N/A\n", + "\n", + "179. ID: auto-labeling-model-1751069615217-264\n", + " Name: N/A\n", + "\n", + "180. ID: auto-labeling-model-1751270970103-549\n", + " Name: N/A\n", + "\n", + "181. ID: auto-labeling-model-1751272499140-268\n", + " Name: N/A\n", + "\n", + "182. ID: auto-labeling-model-1751272544250-613\n", + " Name: N/A\n", + "\n", + "183. ID: auto-labeling-model-1751273787498-265\n", + " Name: N/A\n", + "\n", + "184. ID: auto-labeling-model-1751273849331-220\n", + " Name: N/A\n", + "\n", + "185. ID: auto-labeling-model-1751273904647-201\n", + " Name: N/A\n", + "\n", + "186. ID: auto-labeling-model-1751273937246-448\n", + " Name: N/A\n", + "\n", + "187. ID: auto-labeling-model-1751273983364-401\n", + " Name: N/A\n", + "\n", + "188. ID: auto-labeling-model-1751336918679-904\n", + " Name: N/A\n", + "\n", + "189. ID: auto-labeling-model-1751349360361-963\n", + " Name: N/A\n", + "\n", + "190. ID: auto-labeling-model-1751427888199-459\n", + " Name: N/A\n", + "\n", + "191. ID: auto-labeling-model-1751427891721-940\n", + " Name: N/A\n", + "\n", + "192. ID: auto-labeling-model-1751441608096-967\n", + " Name: N/A\n", + "\n", + "193. ID: auto-labeling-model-1751441662962-402\n", + " Name: N/A\n", + "\n", + "194. ID: auto-labeling-model-1751444577624-169\n", + " Name: N/A\n", + "\n", + "195. ID: auto-labeling-model-1751446425406-566\n", + " Name: N/A\n", + "\n", + "196. ID: auto-labeling-model-1751446744627-904\n", + " Name: N/A\n", + "\n", + "197. ID: auto-labeling-model-1751447069922-153\n", + " Name: N/A\n", + "\n", + "198. ID: auto-labeling-model-1751447126141-210\n", + " Name: N/A\n", + "\n", + "199. ID: auto-labeling-model-1751450223362-323\n", + " Name: N/A\n", + "\n", + "200. ID: auto-labeling-model-1751619901375-912\n", + " Name: N/A\n", + "\n", + "201. 
ID: auto-labeling-model-1751621939880-824\n", + " Name: N/A\n", + "\n", + "202. ID: auto-labeling-model-1751622003371-912\n", + " Name: N/A\n", + "\n", + "203. ID: auto-labeling-model-1751622246359-22\n", + " Name: N/A\n", + "\n", + "204. ID: auto-labeling-model-1751622337847-185\n", + " Name: N/A\n", + "\n", + "205. ID: auto-labeling-model-1751630796222-228\n", + " Name: N/A\n", + "\n", + "206. ID: auto-labeling-model-1751630815948-351\n", + " Name: N/A\n", + "\n", + "207. ID: auto-labeling-model-1751998528557-924\n", + " Name: N/A\n", + "\n", + "208. ID: auto-labeling-model-1752025809239-846\n", + " Name: N/A\n", + "\n", + "209. ID: auto-labeling-model-1752034702114-180\n", + " Name: N/A\n", + "\n", + "210. ID: auto-labeling-model-1752098586840-747\n", + " Name: N/A\n", + "\n", + "211. ID: auto-labeling-model-1752180782600-490\n", + " Name: N/A\n", + "\n", + "212. ID: auto-labeling-model-1752271117113-156\n", + " Name: N/A\n", + "\n", + "213. ID: auto-labeling-model-1752523653762-595\n", + " Name: N/A\n", + "\n", + "214. ID: auto-labeling-model-1752600290738-67\n", + " Name: N/A\n", + "\n", + "215. ID: auto-labeling-model-1752625416686-81\n", + " Name: N/A\n", + "\n", + "216. ID: auto-labeling-model-1752625871649-767\n", + " Name: N/A\n", + "\n", + "217. ID: auto-labeling-model-1752693120005-346\n", + " Name: N/A\n", + "\n", + "218. ID: auto-labeling-model-1752697569506-376\n", + " Name: N/A\n", + "\n", + "219. ID: auto-labeling-model-1752697610504-950\n", + " Name: N/A\n", + "\n", + "220. ID: auto-labeling-model-1752700740555-590\n", + " Name: N/A\n", + "\n", + "221. ID: auto-labeling-model-1752708687132-939\n", + " Name: N/A\n", + "\n", + "222. ID: auto-labeling-model-1752741732428-578\n", + " Name: N/A\n", + "\n", + "223. ID: auto-labeling-model-1752780032715-66\n", + " Name: N/A\n", + "\n", + "224. ID: auto-labeling-model-1752780325289-573\n", + " Name: N/A\n", + "\n", + "225. ID: auto-labeling-model-1752795955082-603\n", + " Name: N/A\n", + "\n", + "226. 
ID: auto-labeling-model-1752796753555-462\n", + " Name: N/A\n", + "\n", + "227. ID: auto-labeling-model-1752797239305-251\n", + " Name: N/A\n", + "\n", + "228. ID: auto-labeling-model-1752800932971-876\n", + " Name: N/A\n", + "\n", + "229. ID: auto-labeling-model-1752803086727-971\n", + " Name: N/A\n", + "\n", + "230. ID: auto-labeling-model-1752803985621-193\n", + " Name: N/A\n", + "\n", + "231. ID: auto-labeling-model-1752806777300-862\n", + " Name: N/A\n", + "\n", + "232. ID: auto-labeling-model-1752884829621-441\n", + " Name: N/A\n", + "\n", + "233. ID: auto-labeling-model-1753083025779-103\n", + " Name: N/A\n", + "\n", + "234. ID: auto-labeling-model-1753083077531-666\n", + " Name: N/A\n", + "\n", + "235. ID: auto-labeling-model-1753083850816-29\n", + " Name: N/A\n", + "\n", + "236. ID: auto-labeling-model-1753083864041-58\n", + " Name: N/A\n", + "\n", + "237. ID: auto-labeling-model-1753086883459-951\n", + " Name: N/A\n", + "\n", + "238. ID: auto-labeling-model-1753089079279-222\n", + " Name: N/A\n", + "\n", + "239. ID: auto-labeling-model-1753150531096-410\n", + " Name: N/A\n", + "\n", + "240. ID: auto-labeling-model-1753151865515-394\n", + " Name: N/A\n", + "\n", + "241. ID: auto-labeling-model-1753168395318-507\n", + " Name: N/A\n", + "\n", + "242. ID: auto-labeling-model-1753169409334-912\n", + " Name: N/A\n", + "\n", + "243. ID: auto-labeling-model-1753173597967-303\n", + " Name: N/A\n", + "\n", + "244. ID: auto-labeling-model-1753177537439-711\n", + " Name: N/A\n", + "\n", + "245. ID: auto-labeling-model-1753205662320-583\n", + " Name: N/A\n", + "\n", + "246. ID: auto-labeling-model-1753207022483-913\n", + " Name: N/A\n", + "\n", + "247. ID: auto-labeling-model-1753207579262-276\n", + " Name: N/A\n", + "\n", + "248. ID: auto-labeling-model-1753208672240-981\n", + " Name: N/A\n", + "\n", + "249. ID: auto-labeling-model-1753209156822-298\n", + " Name: N/A\n", + "\n", + "250. ID: auto-labeling-model-1753209981617-818\n", + " Name: N/A\n", + "\n", + "251. 
ID: auto-labeling-model-1753236316137-300\n", + " Name: N/A\n", + "\n", + "252. ID: auto-labeling-model-1753237512820-249\n", + " Name: N/A\n", + "\n", + "253. ID: auto-labeling-model-1753250369127-625\n", + " Name: N/A\n", + "\n", + "254. ID: auto-labeling-model-1753255567341-610\n", + " Name: N/A\n", + "\n", + "255. ID: auto-labeling-model-1753259092944-226\n", + " Name: N/A\n", + "\n", + "256. ID: auto-labeling-model-1753287197755-783\n", + " Name: N/A\n", + "\n", + "257. ID: auto-labeling-model-1753321650913-823\n", + " Name: N/A\n", + "\n", + "258. ID: auto-labeling-model-1753325891996-80\n", + " Name: N/A\n", + "\n", + "259. ID: auto-labeling-model-1753334968241-706\n", + " Name: N/A\n", + "\n", + "260. ID: auto-labeling-model-1753335132165-512\n", + " Name: N/A\n", + "\n", + "261. ID: auto-labeling-model-1753335555914-390\n", + " Name: N/A\n", + "\n", + "262. ID: auto-labeling-model-1753335697157-843\n", + " Name: N/A\n", + "\n", + "263. ID: auto-labeling-model-1753340903345-139\n", + " Name: N/A\n", + "\n", + "264. ID: auto-labeling-model-1753344102782-140\n", + " Name: N/A\n", + "\n", + "265. ID: auto-labeling-model-1753344491064-431\n", + " Name: N/A\n", + "\n", + "266. ID: auto-labeling-model-1753344947435-154\n", + " Name: N/A\n", + "\n", + "267. ID: auto-labeling-model-1753346772842-804\n", + " Name: N/A\n", + "\n", + "268. ID: auto-labeling-model-1753420107017-420\n", + " Name: N/A\n", + "\n", + "269. ID: auto-labeling-model-1753420466410-256\n", + " Name: N/A\n", + "\n", + "270. ID: auto-labeling-model-1753423049391-214\n", + " Name: N/A\n", + "\n", + "271. ID: auto-labeling-model-1753430316648-188\n", + " Name: N/A\n", + "\n", + "272. ID: auto-labeling-model-1753431705642-795\n", + " Name: N/A\n", + "\n", + "273. ID: auto-labeling-model-1753432653890-622\n", + " Name: N/A\n", + "\n", + "274. ID: auto-labeling-model-1753433164146-455\n", + " Name: N/A\n", + "\n", + "275. ID: auto-labeling-model-1753434806213-833\n", + " Name: N/A\n", + "\n", + "276. 
ID: auto-labeling-model-1753670824352-493\n", + " Name: N/A\n", + "\n", + "277. ID: auto-labeling-model-1753680640396-566\n", + " Name: N/A\n", + "\n", + "278. ID: auto-labeling-model-1753681888155-667\n", + " Name: N/A\n", + "\n", + "279. ID: auto-labeling-model-1753682254644-331\n", + " Name: N/A\n", + "\n", + "280. ID: auto-labeling-model-1753683583061-323\n", + " Name: N/A\n", + "\n", + "281. ID: auto-labeling-model-1753684547670-475\n", + " Name: N/A\n", + "\n", + "282. ID: auto-labeling-model-1753684784064-358\n", + " Name: N/A\n", + "\n", + "283. ID: auto-labeling-model-1753686206798-898\n", + " Name: N/A\n", + "\n", + "284. ID: auto-labeling-model-1753686800552-354\n", + " Name: N/A\n", + "\n", + "285. ID: auto-labeling-model-1753691313133-192\n", + " Name: N/A\n", + "\n", + "286. ID: auto-labeling-model-1753755468942-82\n", + " Name: N/A\n", + "\n", + "287. ID: auto-labeling-model-1753765727024-37\n", + " Name: N/A\n", + "\n", + "288. ID: auto-labeling-model-1753766046014-152\n", + " Name: N/A\n", + "\n", + "289. ID: auto-labeling-model-1753767335342-370\n", + " Name: N/A\n", + "\n", + "290. ID: auto-labeling-model-1753767338325-621\n", + " Name: N/A\n", + "\n", + "291. ID: auto-labeling-model-1753773699582-540\n", + " Name: N/A\n", + "\n", + "292. ID: auto-labeling-model-1753774470271-985\n", + " Name: N/A\n", + "\n", + "293. ID: auto-labeling-model-1753775949221-151\n", + " Name: N/A\n", + "\n", + "294. ID: auto-labeling-model-1753777245479-372\n", + " Name: N/A\n", + "\n", + "295. ID: auto-labeling-model-1753777925896-803\n", + " Name: N/A\n", + "\n", + "296. ID: auto-labeling-model-1753780557881-855\n", + " Name: N/A\n", + "\n", + "297. ID: auto-labeling-model-1753841121952-979\n", + " Name: N/A\n", + "\n", + "298. ID: auto-labeling-model-1753841981886-902\n", + " Name: N/A\n", + "\n", + "299. ID: auto-labeling-model-1753843376936-643\n", + " Name: N/A\n", + "\n", + "300. ID: auto-labeling-model-1753844211334-641\n", + " Name: N/A\n", + "\n", + "301. 
ID: auto-labeling-model-1753853033274-214\n", + " Name: N/A\n", + "\n", + "302. ID: auto-labeling-model-1753855251911-309\n", + " Name: N/A\n", + "\n", + "303. ID: auto-labeling-model-1753855551724-866\n", + " Name: N/A\n", + "\n", + "304. ID: auto-labeling-model-1753857116602-791\n", + " Name: N/A\n", + "\n", + "305. ID: auto-labeling-model-1753857268920-608\n", + " Name: N/A\n", + "\n", + "306. ID: auto-labeling-model-1753857820246-647\n", + " Name: N/A\n", + "\n", + "307. ID: auto-labeling-model-1753857865813-554\n", + " Name: N/A\n", + "\n", + "308. ID: auto-labeling-model-1753858369469-249\n", + " Name: N/A\n", + "\n", + "309. ID: auto-labeling-model-1753859412803-605\n", + " Name: N/A\n", + "\n", + "310. ID: auto-labeling-model-1753860904131-872\n", + " Name: N/A\n", + "\n", + "311. ID: auto-labeling-model-1753861167980-954\n", + " Name: N/A\n", + "\n", + "312. ID: auto-labeling-model-1753861799127-664\n", + " Name: N/A\n", + "\n", + "313. ID: auto-labeling-model-1753862553873-905\n", + " Name: N/A\n", + "\n", + "314. ID: auto-labeling-model-1753862814119-255\n", + " Name: N/A\n", + "\n", + "315. ID: auto-labeling-model-1753863784180-612\n", + " Name: N/A\n", + "\n", + "316. ID: auto-labeling-model-1753863994987-510\n", + " Name: N/A\n", + "\n", + "317. ID: auto-labeling-model-1753864084656-697\n", + " Name: N/A\n", + "\n", + "318. ID: auto-labeling-model-1753865255601-417\n", + " Name: N/A\n", + "\n", + "319. ID: auto-labeling-model-1753888993477-912\n", + " Name: N/A\n", + "\n", + "320. ID: auto-labeling-model-1753936473158-979\n", + " Name: N/A\n", + "\n", + "321. ID: auto-labeling-model-1753939417926-903\n", + " Name: N/A\n", + "\n", + "322. ID: auto-labeling-model-1753941090969-886\n", + " Name: N/A\n", + "\n", + "323. ID: auto-labeling-model-1753941295803-93\n", + " Name: N/A\n", + "\n", + "324. ID: auto-labeling-model-1753943808756-255\n", + " Name: N/A\n", + "\n", + "325. ID: auto-labeling-model-1754012684592-887\n", + " Name: N/A\n", + "\n", + "326. 
ID: auto-labeling-model-1754015881192-443\n", + " Name: N/A\n", + "\n", + "327. ID: auto-labeling-model-1754016406351-97\n", + " Name: N/A\n", + "\n", + "328. ID: auto-labeling-model-1754016977082-211\n", + " Name: N/A\n", + "\n", + "329. ID: auto-labeling-model-1754017707931-428\n", + " Name: N/A\n", + "\n", + "330. ID: auto-labeling-model-1754024495010-992\n", + " Name: N/A\n", + "\n", + "331. ID: auto-labeling-model-1754025560953-192\n", + " Name: N/A\n", + "\n", + "332. ID: auto-labeling-model-1754026435557-853\n", + " Name: N/A\n", + "\n", + "333. ID: auto-labeling-model-1754037940196-869\n", + " Name: N/A\n", + "\n", + "334. ID: auto-labeling-model-1754082032616-607\n", + " Name: N/A\n", + "\n", + "335. ID: auto-labeling-model-1754082215077-482\n", + " Name: N/A\n", + "\n", + "336. ID: auto-labeling-model-1754082332437-629\n", + " Name: N/A\n", + "\n", + "337. ID: auto-labeling-model-1754082479343-224\n", + " Name: N/A\n", + "\n", + "338. ID: auto-labeling-model-1754082536526-914\n", + " Name: N/A\n", + "\n", + "339. ID: auto-labeling-model-1754082630700-302\n", + " Name: N/A\n", + "\n", + "340. ID: auto-labeling-model-1754082725263-83\n", + " Name: N/A\n", + "\n", + "341. ID: auto-labeling-model-1754082811382-584\n", + " Name: N/A\n", + "\n", + "342. ID: auto-labeling-model-1754082998761-352\n", + " Name: N/A\n", + "\n", + "343. ID: auto-labeling-model-1754083046825-203\n", + " Name: N/A\n", + "\n", + "344. ID: auto-labeling-model-1754083150278-445\n", + " Name: N/A\n", + "\n", + "345. ID: auto-labeling-model-1754083462284-222\n", + " Name: N/A\n", + "\n", + "346. ID: auto-labeling-model-1754083621516-367\n", + " Name: N/A\n", + "\n", + "347. ID: auto-labeling-model-1754083719163-272\n", + " Name: N/A\n", + "\n", + "348. ID: auto-labeling-model-1754083866374-41\n", + " Name: N/A\n", + "\n", + "349. ID: auto-labeling-model-1754084032708-231\n", + " Name: N/A\n", + "\n", + "350. ID: auto-labeling-model-1754084406835-168\n", + " Name: N/A\n", + "\n", + "351. 
ID: auto-labeling-model-1754084472348-188\n", + " Name: N/A\n", + "\n", + "352. ID: auto-labeling-model-1754084575001-916\n", + " Name: N/A\n", + "\n", + "353. ID: auto-labeling-model-1754084884148-481\n", + " Name: N/A\n", + "\n", + "354. ID: auto-labeling-model-1754088680537-743\n", + " Name: N/A\n", + "\n", + "355. ID: auto-labeling-model-1754277589373-867\n", + " Name: N/A\n", + "\n", + "356. ID: auto-labeling-model-1754327062412-76\n", + " Name: N/A\n", + "\n", + "357. ID: auto-labeling-model-1754361872613-844\n", + " Name: N/A\n", + "\n", + "358. ID: auto-labeling-model-1754442934624-187\n", + " Name: N/A\n", + "\n", + "359. ID: auto-labeling-model-1754443219339-17\n", + " Name: N/A\n", + "\n", + "360. ID: auto-labeling-model-1754448125079-528\n", + " Name: N/A\n", + "\n", + "361. ID: auto-labeling-model-1754448200938-6\n", + " Name: N/A\n", + "\n", + "362. ID: auto-labeling-model-1754448830534-215\n", + " Name: N/A\n", + "\n", + "363. ID: auto-labeling-model-1754448901751-597\n", + " Name: N/A\n", + "\n", + "364. ID: auto-labeling-model-1754449038080-472\n", + " Name: N/A\n", + "\n", + "365. ID: auto-labeling-model-1754449135369-901\n", + " Name: N/A\n", + "\n", + "366. ID: auto-labeling-model-1754449150398-162\n", + " Name: N/A\n", + "\n", + "367. ID: auto-labeling-model-1754449206123-981\n", + " Name: N/A\n", + "\n", + "368. ID: auto-labeling-model-1754449280061-594\n", + " Name: N/A\n", + "\n", + "369. ID: auto-labeling-model-1754449347580-776\n", + " Name: N/A\n", + "\n", + "370. ID: auto-labeling-model-1754449538829-202\n", + " Name: N/A\n", + "\n", + "371. ID: auto-labeling-model-1754449608449-502\n", + " Name: N/A\n", + "\n", + "372. ID: auto-labeling-model-1754449678933-461\n", + " Name: N/A\n", + "\n", + "373. ID: auto-labeling-model-1754449747782-122\n", + " Name: N/A\n", + "\n", + "374. ID: auto-labeling-model-1754449819030-776\n", + " Name: N/A\n", + "\n", + "375. ID: auto-labeling-model-1754454485024-346\n", + " Name: N/A\n", + "\n", + "376. 
ID: auto-labeling-model-1754456633663-795\n", + " Name: N/A\n", + "\n", + "377. ID: auto-labeling-model-1754457369864-749\n", + " Name: N/A\n", + "\n", + "378. ID: auto-labeling-model-1754457591929-484\n", + " Name: N/A\n", + "\n", + "379. ID: auto-labeling-model-1754460230719-575\n", + " Name: N/A\n", + "\n", + "380. ID: auto-labeling-model-1754460479500-36\n", + " Name: N/A\n", + "\n", + "381. ID: auto-labeling-model-1754460640349-364\n", + " Name: N/A\n", + "\n", + "382. ID: auto-labeling-model-1754669409054-428\n", + " Name: N/A\n", + "\n", + "383. ID: auto-labeling-model-1754951212582-203\n", + " Name: N/A\n", + "\n", + "384. ID: auto-labeling-model-1754965260794-576\n", + " Name: N/A\n", + "\n", + "385. ID: auto-labeling-model-1754965331102-485\n", + " Name: N/A\n", + "\n", + "386. ID: auto-labeling-model-1754965445643-161\n", + " Name: N/A\n", + "\n", + "387. ID: auto-labeling-model-1754965630031-820\n", + " Name: N/A\n", + "\n", + "388. ID: auto-labeling-model-1754965704606-779\n", + " Name: N/A\n", + "\n", + "389. ID: auto-labeling-model-1754965767126-499\n", + " Name: N/A\n", + "\n", + "390. ID: auto-labeling-model-1754965926600-215\n", + " Name: N/A\n", + "\n", + "391. ID: auto-labeling-model-1754965996281-810\n", + " Name: N/A\n", + "\n", + "392. ID: auto-labeling-model-1754966073913-92\n", + " Name: N/A\n", + "\n", + "393. ID: auto-labeling-model-1754966208584-396\n", + " Name: N/A\n", + "\n", + "394. ID: auto-labeling-model-1754966287090-692\n", + " Name: N/A\n", + "\n", + "395. ID: auto-labeling-model-1754966553579-724\n", + " Name: N/A\n", + "\n", + "396. ID: auto-labeling-model-1754966634261-409\n", + " Name: N/A\n", + "\n", + "397. ID: auto-labeling-model-1754966703678-7\n", + " Name: N/A\n", + "\n", + "398. ID: auto-labeling-model-1754966778721-225\n", + " Name: N/A\n", + "\n", + "399. ID: auto-labeling-model-1754966848977-806\n", + " Name: N/A\n", + "\n", + "400. ID: auto-labeling-model-1754966934481-980\n", + " Name: N/A\n", + "\n", + "401. 
ID: auto-labeling-model-1754967006745-602\n", + " Name: N/A\n", + "\n", + "402. ID: auto-labeling-model-1754967080546-450\n", + " Name: N/A\n", + "\n", + "403. ID: auto-labeling-model-1754967570056-479\n", + " Name: N/A\n", + "\n", + "404. ID: auto-labeling-model-1754967665781-18\n", + " Name: N/A\n", + "\n", + "405. ID: auto-labeling-model-1754967737902-258\n", + " Name: N/A\n", + "\n", + "406. ID: auto-labeling-model-1754967809639-969\n", + " Name: N/A\n", + "\n", + "407. ID: auto-labeling-model-1754967879833-46\n", + " Name: N/A\n", + "\n", + "408. ID: auto-labeling-model-1754967953160-263\n", + " Name: N/A\n", + "\n", + "409. ID: auto-labeling-model-1754968036672-249\n", + " Name: N/A\n", + "\n", + "410. ID: auto-labeling-model-1754968110963-400\n", + " Name: N/A\n", + "\n", + "411. ID: auto-labeling-model-1754968179908-761\n", + " Name: N/A\n", + "\n", + "412. ID: auto-labeling-model-1754974913641-913\n", + " Name: N/A\n", + "\n", + "413. ID: auto-labeling-model-1754975127019-903\n", + " Name: N/A\n", + "\n", + "414. ID: auto-labeling-model-1754975368613-717\n", + " Name: N/A\n", + "\n", + "415. ID: auto-labeling-model-1754975432901-90\n", + " Name: N/A\n", + "\n", + "416. ID: auto-labeling-model-1754975454687-707\n", + " Name: N/A\n", + "\n", + "417. ID: auto-labeling-model-1754975527897-708\n", + " Name: N/A\n", + "\n", + "418. ID: auto-labeling-model-1754975600064-524\n", + " Name: N/A\n", + "\n", + "419. ID: auto-labeling-model-1754975711179-28\n", + " Name: N/A\n", + "\n", + "420. ID: auto-labeling-model-1754975967653-203\n", + " Name: N/A\n", + "\n", + "421. ID: auto-labeling-model-1754976038813-381\n", + " Name: N/A\n", + "\n", + "422. ID: auto-labeling-model-1754976117940-973\n", + " Name: N/A\n", + "\n", + "423. ID: auto-labeling-model-1754976193933-189\n", + " Name: N/A\n", + "\n", + "424. ID: auto-labeling-model-1754976293724-520\n", + " Name: N/A\n", + "\n", + "425. ID: auto-labeling-model-1754976368518-509\n", + " Name: N/A\n", + "\n", + "426. 
ID: auto-labeling-model-1754976437096-539\n", + " Name: N/A\n", + "\n", + "427. ID: auto-labeling-model-1754976513472-952\n", + " Name: N/A\n", + "\n", + "428. ID: auto-labeling-model-1754976754715-501\n", + " Name: N/A\n", + "\n", + "429. ID: auto-labeling-model-1754976904752-710\n", + " Name: N/A\n", + "\n", + "430. ID: auto-labeling-model-1754976976653-350\n", + " Name: N/A\n", + "\n", + "431. ID: auto-labeling-model-1754977052535-217\n", + " Name: N/A\n", + "\n", + "432. ID: auto-labeling-model-1754977121829-706\n", + " Name: N/A\n", + "\n", + "433. ID: auto-labeling-model-1754977217214-291\n", + " Name: N/A\n", + "\n", + "434. ID: auto-labeling-model-1754977287574-575\n", + " Name: N/A\n", + "\n", + "435. ID: auto-labeling-model-1754977360553-264\n", + " Name: N/A\n", + "\n", + "436. ID: auto-labeling-model-1754977435968-198\n", + " Name: N/A\n", + "\n", + "437. ID: auto-labeling-model-1754977508312-429\n", + " Name: N/A\n", + "\n", + "438. ID: auto-labeling-model-1754977588026-221\n", + " Name: N/A\n", + "\n", + "439. ID: auto-labeling-model-1754977663056-797\n", + " Name: N/A\n", + "\n", + "440. ID: auto-labeling-model-1754978589858-924\n", + " Name: N/A\n", + "\n", + "441. ID: auto-labeling-model-1754978799780-511\n", + " Name: N/A\n", + "\n", + "442. ID: auto-labeling-model-1754980148754-523\n", + " Name: N/A\n", + "\n", + "443. ID: auto-labeling-model-1754980966501-518\n", + " Name: N/A\n", + "\n", + "444. ID: auto-labeling-model-1754981828125-533\n", + " Name: N/A\n", + "\n", + "445. ID: auto-labeling-model-1754983426916-774\n", + " Name: N/A\n", + "\n", + "446. ID: auto-labeling-model-1754984348089-313\n", + " Name: N/A\n", + "\n", + "447. ID: auto-labeling-model-1754984423463-874\n", + " Name: N/A\n", + "\n", + "448. ID: auto-labeling-model-1754984499501-967\n", + " Name: N/A\n", + "\n", + "449. ID: auto-labeling-model-1754984577453-603\n", + " Name: N/A\n", + "\n", + "450. ID: auto-labeling-model-1754984673348-39\n", + " Name: N/A\n", + "\n", + "451. 
ID: auto-labeling-model-1754984745908-988\n", + " Name: N/A\n", + "\n", + "452. ID: auto-labeling-model-1754984844230-121\n", + " Name: N/A\n", + "\n", + "453. ID: auto-labeling-model-1754985031421-137\n", + " Name: N/A\n", + "\n", + "454. ID: auto-labeling-model-1754985052679-764\n", + " Name: N/A\n", + "\n", + "455. ID: auto-labeling-model-1754985230207-884\n", + " Name: N/A\n", + "\n", + "456. ID: auto-labeling-model-1754993665797-458\n", + " Name: N/A\n", + "\n", + "457. ID: auto-labeling-model-1754993775398-308\n", + " Name: N/A\n", + "\n", + "458. ID: auto-labeling-model-1755021430602-389\n", + " Name: N/A\n", + "\n", + "459. ID: auto-labeling-model-1755021530633-576\n", + " Name: N/A\n", + "\n", + "460. ID: auto-labeling-model-1755034509086-812\n", + " Name: N/A\n", + "\n", + "461. ID: auto-labeling-model-1755036680421-274\n", + " Name: N/A\n", + "\n", + "462. ID: auto-labeling-model-1755036840212-13\n", + " Name: N/A\n", + "\n", + "463. ID: auto-labeling-model-1755037123033-737\n", + " Name: N/A\n", + "\n", + "464. ID: auto-labeling-model-1755041702234-29\n", + " Name: N/A\n", + "\n", + "465. ID: auto-labeling-model-1755041716845-12\n", + " Name: N/A\n", + "\n", + "466. ID: auto-labeling-model-1755043090900-677\n", + " Name: N/A\n", + "\n", + "467. ID: auto-labeling-model-1755044191218-796\n", + " Name: N/A\n", + "\n", + "468. ID: auto-labeling-model-1755044423164-353\n", + " Name: N/A\n", + "\n", + "469. ID: auto-labeling-model-1755048701795-244\n", + " Name: N/A\n", + "\n", + "470. ID: auto-labeling-model-1755048719130-947\n", + " Name: N/A\n", + "\n", + "471. ID: auto-labeling-model-1755048825616-336\n", + " Name: N/A\n", + "\n", + "472. ID: auto-labeling-model-1755048863902-319\n", + " Name: N/A\n", + "\n", + "473. ID: auto-labeling-model-1755048975788-30\n", + " Name: N/A\n", + "\n", + "474. ID: auto-labeling-model-1755049161847-499\n", + " Name: N/A\n", + "\n", + "475. ID: auto-labeling-model-1755061734445-540\n", + " Name: N/A\n", + "\n", + "476. 
ID: auto-labeling-model-1755061987015-686\n", + " Name: N/A\n", + "\n", + "477. ID: auto-labeling-model-1755062318015-752\n", + " Name: N/A\n", + "\n", + "478. ID: auto-labeling-model-1755062966345-99\n", + " Name: N/A\n", + "\n", + "479. ID: auto-labeling-model-1755063315485-717\n", + " Name: N/A\n", + "\n", + "480. ID: auto-labeling-model-1755063386013-936\n", + " Name: N/A\n", + "\n", + "481. ID: auto-labeling-model-1755069455912-277\n", + " Name: N/A\n", + "\n", + "482. ID: auto-labeling-model-1755069553935-338\n", + " Name: N/A\n", + "\n", + "483. ID: auto-labeling-model-1755069702068-412\n", + " Name: N/A\n", + "\n", + "484. ID: auto-labeling-model-1755069842876-922\n", + " Name: N/A\n", + "\n", + "485. ID: auto-labeling-model-1755072279253-390\n", + " Name: N/A\n", + "\n", + "486. ID: auto-labeling-model-1755076709324-342\n", + " Name: N/A\n", + "\n", + "487. ID: auto-labeling-model-1755077617558-667\n", + " Name: N/A\n", + "\n", + "488. ID: auto-labeling-model-1755077873604-810\n", + " Name: N/A\n", + "\n", + "489. ID: auto-labeling-model-1755078021426-256\n", + " Name: N/A\n", + "\n", + "490. ID: auto-labeling-model-1755134767049-985\n", + " Name: N/A\n", + "\n", + "491. ID: auto-labeling-model-1755135457748-675\n", + " Name: N/A\n", + "\n", + "492. ID: auto-labeling-model-1755220299075-866\n", + " Name: N/A\n", + "\n", + "493. ID: auto-labeling-model-1755221919898-254\n", + " Name: N/A\n", + "\n", + "494. ID: auto-labeling-model-1755222009716-189\n", + " Name: N/A\n", + "\n", + "495. ID: auto-labeling-model-1755222110837-250\n", + " Name: N/A\n", + "\n", + "496. ID: auto-labeling-model-1755222196939-944\n", + " Name: N/A\n", + "\n", + "497. ID: auto-labeling-model-1755222580985-811\n", + " Name: N/A\n", + "\n", + "498. ID: auto-labeling-model-1755224344739-857\n", + " Name: N/A\n", + "\n", + "499. ID: auto-labeling-model-1755224418333-237\n", + " Name: N/A\n", + "\n", + "500. ID: auto-labeling-model-1755224501846-126\n", + " Name: N/A\n", + "\n", + "501. 
ID: auto-labeling-model-1755224573788-830\n", + " Name: N/A\n", + "\n", + "502. ID: auto-labeling-model-1755274111236-815\n", + " Name: N/A\n", + "\n", + "503. ID: auto-labeling-model-1755546385161-718\n", + " Name: N/A\n", + "\n", + "504. ID: auto-labeling-model-1755564859753-49\n", + " Name: N/A\n", + "\n", + "505. ID: auto-labeling-model-1755571891436-24\n", + " Name: N/A\n", + "\n", + "506. ID: auto-labeling-model-1755575417648-956\n", + " Name: N/A\n", + "\n", + "507. ID: auto-labeling-model-1755589868572-105\n", + " Name: N/A\n", + "\n", + "508. ID: auto-labeling-model-1755623887267-687\n", + " Name: N/A\n", + "\n", + "509. ID: auto-labeling-model-1755657602248-443\n", + " Name: N/A\n", + "\n", + "510. ID: auto-labeling-model-1755671136055-108\n", + " Name: N/A\n", + "\n", + "511. ID: auto-labeling-model-1755673245801-744\n", + " Name: N/A\n", + "\n", + "512. ID: auto-labeling-model-1755675180889-142\n", + " Name: N/A\n", + "\n", + "513. ID: auto-labeling-model-1755678446620-988\n", + " Name: N/A\n", + "\n", + "514. ID: auto-labeling-model-1755738759590-405\n", + " Name: N/A\n", + "\n", + "515. ID: auto-labeling-model-1755741941138-610\n", + " Name: N/A\n", + "\n", + "516. ID: auto-labeling-model-1755745805348-731\n", + " Name: N/A\n", + "\n", + "517. ID: auto-labeling-model-1755753976159-223\n", + " Name: N/A\n", + "\n", + "518. ID: auto-labeling-model-1755756092896-628\n", + " Name: N/A\n", + "\n", + "519. ID: auto-labeling-model-1755761289894-657\n", + " Name: N/A\n", + "\n", + "520. ID: auto-labeling-model-1755824923780-82\n", + " Name: N/A\n", + "\n", + "521. ID: auto-labeling-model-1755839089591-320\n", + " Name: N/A\n", + "\n", + "522. ID: auto-labeling-model-1755840078392-806\n", + " Name: N/A\n", + "\n", + "523. ID: auto-labeling-model-1755843001974-210\n", + " Name: N/A\n", + "\n", + "524. ID: auto-labeling-model-1755844906709-250\n", + " Name: N/A\n", + "\n", + "525. ID: auto-labeling-model-1755846971954-69\n", + " Name: N/A\n", + "\n", + "526. 
ID: auto-labeling-model-1755847550122-149\n", + " Name: N/A\n", + "\n", + "527. ID: auto-labeling-model-1755849254781-355\n", + " Name: N/A\n", + "\n", + "528. ID: auto-labeling-model-1755854539631-293\n", + " Name: N/A\n", + "\n", + "529. ID: auto-labeling-model-1756087002299-72\n", + " Name: N/A\n", + "\n", + "530. ID: auto-labeling-model-1756087565828-132\n", + " Name: N/A\n", + "\n", + "531. ID: auto-labeling-model-1756087680461-719\n", + " Name: N/A\n", + "\n", + "532. ID: auto-labeling-model-1756087819774-813\n", + " Name: N/A\n", + "\n", + "533. ID: auto-labeling-model-1756087867761-583\n", + " Name: N/A\n", + "\n", + "534. ID: auto-labeling-model-1756112514075-201\n", + " Name: N/A\n", + "\n", + "535. ID: auto-labeling-model-1756137207447-376\n", + " Name: N/A\n", + "\n", + "536. ID: auto-labeling-model-1756137492728-788\n", + " Name: N/A\n", + "\n", + "537. ID: auto-labeling-model-1756138904093-804\n", + " Name: N/A\n", + "\n", + "538. ID: auto-labeling-model-1756193938984-510\n", + " Name: N/A\n", + "\n", + "539. ID: auto-labeling-model-1756279382223-424\n", + " Name: N/A\n", + "\n", + "540. ID: auto-labeling-model-1756281178604-829\n", + " Name: N/A\n", + "\n", + "541. ID: auto-labeling-model-1756347012781-494\n", + " Name: N/A\n", + "\n", + "542. ID: auto-labeling-model-1756348972897-103\n", + " Name: N/A\n", + "\n", + "543. ID: auto-labeling-model-1756349422839-305\n", + " Name: N/A\n", + "\n", + "544. ID: auto-labeling-model-1756349498730-552\n", + " Name: N/A\n", + "\n", + "545. ID: auto-labeling-model-1756360413351-308\n", + " Name: N/A\n", + "\n", + "546. ID: auto-labeling-model-1756363959156-20\n", + " Name: N/A\n", + "\n", + "547. ID: auto-labeling-model-1756369801529-118\n", + " Name: N/A\n", + "\n", + "548. ID: auto-labeling-model-1756430598758-905\n", + " Name: N/A\n", + "\n", + "549. ID: auto-labeling-model-1756440760505-307\n", + " Name: N/A\n", + "\n", + "550. ID: auto-labeling-model-1756460100800-668\n", + " Name: N/A\n", + "\n", + "551. 
ID: auto-labeling-model-1756460110544-559\n", + " Name: N/A\n", + "\n", + "552. ID: auto-labeling-model-1756693820728-76\n", + " Name: N/A\n", + "\n", + "553. ID: auto-labeling-model-1756912886736-101\n", + " Name: N/A\n", + "\n", + "554. ID: auto-labeling-model-1757497814136-763\n", + " Name: N/A\n", + "\n", + "555. ID: auto-labeling-model-1757663204666-122\n", + " Name: N/A\n", + "\n", + "556. ID: auto-labeling-model-1757995180429-664\n", + " Name: N/A\n", + "\n", + "557. ID: auto-labeling-model-1758045209157-220\n", + " Name: N/A\n", + "\n", + "558. ID: auto-labeling-model-1758045343765-419\n", + " Name: N/A\n", + "\n", + "559. ID: auto-labeling-model-1758182652735-580\n", + " Name: N/A\n", + "\n", + "560. ID: auto-labeling-model-1758551942230-384\n", + " Name: N/A\n", + "\n", + "561. ID: auto-labeling-model-1758693093755-157\n", + " Name: N/A\n", + "\n", + "562. ID: auto-labeling-model-1758703215086-912\n", + " Name: N/A\n", + "\n", + "563. ID: auto-labeling-model-1758742506653-803\n", + " Name: N/A\n", + "\n", + "564. ID: auto-labeling-model-1758859363470-900\n", + " Name: N/A\n", + "\n", + "565. ID: auto-labeling-model-1758861498544-317\n", + " Name: N/A\n", + "\n", + "566. ID: auto-labeling-model-1759166848691-35\n", + " Name: N/A\n", + "\n", + "567. ID: auto-labeling-model-1759310043204-41\n", + " Name: N/A\n", + "\n", + "568. ID: auto-labeling-model-1759334232768-397\n", + " Name: N/A\n", + "\n", + "569. ID: auto-labeling-model-1759817160138-569\n", + " Name: N/A\n", + "\n", + "570. ID: auto-labeling-model-1759956977266-516\n", + " Name: N/A\n", + "\n", + "571. ID: auto-labeling-model-1760426322250-908\n", + " Name: N/A\n", + "\n", + "572. ID: auto-labeling-model-1760479341007-491\n", + " Name: N/A\n", + "\n", + "573. ID: auto-labeling-model-1760479492039-631\n", + " Name: N/A\n", + "\n", + "574. ID: auto-labeling-model-1760479645658-613\n", + " Name: N/A\n", + "\n", + "575. ID: auto-labeling-model-1760479761056-497\n", + " Name: N/A\n", + "\n", + "576. 
ID: auto-labeling-model-1760479780527-626\n", + " Name: N/A\n", + "\n", + "577. ID: auto-labeling-model-1760479932099-212\n", + " Name: N/A\n", + "\n", + "578. ID: auto-labeling-model-1760479949487-358\n", + " Name: N/A\n", + "\n", + "579. ID: auto-labeling-model-1760480389179-217\n", + " Name: N/A\n", + "\n", + "580. ID: auto-labeling-model-1760490988143-30\n", + " Name: N/A\n", + "\n", + "581. ID: auto-labeling-model-1760499252646-774\n", + " Name: N/A\n", + "\n", + "582. ID: auto-labeling-model-1760539714171-740\n", + " Name: N/A\n", + "\n", + "583. ID: auto-labeling-model-1760540221082-518\n", + " Name: N/A\n", + "\n", + "584. ID: auto-labeling-model-1760566206649-192\n", + " Name: N/A\n", + "\n", + "585. ID: auto-labeling-model-1760649718443-469\n", + " Name: N/A\n", + "\n", + "586. ID: auto-labeling-model-1760974810245-633\n", + " Name: N/A\n", + "\n", + "587. ID: auto-labeling-model-1761060285537-410\n", + " Name: N/A\n", + "\n", + "588. ID: auto-labeling-model-1761072595965-766\n", + " Name: N/A\n", + "\n", + "589. ID: auto-labeling-model-1761170322608-61\n", + " Name: N/A\n", + "\n", + "590. ID: auto-labeling-model-1761170507108-187\n", + " Name: N/A\n", + "\n", + "591. ID: cu-eox\n", + " Name: N/A\n", + "\n", + "592. ID: cu-test-2\n", + " Name: N/A\n", + "\n", + "593. ID: cu-test-3\n", + " Name: N/A\n", + "\n", + "594. ID: cu-test\n", + " Name: N/A\n", + "\n", + "595. ID: cu-test3\n", + " Name: N/A\n", + "\n", + "596. ID: cu-trainig-debug\n", + " Name: N/A\n", + "\n", + "597. ID: cx-deloitte-all-items-good\n", + " Name: N/A\n", + "\n", + "598. ID: cx-deloitte-all-items-idex\n", + " Name: N/A\n", + "\n", + "599. ID: cx-deloitte-all-items-keep-one-label\n", + " Name: N/A\n", + "\n", + "600. ID: cx-deloitte-all-items\n", + " Name: N/A\n", + "\n", + "601. ID: cx-deloitte\n", + " Name: N/A\n", + "\n", + "602. ID: default\n", + " Name: N/A\n", + "\n", + "603. ID: document-test\n", + " Name: N/A\n", + "\n", + "604. 
ID: example\n", + " Name: N/A\n", + "\n", + "605. ID: excel\n", + " Name: N/A\n", + "\n", + "606. ID: highlight-analyzer-03673070-1755304831\n", + " Name: N/A\n", + "\n", + "607. ID: highlight-analyzer-1755112569\n", + " Name: N/A\n", + "\n", + "608. ID: highlight-analyzer-1755113090\n", + " Name: N/A\n", + "\n", + "609. ID: highlight-analyzer-1755117427\n", + " Name: N/A\n", + "\n", + "610. ID: highlight-analyzer-1755127191\n", + " Name: N/A\n", + "\n", + "611. ID: highlight-analyzer-1755128917\n", + " Name: N/A\n", + "\n", + "612. ID: highlight-analyzer-1755204485\n", + " Name: N/A\n", + "\n", + "613. ID: highlight-analyzer-1755205148\n", + " Name: N/A\n", + "\n", + "614. ID: highlight-analyzer-1755304423\n", + " Name: N/A\n", + "\n", + "615. ID: highlight-analyzer-49453d78-1755304719\n", + " Name: N/A\n", + "\n", + "616. ID: insurance-test\n", + " Name: N/A\n", + "\n", + "617. ID: invoiceLabeledData\n", + " Name: N/A\n", + "\n", + "618. ID: joann-insurance\n", + " Name: N/A\n", + "\n", + "619. ID: joann-tryout-invoice\n", + " Name: N/A\n", + "\n", + "620. ID: k\n", + " Name: N/A\n", + "\n", + "621. ID: minus\n", + " Name: N/A\n", + "\n", + "622. ID: mySampleAnalyzer\n", + " Name: N/A\n", + "\n", + "623. ID: pro-test\n", + " Name: N/A\n", + "\n", + "624. ID: proExample\n", + " Name: N/A\n", + "\n", + "625. ID: sampleAnalyzer273626\n", + " Name: N/A\n", + "\n", + "626. ID: sampleAnalyzer530775\n", + " Name: N/A\n", + "\n", + "627. ID: sampleAnalyzer679281\n", + " Name: N/A\n", + "\n", + "628. ID: shihw-insurance-0529\n", + " Name: N/A\n", + "\n", + "629. ID: shihw-video-test0528\n", + " Name: N/A\n", + "\n", + "630. ID: soccer-highlights-analyzer-v1\n", + " Name: N/A\n", + "\n", + "631. ID: soccer-highlights-analyzer-v2\n", + " Name: N/A\n", + "\n", + "632. ID: soccer-highlights-analyzer-v3\n", + " Name: N/A\n", + "\n", + "633. ID: soccer-highlights-analyzer-v4\n", + " Name: N/A\n", + "\n", + "634. 
ID: soccer-highlights-analyzer-v5\n", + " Name: N/A\n", + "\n", + "635. ID: soccer-highlights-analyzer1751301403\n", + " Name: N/A\n", + "\n", + "636. ID: soccer-highlights-analyzer1751301722\n", + " Name: N/A\n", + "\n", + "637. ID: soccer-highlights-analyzer2-v1\n", + " Name: N/A\n", + "\n", + "638. ID: soccer-highlights-analyzer5314167881751302137\n", + " Name: N/A\n", + "\n", + "639. ID: soccer-highlights-analyzer5314167881751302581\n", + " Name: N/A\n", + "\n", + "640. ID: soccer-highlights-analyzer5314167881751303949\n", + " Name: N/A\n", + "\n", + "641. ID: soccer-highlights-analyzer5314167881751306147\n", + " Name: N/A\n", + "\n", + "642. ID: soccer-highlights-analyzer5314167881751313349\n", + " Name: N/A\n", + "\n", + "643. ID: soccer-highlights-analyzer5314167881755019232\n", + " Name: N/A\n", + "\n", + "644. ID: soccer-highlights-analyzer5314167881755020564\n", + " Name: N/A\n", + "\n", + "645. ID: soccer-highlights-analyzer5314167881755023993\n", + " Name: N/A\n", + "\n", + "646. ID: soccer-highlights-analyzer5314167881755029594\n", + " Name: N/A\n", + "\n", + "647. ID: soccer-highlights-analyzer54167881751301841\n", + " Name: N/A\n", + "\n", + "648. ID: t\n", + " Name: N/A\n", + "\n", + "649. ID: tes\n", + " Name: N/A\n", + "\n", + "650. ID: test-bar-gap\n", + " Name: N/A\n", + "\n", + "651. ID: test\n", + " Name: N/A\n", + "\n", + "652. ID: testMeow\n", + " Name: N/A\n", + "\n", + "653. ID: tingwliu-invoice-test\n", + " Name: N/A\n", + "\n", + "654. ID: video-250808\n", + " Name: N/A\n", + "\n", + "655. ID: video\n", + " Name: N/A\n", + "\n", + "656. ID: videotest\n", + " Name: N/A\n", + "\n", + "657. ID: yahch-contract-0805-1\n", + " Name: N/A\n", + "\n", + "658. ID: yahch-document-HKinvoice-label-2\n", + " Name: N/A\n", + "\n", + "659. ID: yahch-document-HKinvoice-label-3\n", + " Name: N/A\n", + "\n", + "660. ID: yahch-document-HKinvoice-label-4\n", + " Name: N/A\n", + "\n", + "661. 
ID: yahch-document-HKinvoice-label-5\n", + " Name: N/A\n", + "\n", + "662. ID: yahch-document-HKinvoice-label-6\n", + " Name: N/A\n", + "\n", + "663. ID: yahch-document-HKinvoice-label\n", + " Name: N/A\n", + "\n", + "664. ID: yahch-document-HKinvoice-local-1\n", + " Name: N/A\n", + "\n", + "665. ID: yahch-document-HKinvoice-local-icl-1\n", + " Name: N/A\n", + "\n", + "666. ID: yahch-document-HKinvoice-local-icl-2\n", + " Name: N/A\n", + "\n", + "667. ID: yahch-invoice-HKinvoice-local-icl-1\n", + " Name: N/A\n", + "\n", + "668. ID: yahch-invoice-HKinvoice-local-zeroshot-1\n", + " Name: N/A\n", + "\n", + "669. ID: yahch-invoice-HKinvoice-local-zeroshot-2\n", + " Name: N/A\n", + "\n", + "670. ID: yiyun\n", + " Name: N/A\n", + "\n", + "671. ID: yiyun1223\n", + " Name: N/A\n", + "\n", + "672. ID: yiyun3333\n", + " Name: N/A\n", + "\n", + "673. ID: yiyun65656\n", + " Name: N/A\n", + "\n", + "674. ID: yiyunPromode\n", + " Name: N/A\n", + "\n", + "675. ID: yslin-2025-06-25-generative-date-fields\n", + " Name: N/A\n", + "\n" + ] + } + ], + "source": [ + "# Get all analyzers in your resource\n", + "all_analyzers = client.get_all_analyzers()\n", + "analyzers_list = all_analyzers.get('value', [])\n", + "\n", + "print(f\"Found {len(analyzers_list)} analyzer(s) in your resource\\n\")\n", + "\n", + "# Display analyzer names and IDs\n", + "if analyzers_list:\n", + " print(\"Available analyzers:\")\n", + " for idx, analyzer in enumerate(analyzers_list, 1):\n", + " analyzer_id = analyzer.get('analyzerId', 'N/A')\n", + " analyzer_name = analyzer.get('name', 'N/A')\n", + " print(f\"{idx}. ID: {analyzer_id}\")\n", + " print(f\" Name: {analyzer_name}\")\n", + " print()\n", + "else:\n", + " print(\"No analyzers found. 
Please create an analyzer with training data first.\")\n", + " print(\"See: notebooks/analyzer_training.ipynb for guidance.\")" + ] + }, + { + "cell_type": "markdown", + "id": "8e6ae2ac", + "metadata": {}, + "source": [ + "## Step 2: Select Source Analyzer\n", + "\n", + "Specify the ID of the analyzer whose training data you want to reuse.\n", + "\n", + "**Option 1**: Set `SOURCE_ANALYZER_ID` to an existing analyzer ID from the list above.\n", + "\n", + "**Option 2**: If you don't have an analyzer with training data, uncomment and run the next cell to create one first." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "9772b0f5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Source Analyzer ID: invoiceLabeledData\n" + ] + } + ], + "source": [ + "# OPTION 1: Specify an existing analyzer ID that has training data\n", + "# Replace this with your actual analyzer ID\n", + "SOURCE_ANALYZER_ID = \"invoiceLabeledData\"\n", + "\n", + "# Uncomment to use the first analyzer from the list\n", + "# if analyzers_list:\n", + "# SOURCE_ANALYZER_ID = analyzers_list[0].get('id')\n", + "# print(f\"Using first analyzer: {SOURCE_ANALYZER_ID}\")\n", + "\n", + "print(f\"Source Analyzer ID: {SOURCE_ANALYZER_ID}\")" + ] + }, + { + "cell_type": "markdown", + "id": "d7ceffda", + "metadata": {}, + "source": [ + "### Option 2: Create a Source Analyzer with Training Data (Optional)\n", + "\n", + "If you don't have an existing analyzer with training data, run this cell to create one first.\n", + "\n", + "**Prerequisites**:\n", + "- Set environment variables for training data (see [docs/set_env_for_training_data_and_reference_doc.md](../docs/set_env_for_training_data_and_reference_doc.md))\n", + "- Ensure you have labeled training data in `../data/document_training/`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1ce228bd", + "metadata": {}, + "outputs": [], + "source": [ + "# Uncomment this entire 
cell if you need to create a source analyzer first\n", + "\n", + "# from azure.storage.blob import ContainerSasPermissions\n", + "\n", + "# # Configure training data\n", + "# analyzer_template_path = \"../analyzer_templates/receipt.json\"\n", + "# training_docs_folder = \"../data/document_training\"\n", + "\n", + "# # Get or generate SAS URL\n", + "# training_data_sas_url = os.getenv(\"TRAINING_DATA_SAS_URL\")\n", + "# if not training_data_sas_url:\n", + "# TRAINING_DATA_STORAGE_ACCOUNT_NAME = os.getenv(\"TRAINING_DATA_STORAGE_ACCOUNT_NAME\")\n", + "# TRAINING_DATA_CONTAINER_NAME = os.getenv(\"TRAINING_DATA_CONTAINER_NAME\")\n", + "# if not TRAINING_DATA_STORAGE_ACCOUNT_NAME:\n", + "# raise ValueError(\n", + "# \"Please set either TRAINING_DATA_SAS_URL or both TRAINING_DATA_STORAGE_ACCOUNT_NAME \"\n", + "# \"and TRAINING_DATA_CONTAINER_NAME environment variables.\"\n", + "# )\n", + "# training_data_sas_url = AzureContentUnderstandingClient.generate_temp_container_sas_url(\n", + "# account_name=TRAINING_DATA_STORAGE_ACCOUNT_NAME,\n", + "# container_name=TRAINING_DATA_CONTAINER_NAME,\n", + "# permissions=ContainerSasPermissions(read=True, write=True, list=True),\n", + "# expiry_hours=1,\n", + "# )\n", + "\n", + "# training_data_path = os.getenv(\"TRAINING_DATA_PATH\")\n", + "\n", + "# # Upload training data to blob storage\n", + "# print(\"Uploading training data to blob storage...\")\n", + "# await client.generate_training_data_on_blob(training_docs_folder, training_data_sas_url, training_data_path)\n", + "# print(\"✅ Training data uploaded successfully!\")\n", + "\n", + "# # Create source analyzer\n", + "# SOURCE_ANALYZER_ID = \"source-analyzer-\" + str(uuid.uuid4())\n", + "# print(f\"Creating source analyzer: {SOURCE_ANALYZER_ID}\")\n", + "\n", + "# response = client.begin_create_analyzer(\n", + "# SOURCE_ANALYZER_ID,\n", + "# analyzer_template_path=analyzer_template_path,\n", + "# training_storage_container_sas_url=training_data_sas_url,\n", + "# 
training_storage_container_path_prefix=training_data_path,\n", + "# )\n", + "# result = client.poll_result(response)\n", + "# print(\"✅ Source analyzer created successfully!\")\n", + "# print(json.dumps(result, indent=2))" + ] + }, + { + "cell_type": "markdown", + "id": "d9b1bc93", + "metadata": {}, + "source": [ + "## Step 3: Retrieve Source Analyzer Details\n", + "\n", + "Now we'll fetch the complete definition of the source analyzer, including its training data configuration." + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "b2c9ae0c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Source Analyzer: invoiceLabeledData\n", + "Name: N/A\n", + "Description: \n", + "\n", + "Full analyzer definition:\n", + "{\n", + " \"analyzerId\": \"invoiceLabeledData\",\n", + " \"description\": \"\",\n", + " \"tags\": {\n", + " \"projectId\": \"d7afeaa4-fe05-4df7-bd7c-46f3a94a96cb\",\n", + " \"templateId\": \"document-2025-05-01\"\n", + " },\n", + " \"createdAt\": \"2025-10-22T22:03:08Z\",\n", + " \"lastModifiedAt\": \"2025-10-22T22:03:11Z\",\n", + " \"baseAnalyzerId\": \"prebuilt-documentAnalyzer\",\n", + " \"config\": {\n", + " \"returnDetails\": true,\n", + " \"enableOcr\": true,\n", + " \"enableLayout\": true,\n", + " \"enableFormula\": false,\n", + " \"disableContentFiltering\": false,\n", + " \"tableFormat\": \"html\",\n", + " \"estimateFieldSourceAndConfidence\": false\n", + " },\n", + " \"fieldSchema\": {\n", + " \"fields\": {\n", + " \"CompanyName\": {\n", + " \"type\": \"string\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Name of the pharmaceutical company involved in the rebate program\"\n", + " },\n", + " \"ProductDetails\": {\n", + " \"type\": \"array\",\n", + " \"description\": \"List of products with rebate and unit details\",\n", + " \"items\": {\n", + " \"type\": \"object\",\n", + " \"description\": \"Details of a single product\",\n", + " \"properties\": {\n", + " 
\"ProductPackageCode\": {\n", + " \"type\": \"string\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Code representing the product or package\"\n", + " },\n", + " \"ProductName\": {\n", + " \"type\": \"string\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Name of the product\"\n", + " },\n", + " \"FfsimcoRecordId\": {\n", + " \"type\": \"string\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Record ID for FFSIMCO\"\n", + " },\n", + " \"RebatePerUnit\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Rebate amount per unit of the product\"\n", + " },\n", + " \"AdjustedRebatePerUnit\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Adjusted rebate amount per unit\"\n", + " },\n", + " \"UnitsInvoiced\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Number of units invoiced\"\n", + " },\n", + " \"UnitsPaid\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Number of units for which payment was made\"\n", + " },\n", + " \"RebateAmountInvoiced\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Total rebate amount invoiced\"\n", + " },\n", + " \"RebateAmountPaid\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Total rebate amount paid\"\n", + " }\n", + " }\n", + " }\n", + " },\n", + " \"TotalPaid\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Total payment amount \"\n", + " }\n", + " }\n", + " },\n", + " \"trainingData\": {\n", + " \"containerUrl\": \"https://staistudiote203841201294.blob.core.windows.net/7c123b64-9378-4fa7-a807-081efa839c00-cu\",\n", + " \"kind\": \"blob\",\n", + " \"prefix\": \"labelingProjects/d7afeaa4-fe05-4df7-bd7c-46f3a94a96cb/train\"\n", + " },\n", + " \"warnings\": [],\n", + " \"status\": 
\"ready\",\n", + " \"processingLocation\": \"geography\",\n", + " \"mode\": \"standard\"\n", + "}\n" + ] + } + ], + "source": [ + "# Get detailed information about the source analyzer\n", + "source_analyzer = client.get_analyzer_detail_by_id(SOURCE_ANALYZER_ID)\n", + "\n", + "print(f\"Source Analyzer: {SOURCE_ANALYZER_ID}\")\n", + "print(f\"Name: {source_analyzer.get('name', 'N/A')}\")\n", + "print(f\"Description: {source_analyzer.get('description', 'N/A')}\")\n", + "print(\"\\nFull analyzer definition:\")\n", + "print(json.dumps(source_analyzer, indent=2))" + ] + }, + { + "cell_type": "markdown", + "id": "3eb0b65d", + "metadata": {}, + "source": [ + "## Step 4: Extract Training Data Configuration\n", + "\n", + "Extract the training data configuration from the source analyzer. This includes:\n", + "- **trainingData**: The blob container location with labeled examples\n", + "- **fieldSchema**: The field definitions\n", + "- **tags**: Project and template metadata (important for Azure AI Foundry project association)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "7c57655f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "📦 Training Data Configuration:\n", + "{\n", + " \"containerUrl\": \"https://staistudiote203841201294.blob.core.windows.net/7c123b64-9378-4fa7-a807-081efa839c00-cu\",\n", + " \"kind\": \"blob\",\n", + " \"prefix\": \"labelingProjects/d7afeaa4-fe05-4df7-bd7c-46f3a94a96cb/train\"\n", + "}\n", + "\n", + "✅ Found training data at: https://staistudiote203841201294.blob.core.windows.net/7c123b64-9378-4fa7-a807-081efa839c00-cu\n", + " Path prefix: labelingProjects/d7afeaa4-fe05-4df7-bd7c-46f3a94a96cb/train\n", + "\n", + "📚 Knowledge Sources Configuration:\n", + "No knowledge sources configured (this is normal for standard mode)\n", + "\n", + "📋 Field Schema:\n", + "{\n", + " \"fields\": {\n", + " \"CompanyName\": {\n", + " \"type\": \"string\",\n", + " \"method\": \"extract\",\n", + " 
\"description\": \"Name of the pharmaceutical company involved in the rebate program\"\n", + " },\n", + " \"ProductDetails\": {\n", + " \"type\": \"array\",\n", + " \"description\": \"List of products with rebate and unit details\",\n", + " \"items\": {\n", + " \"type\": \"object\",\n", + " \"description\": \"Details of a single product\",\n", + " \"properties\": {\n", + " \"ProductPackageCode\": {\n", + " \"type\": \"string\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Code representing the product or package\"\n", + " },\n", + " \"ProductName\": {\n", + " \"type\": \"string\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Name of the product\"\n", + " },\n", + " \"FfsimcoRecordId\": {\n", + " \"type\": \"string\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Record ID for FFSIMCO\"\n", + " },\n", + " \"RebatePerUnit\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Rebate amount per unit of the product\"\n", + " },\n", + " \"AdjustedRebatePerUnit\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Adjusted rebate amount per unit\"\n", + " },\n", + " \"UnitsInvoiced\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Number of units invoiced\"\n", + " },\n", + " \"UnitsPaid\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Number of units for which payment was made\"\n", + " },\n", + " \"RebateAmountInvoiced\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Total rebate amount invoiced\"\n", + " },\n", + " \"RebateAmountPaid\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Total rebate amount paid\"\n", + " }\n", + " }\n", + " }\n", + " },\n", + " \"TotalPaid\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": 
\"Total payment amount \"\n", + " }\n", + " }\n", + "}\n", + "\n", + "🏷️ Tags (Project & Template Metadata):\n", + "{\n", + " \"projectId\": \"d7afeaa4-fe05-4df7-bd7c-46f3a94a96cb\",\n", + " \"templateId\": \"document-2025-05-01\"\n", + "}\n", + "\n", + "✅ Found Project ID: d7afeaa4-fe05-4df7-bd7c-46f3a94a96cb\n", + "✅ Found Template ID: document-2025-05-01\n", + "\n", + "💡 These tags will be copied to ensure the new analyzer appears in the same Azure AI Foundry project.\n" + ] + } + ], + "source": [ + "# Extract training data configuration\n", + "training_data_config = source_analyzer.get('trainingData')\n", + "knowledge_sources_config = source_analyzer.get('knowledgeSources')\n", + "field_schema = source_analyzer.get('fieldSchema', {})\n", + "tags = source_analyzer.get('tags', {})\n", + "\n", + "print(\"📦 Training Data Configuration:\")\n", + "if training_data_config:\n", + " print(json.dumps(training_data_config, indent=2))\n", + " container_url = training_data_config.get('containerUrl', 'N/A')\n", + " prefix = training_data_config.get('prefix', '')\n", + " print(f\"\\n✅ Found training data at: {container_url}\")\n", + " print(f\" Path prefix: {prefix}\")\n", + "else:\n", + " print(\"⚠️ No training data found in this analyzer.\")\n", + " print(\" Please select an analyzer that has training data configured.\")\n", + "\n", + "print(\"\\n📚 Knowledge Sources Configuration:\")\n", + "if knowledge_sources_config:\n", + " print(json.dumps(knowledge_sources_config, indent=2))\n", + "else:\n", + " print(\"No knowledge sources configured (this is normal for standard mode)\")\n", + "\n", + "print(\"\\n📋 Field Schema:\")\n", + "print(json.dumps(field_schema, indent=2))\n", + "\n", + "print(\"\\n🏷️ Tags (Project & Template Metadata):\")\n", + "if tags:\n", + " print(json.dumps(tags, indent=2))\n", + " project_id = tags.get('projectId')\n", + " template_id = tags.get('templateId')\n", + " if project_id:\n", + " print(f\"\\n✅ Found Project ID: {project_id}\")\n", + " if 
template_id:\n", + " print(f\"✅ Found Template ID: {template_id}\")\n", + " print(\"\\n💡 These tags will be copied to ensure the new analyzer appears in the same Azure AI Foundry project.\")\n", + "else:\n", + " print(\"No tags found (the new analyzer may not be associated with a Foundry project)\")" + ] + }, + { + "cell_type": "markdown", + "id": "e7770461", + "metadata": {}, + "source": [ + "## Step 5: Create New Analyzer with Existing Training Data\n", + "\n", + "Now we'll create a new analyzer that references the same training data. This new analyzer will:\n", + "- Use the same blob storage container and path\n", + "- Start with the same field schema (you can modify this)\n", + "- Have its own unique ID\n", + "- **Include the same tags** (projectId and templateId) to ensure it appears in the correct Azure AI Foundry project\n", + "\n", + "### Key Points:\n", + "- **Same resource**: Both analyzers are in the same Azure AI resource\n", + "- **No data duplication**: The training data stays in one place\n", + "- **Same project**: Tags ensure the analyzer appears in the same Foundry project\n", + "- **Independent lifecycle**: Each analyzer can be updated or deleted independently" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "98b0c9c3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Including tags from source analyzer (ensures correct project association in Foundry)\n", + " Project ID: d7afeaa4-fe05-4df7-bd7c-46f3a94a96cb\n", + " Template ID: document-2025-05-01\n", + "\n", + "Creating new analyzer: cloned-analyzer-c073f24d-5659-42ed-8ac8-b083bde79a9b\n", + "\n", + "New analyzer payload (ordered to match API structure):\n", + "{\n", + " \"description\": \"Created from invoiceLabeledData with reused training data\",\n", + " \"tags\": {\n", + " \"projectId\": \"d7afeaa4-fe05-4df7-bd7c-46f3a94a96cb\",\n", + " \"templateId\": \"document-2025-05-01\"\n", + " },\n", + " \"baseAnalyzerId\": 
\"prebuilt-documentAnalyzer\",\n", + " \"config\": {\n", + " \"returnDetails\": true,\n", + " \"enableOcr\": true,\n", + " \"enableLayout\": true,\n", + " \"enableFormula\": false,\n", + " \"disableContentFiltering\": false,\n", + " \"tableFormat\": \"html\",\n", + " \"estimateFieldSourceAndConfidence\": false\n", + " },\n", + " \"fieldSchema\": {\n", + " \"fields\": {\n", + " \"CompanyName\": {\n", + " \"type\": \"string\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Name of the pharmaceutical company involved in the rebate program\"\n", + " },\n", + " \"ProductDetails\": {\n", + " \"type\": \"array\",\n", + " \"description\": \"List of products with rebate and unit details\",\n", + " \"items\": {\n", + " \"type\": \"object\",\n", + " \"description\": \"Details of a single product\",\n", + " \"properties\": {\n", + " \"ProductPackageCode\": {\n", + " \"type\": \"string\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Code representing the product or package\"\n", + " },\n", + " \"ProductName\": {\n", + " \"type\": \"string\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Name of the product\"\n", + " },\n", + " \"FfsimcoRecordId\": {\n", + " \"type\": \"string\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Record ID for FFSIMCO\"\n", + " },\n", + " \"RebatePerUnit\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Rebate amount per unit of the product\"\n", + " },\n", + " \"AdjustedRebatePerUnit\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Adjusted rebate amount per unit\"\n", + " },\n", + " \"UnitsInvoiced\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Number of units invoiced\"\n", + " },\n", + " \"UnitsPaid\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Number of units for which payment was made\"\n", + " },\n", + " 
\"RebateAmountInvoiced\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Total rebate amount invoiced\"\n", + " },\n", + " \"RebateAmountPaid\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Total rebate amount paid\"\n", + " }\n", + " }\n", + " }\n", + " },\n", + " \"TotalPaid\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Total payment amount \"\n", + " }\n", + " }\n", + " },\n", + " \"mode\": \"standard\"\n", + "}\n", + "\n", + "📦 Training data will be configured separately:\n", + " Container URL: https://staistudiote203841201294.blob.core.windows.net/7c123b64-9378-4fa7-a807-081efa839c00-cu\n", + " Prefix: labelingProjects/d7afeaa4-fe05-4df7-bd7c-46f3a94a96cb/train\n" + ] + } + ], + "source": [ + "# Verify we have training data before proceeding\n", + "if not training_data_config:\n", + " raise ValueError(\n", + " \"Cannot proceed: Source analyzer does not have training data. \"\n", + " \"Please select an analyzer with training data or create one using the optional cell above.\"\n", + " )\n", + "\n", + "# Create a new analyzer ID\n", + "# Analyzer names must be 1-64 characters and only contain letters, numbers, dots, underscores, or hyphens\n", + "NEW_ANALYZER_ID = \"cloned-analyzer-\" + str(uuid.uuid4())\n", + "\n", + "# Build the new analyzer payload in the correct order matching the API structure\n", + "# Note: Read-only fields like createdAt, lastModifiedAt, status, etc. are omitted as they're set by the service\n", + "new_analyzer_payload = {}\n", + "\n", + "# 1. Analyzer ID (not needed as it's passed separately, but kept for reference)\n", + "# new_analyzer_payload[\"analyzerId\"] = NEW_ANALYZER_ID\n", + "\n", + "# 2. Description\n", + "new_analyzer_payload[\"description\"] = f\"Created from {SOURCE_ANALYZER_ID} with reused training data\"\n", + "\n", + "# 3. 
Tags (projectId and templateId) - IMPORTANT for Foundry project association\n", + "if tags:\n", + " new_analyzer_payload[\"tags\"] = tags\n", + " print(\"✅ Including tags from source analyzer (ensures correct project association in Foundry)\")\n", + " print(f\" Project ID: {tags.get('projectId', 'N/A')}\")\n", + " print(f\" Template ID: {tags.get('templateId', 'N/A')}\")\n", + "else:\n", + " print(\"⚠️ No tags found in source analyzer - new analyzer may not appear in Foundry project\")\n", + "\n", + "# 4. Base Analyzer ID (if present)\n", + "if 'baseAnalyzerId' in source_analyzer:\n", + " new_analyzer_payload['baseAnalyzerId'] = source_analyzer['baseAnalyzerId']\n", + "\n", + "# 5. Config settings\n", + "if 'config' in source_analyzer:\n", + " new_analyzer_payload['config'] = source_analyzer['config']\n", + "\n", + "# 6. Field Schema\n", + "new_analyzer_payload[\"fieldSchema\"] = field_schema\n", + "\n", + "# 7. Training Data - Will be passed separately to begin_create_analyzer()\n", + "# Note: We extract the container URL and prefix to pass as separate parameters\n", + "training_container_sas_url = training_data_config.get('containerUrl', '')\n", + "training_container_prefix = training_data_config.get('prefix', '')\n", + "\n", + "# 8. Knowledge Sources (if present - typically for Pro mode)\n", + "# Extract these separately if they exist\n", + "pro_mode_container_sas_url = \"\"\n", + "pro_mode_container_prefix = \"\"\n", + "if knowledge_sources_config and isinstance(knowledge_sources_config, list) and len(knowledge_sources_config) > 0:\n", + " # Get the first knowledge source (typically there's only one)\n", + " first_knowledge_source = knowledge_sources_config[0]\n", + " pro_mode_container_sas_url = first_knowledge_source.get('containerUrl', '')\n", + " pro_mode_container_prefix = first_knowledge_source.get('prefix', '')\n", + "\n", + "# 9. 
Mode (if present)\n", + "if 'mode' in source_analyzer:\n", + " new_analyzer_payload['mode'] = source_analyzer['mode']\n", + "\n", + "print(f\"\\nCreating new analyzer: {NEW_ANALYZER_ID}\")\n", + "print(\"\\nNew analyzer payload (ordered to match API structure):\")\n", + "print(json.dumps(new_analyzer_payload, indent=2))\n", + "\n", + "print(\"\\n📦 Training data will be configured separately:\")\n", + "print(f\" Container URL: {training_container_sas_url}\")\n", + "print(f\" Prefix: {training_container_prefix}\")\n", + "\n", + "if pro_mode_container_sas_url:\n", + " print(\"\\n📚 Pro mode reference docs will be configured separately:\")\n", + " print(f\" Container URL: {pro_mode_container_sas_url}\")\n", + " print(f\" Prefix: {pro_mode_container_prefix}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "385a0867", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:python.content_understanding_client:Analyzer cloned-analyzer-c073f24d-5659-42ed-8ac8-b083bde79a9b create request accepted.\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + 
"INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + 
"INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + 
"INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + 
"INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + 
"INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + 
"INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + 
"INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request result is ready after 152.25 seconds.\n", + "INFO:python.content_understanding_client:Request result is ready after 152.25 seconds.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Successfully created new analyzer: cloned-analyzer-c073f24d-5659-42ed-8ac8-b083bde79a9b\n", + "\n", + "Creation result:\n", + "{\n", + " \"id\": \"a22ddf12-3156-4a9a-9675-7b85789a8686\",\n", + " \"status\": \"Succeeded\",\n", + " \"result\": {\n", + " \"analyzerId\": \"cloned-analyzer-c073f24d-5659-42ed-8ac8-b083bde79a9b\",\n", + " \"description\": \"Created from invoiceLabeledData with reused training data\",\n", + " \"tags\": {\n", + " \"projectId\": \"d7afeaa4-fe05-4df7-bd7c-46f3a94a96cb\",\n", + " \"templateId\": \"document-2025-05-01\"\n", + " },\n", + " \"createdAt\": \"2025-10-22T22:44:56Z\",\n", + " \"lastModifiedAt\": 
\"2025-10-22T22:47:27Z\",\n", + " \"baseAnalyzerId\": \"prebuilt-documentAnalyzer\",\n", + " \"config\": {\n", + " \"returnDetails\": true,\n", + " \"enableOcr\": true,\n", + " \"enableLayout\": true,\n", + " \"enableFormula\": false,\n", + " \"disableContentFiltering\": false,\n", + " \"tableFormat\": \"html\",\n", + " \"estimateFieldSourceAndConfidence\": false\n", + " },\n", + " \"fieldSchema\": {\n", + " \"fields\": {\n", + " \"CompanyName\": {\n", + " \"type\": \"string\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Name of the pharmaceutical company involved in the rebate program\"\n", + " },\n", + " \"ProductDetails\": {\n", + " \"type\": \"array\",\n", + " \"description\": \"List of products with rebate and unit details\",\n", + " \"items\": {\n", + " \"type\": \"object\",\n", + " \"description\": \"Details of a single product\",\n", + " \"properties\": {\n", + " \"ProductPackageCode\": {\n", + " \"type\": \"string\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Code representing the product or package\"\n", + " },\n", + " \"ProductName\": {\n", + " \"type\": \"string\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Name of the product\"\n", + " },\n", + " \"FfsimcoRecordId\": {\n", + " \"type\": \"string\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Record ID for FFSIMCO\"\n", + " },\n", + " \"RebatePerUnit\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Rebate amount per unit of the product\"\n", + " },\n", + " \"AdjustedRebatePerUnit\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Adjusted rebate amount per unit\"\n", + " },\n", + " \"UnitsInvoiced\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Number of units invoiced\"\n", + " },\n", + " \"UnitsPaid\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Number of 
units for which payment was made\"\n", + " },\n", + " \"RebateAmountInvoiced\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Total rebate amount invoiced\"\n", + " },\n", + " \"RebateAmountPaid\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Total rebate amount paid\"\n", + " }\n", + " }\n", + " }\n", + " },\n", + " \"TotalPaid\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Total payment amount \"\n", + " }\n", + " }\n", + " },\n", + " \"trainingData\": {\n", + " \"containerUrl\": \"https://staistudiote203841201294.blob.core.windows.net/7c123b64-9378-4fa7-a807-081efa839c00-cu\",\n", + " \"kind\": \"blob\",\n", + " \"prefix\": \"labelingProjects/d7afeaa4-fe05-4df7-bd7c-46f3a94a96cb/train/\"\n", + " },\n", + " \"warnings\": [],\n", + " \"status\": \"ready\",\n", + " \"processingLocation\": \"geography\",\n", + " \"mode\": \"standard\"\n", + " }\n", + "}\n" + ] + } + ], + "source": [ + "# Create the new analyzer\n", + "# Pass training data and knowledge sources as separate parameters\n", + "response = client.begin_create_analyzer(\n", + " NEW_ANALYZER_ID,\n", + " analyzer_template=new_analyzer_payload,\n", + " training_storage_container_sas_url=training_container_sas_url,\n", + " training_storage_container_path_prefix=training_container_prefix,\n", + ")\n", + "\n", + "result = client.poll_result(response)\n", + "\n", + "if result and result.get('status') == 'Succeeded':\n", + " print(f\"✅ Successfully created new analyzer: {NEW_ANALYZER_ID}\")\n", + " print(\"\\nCreation result:\")\n", + " print(json.dumps(result, indent=2))\n", + "else:\n", + " print(\"⚠️ Analyzer creation encountered an issue.\")\n", + " print(json.dumps(result, indent=2))" + ] + }, + { + "cell_type": "markdown", + "id": "63295659", + "metadata": {}, + "source": [ + "## Step 6: Verify the New Analyzer\n", + "\n", + "Let's confirm the new analyzer was created 
correctly and is using the same training data." + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "685ff06f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "New Analyzer: cloned-analyzer-c073f24d-5659-42ed-8ac8-b083bde79a9b\n", + "Name: N/A\n", + "Description: Created from invoiceLabeledData with reused training data\n", + "\n", + "Training Data Configuration:\n", + "{\n", + " \"containerUrl\": \"https://staistudiote203841201294.blob.core.windows.net/7c123b64-9378-4fa7-a807-081efa839c00-cu\",\n", + " \"kind\": \"blob\",\n", + " \"prefix\": \"labelingProjects/d7afeaa4-fe05-4df7-bd7c-46f3a94a96cb/train/\"\n", + "}\n", + "\n", + "✅ Verification successful: Both analyzers reference the same training data location!\n" + ] + } + ], + "source": [ + "# Get details of the newly created analyzer\n", + "new_analyzer = client.get_analyzer_detail_by_id(NEW_ANALYZER_ID)\n", + "\n", + "print(f\"New Analyzer: {NEW_ANALYZER_ID}\")\n", + "print(f\"Name: {new_analyzer.get('name', 'N/A')}\")\n", + "print(f\"Description: {new_analyzer.get('description', 'N/A')}\")\n", + "print(\"\\nTraining Data Configuration:\")\n", + "print(json.dumps(new_analyzer.get('trainingData', {}), indent=2))\n", + "\n", + "# Verify the training data location matches\n", + "new_training_data = new_analyzer.get('trainingData', {})\n", + "original_container = training_data_config.get('containerUrl', '')\n", + "new_container = new_training_data.get('containerUrl', '')\n", + "\n", + "if original_container == new_container:\n", + " print(\"\\n✅ Verification successful: Both analyzers reference the same training data location!\")\n", + "else:\n", + " print(\"\\n⚠️ Warning: Training data locations don't match.\")\n", + " print(f\"Original: {original_container}\")\n", + " print(f\"New: {new_container}\")" + ] + }, + { + "cell_type": "markdown", + "id": "fe3352c9", + "metadata": {}, + "source": [ + "## Step 7: Test Both Analyzers\n", + "\n", + "Now 
let's test both analyzers with a sample file to verify they both work correctly with the shared training data." + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "cc934efd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Testing with file: ../data/receipt.png\n" + ] + } + ], + "source": [ + "# Specify a test file - adjust this path based on your analyzer type\n", + "# For receipt analyzers:\n", + "test_file = \"../data/receipt.png\"\n", + "\n", + "# For invoice analyzers:\n", + "# test_file = \"../data/invoice.pdf\"\n", + "\n", + "# For custom documents:\n", + "# test_file = \"../data/your-document.pdf\"\n", + "\n", + "# Verify the file exists\n", + "if not Path(test_file).exists():\n", + " print(f\"⚠️ Test file not found: {test_file}\")\n", + " print(\"Please adjust the test_file path to match your use case.\")\n", + "else:\n", + " print(f\"Testing with file: {test_file}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "273dd85c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "📝 Analyzing with SOURCE analyzer: invoiceLabeledData\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:python.content_understanding_client:Analyzing file ../data/receipt.png with analyzer: invoiceLabeledData\n", + "INFO:python.content_understanding_client:Request 80b00372-a498-4564-9ff1-1e6901778a2d in progress ...\n", + "INFO:python.content_understanding_client:Request 80b00372-a498-4564-9ff1-1e6901778a2d in progress ...\n", + "INFO:python.content_understanding_client:Request 80b00372-a498-4564-9ff1-1e6901778a2d in progress ...\n", + "INFO:python.content_understanding_client:Request 80b00372-a498-4564-9ff1-1e6901778a2d in progress ...\n", + "INFO:python.content_understanding_client:Request result is ready after 4.71 seconds.\n", + "INFO:python.content_understanding_client:Request result is ready after 4.71 
seconds.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Source Analyzer Results:\n", + "Extracted 3 field(s)\n", + " - CompanyName: {'type': 'string', 'valueString': 'Contoso'}\n", + " - ProductDetails: {'type': 'array'}\n", + " - TotalPaid: {'type': 'number', 'valueNumber': 2516.28}\n" + ] + } + ], + "source": [ + "# Test the original analyzer\n", + "if Path(test_file).exists():\n", + " print(f\"\\n📝 Analyzing with SOURCE analyzer: {SOURCE_ANALYZER_ID}\")\n", + " response_source = client.begin_analyze(SOURCE_ANALYZER_ID, file_location=test_file)\n", + " result_source = client.poll_result(response_source)\n", + " \n", + " print(\"\\nSource Analyzer Results:\")\n", + " # Print a summary of extracted fields\n", + " if result_source.get('status') == 'Succeeded':\n", + " result_data = result_source.get('result', {})\n", + " fields = result_data.get('contents', [{}])[0].get('fields', {})\n", + " print(f\"Extracted {len(fields)} field(s)\")\n", + " for field_name, field_value in fields.items():\n", + " print(f\" - {field_name}: {field_value}\")\n", + " else:\n", + " print(json.dumps(result_source, indent=2))" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "e9654313", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "📝 Analyzing with NEW analyzer: cloned-analyzer-c073f24d-5659-42ed-8ac8-b083bde79a9b\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:python.content_understanding_client:Analyzing file ../data/receipt.png with analyzer: cloned-analyzer-c073f24d-5659-42ed-8ac8-b083bde79a9b\n", + "INFO:python.content_understanding_client:Request 5d982b83-4b1c-4e99-b045-48e36cb5a7e3 in progress ...\n", + "INFO:python.content_understanding_client:Request 5d982b83-4b1c-4e99-b045-48e36cb5a7e3 in progress ...\n", + "INFO:python.content_understanding_client:Request 5d982b83-4b1c-4e99-b045-48e36cb5a7e3 in progress ...\n", + 
"INFO:python.content_understanding_client:Request 5d982b83-4b1c-4e99-b045-48e36cb5a7e3 in progress ...\n", + "INFO:python.content_understanding_client:Request result is ready after 4.72 seconds.\n", + "INFO:python.content_understanding_client:Request result is ready after 4.72 seconds.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "New Analyzer Results:\n", + "Extracted 3 field(s)\n", + " - CompanyName: {'type': 'string', 'valueString': 'Contoso'}\n", + " - ProductDetails: {'type': 'array'}\n", + " - TotalPaid: {'type': 'number', 'valueNumber': 2516.28}\n", + "\n", + "✅ Both analyzers successfully processed the file using the shared training data!\n" + ] + } + ], + "source": [ + "# Test the new analyzer\n", + "if Path(test_file).exists():\n", + " print(f\"\\n📝 Analyzing with NEW analyzer: {NEW_ANALYZER_ID}\")\n", + " response_new = client.begin_analyze(NEW_ANALYZER_ID, file_location=test_file)\n", + " result_new = client.poll_result(response_new)\n", + " \n", + " print(\"\\nNew Analyzer Results:\")\n", + " # Print a summary of extracted fields\n", + " if result_new.get('status') == 'Succeeded':\n", + " result_data = result_new.get('result', {})\n", + " fields = result_data.get('contents', [{}])[0].get('fields', {})\n", + " print(f\"Extracted {len(fields)} field(s)\")\n", + " for field_name, field_value in fields.items():\n", + " print(f\" - {field_name}: {field_value}\")\n", + " else:\n", + " print(json.dumps(result_new, indent=2))\n", + " \n", + " print(\"\\n✅ Both analyzers successfully processed the file using the shared training data!\")" + ] + }, + { + "cell_type": "markdown", + "id": "f913b6dd", + "metadata": {}, + "source": [ + "## Step 8: Compare Results (Optional)\n", + "\n", + "Let's compare the full results from both analyzers side by side." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c6467b3f", + "metadata": {}, + "outputs": [], + "source": [ + "if Path(test_file).exists():\n", + " print(\"=\" * 80)\n", + " print(\"SOURCE ANALYZER FULL RESULTS\")\n", + " print(\"=\" * 80)\n", + " print(json.dumps(result_source, indent=2))\n", + " \n", + " print(\"\\n\" + \"=\" * 80)\n", + " print(\"NEW ANALYZER FULL RESULTS\")\n", + " print(\"=\" * 80)\n", + " print(json.dumps(result_new, indent=2))" + ] + }, + { + "cell_type": "markdown", + "id": "5f65f05c", + "metadata": {}, + "source": [ + "## Step 9: Cleanup (Optional)\n", + "\n", + "If you want to clean up the test analyzers, you can delete them. In production, you typically keep analyzers for reuse.\n", + "\n", + "⚠️ **Warning**: This will permanently delete the analyzer. The training data in blob storage will remain unaffected." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "00cde3ff", + "metadata": {}, + "outputs": [], + "source": [ + "# Uncomment to delete the new analyzer\n", + "# print(f\"Deleting new analyzer: {NEW_ANALYZER_ID}\")\n", + "# client.delete_analyzer(NEW_ANALYZER_ID)\n", + "# print(\"✅ New analyzer deleted\")\n", + "\n", + "# Uncomment to also delete the source analyzer (be careful!)\n", + "# print(f\"Deleting source analyzer: {SOURCE_ANALYZER_ID}\")\n", + "# client.delete_analyzer(SOURCE_ANALYZER_ID)\n", + "# print(\"✅ Source analyzer deleted\")" + ] + }, + { + "cell_type": "markdown", + "id": "d952dfef", + "metadata": {}, + "source": [ + "## Summary\n", + "\n", + "🎉 **Congratulations!** You have successfully:\n", + "\n", + "✅ Retrieved an existing analyzer with training data \n", + "✅ Extracted the training data configuration \n", + "✅ Created a new analyzer referencing the same training data \n", + "✅ Verified both analyzers work correctly \n", + "✅ Tested both analyzers with a sample file \n", + "\n", + "### Key Takeaways\n", + "\n", + "- **No data duplication**: Both analyzers reference 
the same blob storage location\n", + "- **Same resource**: Both analyzers use the same authentication and access permissions\n", + "- **Field portability**: You can maintain stable `fieldId`s across different analyzer versions\n", + "- **Rapid iteration**: Test schema changes quickly without re-uploading training data\n", + "\n", + "### Best Practices\n", + "\n", + "1. **Stable field IDs**: Keep `fieldId`s consistent across analyzers for easier migration\n", + "2. **Version control**: Maintain analyzer schemas in source control\n", + "3. **Documentation**: Document which blob paths contain which training datasets\n", + "4. **Testing**: Always test a new analyzer before deleting the original\n", + "5. **Naming conventions**: Use descriptive analyzer IDs that indicate purpose and version\n", + "\n", + "### Next Steps\n", + "\n", + "- Modify the field schema in the new analyzer to test different configurations\n", + "- Add additional training data to improve both analyzers\n", + "- Use this pattern to create A/B testing scenarios\n", + "- Explore other notebooks:\n", + " - [analyzer_training.ipynb](./analyzer_training.ipynb) - Create analyzers with training data\n", + " - [field_extraction.ipynb](./field_extraction.ipynb) - Extract fields from documents\n", + " - [management.ipynb](./management.ipynb) - Manage analyzer lifecycle" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 50557e6b4d690d9d3d3243f5a0bed2b478bfd1cc Mon Sep 17 00:00:00 2001 From: Joe Filcik Date: Thu, 23 Oct 2025 15:00:30 -0400 Subject: [PATCH 2/8] removing extra analyzer list and add clear note to add analyzer name --- 
.../move_training_data_across_analyzers.ipynb | 2051 +---------------- 1 file changed, 9 insertions(+), 2042 deletions(-) diff --git a/notebooks/move_training_data_across_analyzers.ipynb b/notebooks/move_training_data_across_analyzers.ipynb index 4117155..6bc6195 100644 --- a/notebooks/move_training_data_across_analyzers.ipynb +++ b/notebooks/move_training_data_across_analyzers.ipynb @@ -246,2045 +246,10 @@ }, { "cell_type": "code", - "execution_count": 7, - "id": "fcbc218a", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Found 675 analyzer(s) in your resource\n", - "\n", - "Available analyzers:\n", - "1. ID: prebuilt-audioAnalyzer\n", - " Name: N/A\n", - "\n", - "2. ID: prebuilt-callCenter\n", - " Name: N/A\n", - "\n", - "3. ID: prebuilt-contract\n", - " Name: N/A\n", - "\n", - "4. ID: prebuilt-documentAnalyzer\n", - " Name: N/A\n", - "\n", - "5. ID: prebuilt-imageAnalyzer\n", - " Name: N/A\n", - "\n", - "6. ID: prebuilt-invoice\n", - " Name: N/A\n", - "\n", - "7. ID: prebuilt-videoAnalyzer\n", - " Name: N/A\n", - "\n", - "8. ID: 123\n", - " Name: N/A\n", - "\n", - "9. ID: Test-description\n", - " Name: N/A\n", - "\n", - "10. ID: Test\n", - " Name: N/A\n", - "\n", - "11. ID: abc\n", - " Name: N/A\n", - "\n", - "12. ID: audio-250808\n", - " Name: N/A\n", - "\n", - "13. ID: auto-highlight-analyzer-1753389013\n", - " Name: N/A\n", - "\n", - "14. ID: auto-highlight-analyzer-1753393121\n", - " Name: N/A\n", - "\n", - "15. ID: auto-highlight-analyzer-1753727044\n", - " Name: N/A\n", - "\n", - "16. ID: auto-highlight-analyzer-1753728638\n", - " Name: N/A\n", - "\n", - "17. ID: auto-highlight-analyzer-1753822646\n", - " Name: N/A\n", - "\n", - "18. ID: auto-highlight-analyzer-1753823934\n", - " Name: N/A\n", - "\n", - "19. ID: auto-highlight-analyzer-1753826664\n", - " Name: N/A\n", - "\n", - "20. ID: auto-highlight-analyzer-1753829625\n", - " Name: N/A\n", - "\n", - "21. 
ID: auto-highlight-analyzer-1754935354\n", - " Name: N/A\n", - "\n", - "22. ID: auto-labeling-model-1748319168608-457\n", - " Name: N/A\n", - "\n", - "23. ID: auto-labeling-model-1748343190922-522\n", - " Name: N/A\n", - "\n", - "24. ID: auto-labeling-model-1748343844913-193\n", - " Name: N/A\n", - "\n", - "25. ID: auto-labeling-model-1748364582299-194\n", - " Name: N/A\n", - "\n", - "26. ID: auto-labeling-model-1748364610998-174\n", - " Name: N/A\n", - "\n", - "27. ID: auto-labeling-model-1748364627905-392\n", - " Name: N/A\n", - "\n", - "28. ID: auto-labeling-model-1748364882995-331\n", - " Name: N/A\n", - "\n", - "29. ID: auto-labeling-model-1748365809345-194\n", - " Name: N/A\n", - "\n", - "30. ID: auto-labeling-model-1748365844597-722\n", - " Name: N/A\n", - "\n", - "31. ID: auto-labeling-model-1748369310664-291\n", - " Name: N/A\n", - "\n", - "32. ID: auto-labeling-model-1748382666104-108\n", - " Name: N/A\n", - "\n", - "33. ID: auto-labeling-model-1748398666237-678\n", - " Name: N/A\n", - "\n", - "34. ID: auto-labeling-model-1748406169100-153\n", - " Name: N/A\n", - "\n", - "35. ID: auto-labeling-model-1748487450682-652\n", - " Name: N/A\n", - "\n", - "36. ID: auto-labeling-model-1748490709500-887\n", - " Name: N/A\n", - "\n", - "37. ID: auto-labeling-model-1748524957609-245\n", - " Name: N/A\n", - "\n", - "38. ID: auto-labeling-model-1748525150770-437\n", - " Name: N/A\n", - "\n", - "39. ID: auto-labeling-model-1748527146405-802\n", - " Name: N/A\n", - "\n", - "40. ID: auto-labeling-model-1748532349641-24\n", - " Name: N/A\n", - "\n", - "41. ID: auto-labeling-model-1748652707721-341\n", - " Name: N/A\n", - "\n", - "42. ID: auto-labeling-model-1748652848103-155\n", - " Name: N/A\n", - "\n", - "43. ID: auto-labeling-model-1748839949920-863\n", - " Name: N/A\n", - "\n", - "44. ID: auto-labeling-model-1748845791989-716\n", - " Name: N/A\n", - "\n", - "45. ID: auto-labeling-model-1748845807869-415\n", - " Name: N/A\n", - "\n", - "46. 
ID: auto-labeling-model-1748907891703-517\n", - " Name: N/A\n", - "\n", - "47. ID: auto-labeling-model-1748908692967-569\n", - " Name: N/A\n", - "\n", - "48. ID: auto-labeling-model-1748914058095-616\n", - " Name: N/A\n", - "\n", - "49. ID: auto-labeling-model-1748936065478-291\n", - " Name: N/A\n", - "\n", - "50. ID: auto-labeling-model-1748936271674-552\n", - " Name: N/A\n", - "\n", - "51. ID: auto-labeling-model-1748936490686-646\n", - " Name: N/A\n", - "\n", - "52. ID: auto-labeling-model-1748937447139-653\n", - " Name: N/A\n", - "\n", - "53. ID: auto-labeling-model-1748940860399-529\n", - " Name: N/A\n", - "\n", - "54. ID: auto-labeling-model-1748941320548-161\n", - " Name: N/A\n", - "\n", - "55. ID: auto-labeling-model-1748941816737-4\n", - " Name: N/A\n", - "\n", - "56. ID: auto-labeling-model-1748942668260-584\n", - " Name: N/A\n", - "\n", - "57. ID: auto-labeling-model-1748942752946-240\n", - " Name: N/A\n", - "\n", - "58. ID: auto-labeling-model-1748943751138-585\n", - " Name: N/A\n", - "\n", - "59. ID: auto-labeling-model-1748943869439-730\n", - " Name: N/A\n", - "\n", - "60. ID: auto-labeling-model-1748944505181-366\n", - " Name: N/A\n", - "\n", - "61. ID: auto-labeling-model-1748945194482-115\n", - " Name: N/A\n", - "\n", - "62. ID: auto-labeling-model-1749003326198-992\n", - " Name: N/A\n", - "\n", - "63. ID: auto-labeling-model-1749023590022-874\n", - " Name: N/A\n", - "\n", - "64. ID: auto-labeling-model-1749023636121-927\n", - " Name: N/A\n", - "\n", - "65. ID: auto-labeling-model-1749023850993-339\n", - " Name: N/A\n", - "\n", - "66. ID: auto-labeling-model-1749023887009-843\n", - " Name: N/A\n", - "\n", - "67. ID: auto-labeling-model-1749023901480-881\n", - " Name: N/A\n", - "\n", - "68. ID: auto-labeling-model-1749023933378-529\n", - " Name: N/A\n", - "\n", - "69. ID: auto-labeling-model-1749024617342-607\n", - " Name: N/A\n", - "\n", - "70. ID: auto-labeling-model-1749024650401-862\n", - " Name: N/A\n", - "\n", - "71. 
ID: auto-labeling-model-1749095665011-257\n", - " Name: N/A\n", - "\n", - "72. ID: auto-labeling-model-1749096929213-707\n", - " Name: N/A\n", - "\n", - "73. ID: auto-labeling-model-1749104361550-221\n", - " Name: N/A\n", - "\n", - "74. ID: auto-labeling-model-1749104922387-882\n", - " Name: N/A\n", - "\n", - "75. ID: auto-labeling-model-1749105026574-367\n", - " Name: N/A\n", - "\n", - "76. ID: auto-labeling-model-1749251965833-403\n", - " Name: N/A\n", - "\n", - "77. ID: auto-labeling-model-1749254053334-357\n", - " Name: N/A\n", - "\n", - "78. ID: auto-labeling-model-1749311286700-369\n", - " Name: N/A\n", - "\n", - "79. ID: auto-labeling-model-1749509842310-370\n", - " Name: N/A\n", - "\n", - "80. ID: auto-labeling-model-1749520600099-409\n", - " Name: N/A\n", - "\n", - "81. ID: auto-labeling-model-1749522784982-438\n", - " Name: N/A\n", - "\n", - "82. ID: auto-labeling-model-1749535466854-401\n", - " Name: N/A\n", - "\n", - "83. ID: auto-labeling-model-1749581796990-277\n", - " Name: N/A\n", - "\n", - "84. ID: auto-labeling-model-1749581836897-138\n", - " Name: N/A\n", - "\n", - "85. ID: auto-labeling-model-1749584140873-572\n", - " Name: N/A\n", - "\n", - "86. ID: auto-labeling-model-1749585959231-24\n", - " Name: N/A\n", - "\n", - "87. ID: auto-labeling-model-1749604604536-674\n", - " Name: N/A\n", - "\n", - "88. ID: auto-labeling-model-1749620902726-984\n", - " Name: N/A\n", - "\n", - "89. ID: auto-labeling-model-1749626687259-809\n", - " Name: N/A\n", - "\n", - "90. ID: auto-labeling-model-1749627602312-979\n", - " Name: N/A\n", - "\n", - "91. ID: auto-labeling-model-1749630601186-689\n", - " Name: N/A\n", - "\n", - "92. ID: auto-labeling-model-1749631339251-319\n", - " Name: N/A\n", - "\n", - "93. ID: auto-labeling-model-1749631742974-733\n", - " Name: N/A\n", - "\n", - "94. ID: auto-labeling-model-1749631891328-309\n", - " Name: N/A\n", - "\n", - "95. ID: auto-labeling-model-1749696702275-545\n", - " Name: N/A\n", - "\n", - "96. 
ID: auto-labeling-model-1749758278394-240\n", - " Name: N/A\n", - "\n", - "97. ID: auto-labeling-model-1749758517784-660\n", - " Name: N/A\n", - "\n", - "98. ID: auto-labeling-model-1749758533104-929\n", - " Name: N/A\n", - "\n", - "99. ID: auto-labeling-model-1749758555087-116\n", - " Name: N/A\n", - "\n", - "100. ID: auto-labeling-model-1749759432793-891\n", - " Name: N/A\n", - "\n", - "101. ID: auto-labeling-model-1749768746704-802\n", - " Name: N/A\n", - "\n", - "102. ID: auto-labeling-model-1749775305589-256\n", - " Name: N/A\n", - "\n", - "103. ID: auto-labeling-model-1749802761164-406\n", - " Name: N/A\n", - "\n", - "104. ID: auto-labeling-model-1749956497322-594\n", - " Name: N/A\n", - "\n", - "105. ID: auto-labeling-model-1749960177654-514\n", - " Name: N/A\n", - "\n", - "106. ID: auto-labeling-model-1749961833034-154\n", - " Name: N/A\n", - "\n", - "107. ID: auto-labeling-model-1749962138214-21\n", - " Name: N/A\n", - "\n", - "108. ID: auto-labeling-model-1750045513862-445\n", - " Name: N/A\n", - "\n", - "109. ID: auto-labeling-model-1750108497453-922\n", - " Name: N/A\n", - "\n", - "110. ID: auto-labeling-model-1750123214932-968\n", - " Name: N/A\n", - "\n", - "111. ID: auto-labeling-model-1750128770286-412\n", - " Name: N/A\n", - "\n", - "112. ID: auto-labeling-model-1750128888980-243\n", - " Name: N/A\n", - "\n", - "113. ID: auto-labeling-model-1750141234245-231\n", - " Name: N/A\n", - "\n", - "114. ID: auto-labeling-model-1750145695285-480\n", - " Name: N/A\n", - "\n", - "115. ID: auto-labeling-model-1750211643719-379\n", - " Name: N/A\n", - "\n", - "116. ID: auto-labeling-model-1750233198991-694\n", - " Name: N/A\n", - "\n", - "117. ID: auto-labeling-model-1750241272780-2\n", - " Name: N/A\n", - "\n", - "118. ID: auto-labeling-model-1750279157596-35\n", - " Name: N/A\n", - "\n", - "119. ID: auto-labeling-model-1750291999953-91\n", - " Name: N/A\n", - "\n", - "120. ID: auto-labeling-model-1750292632586-625\n", - " Name: N/A\n", - "\n", - "121. 
ID: auto-labeling-model-1750312049582-59\n", - " Name: N/A\n", - "\n", - "122. ID: auto-labeling-model-1750312573420-578\n", - " Name: N/A\n", - "\n", - "123. ID: auto-labeling-model-1750376726735-970\n", - " Name: N/A\n", - "\n", - "124. ID: auto-labeling-model-1750377427038-364\n", - " Name: N/A\n", - "\n", - "125. ID: auto-labeling-model-1750385575232-897\n", - " Name: N/A\n", - "\n", - "126. ID: auto-labeling-model-1750403576185-741\n", - " Name: N/A\n", - "\n", - "127. ID: auto-labeling-model-1750404809435-451\n", - " Name: N/A\n", - "\n", - "128. ID: auto-labeling-model-1750405070052-89\n", - " Name: N/A\n", - "\n", - "129. ID: auto-labeling-model-1750405091355-763\n", - " Name: N/A\n", - "\n", - "130. ID: auto-labeling-model-1750417420016-430\n", - " Name: N/A\n", - "\n", - "131. ID: auto-labeling-model-1750659725597-788\n", - " Name: N/A\n", - "\n", - "132. ID: auto-labeling-model-1750659733517-772\n", - " Name: N/A\n", - "\n", - "133. ID: auto-labeling-model-1750659761722-251\n", - " Name: N/A\n", - "\n", - "134. ID: auto-labeling-model-1750659784566-101\n", - " Name: N/A\n", - "\n", - "135. ID: auto-labeling-model-1750659903607-108\n", - " Name: N/A\n", - "\n", - "136. ID: auto-labeling-model-1750659933637-141\n", - " Name: N/A\n", - "\n", - "137. ID: auto-labeling-model-1750659945217-945\n", - " Name: N/A\n", - "\n", - "138. ID: auto-labeling-model-1750660650963-739\n", - " Name: N/A\n", - "\n", - "139. ID: auto-labeling-model-1750660824597-923\n", - " Name: N/A\n", - "\n", - "140. ID: auto-labeling-model-1750663207559-512\n", - " Name: N/A\n", - "\n", - "141. ID: auto-labeling-model-1750663259510-796\n", - " Name: N/A\n", - "\n", - "142. ID: auto-labeling-model-1750663303432-581\n", - " Name: N/A\n", - "\n", - "143. ID: auto-labeling-model-1750663377213-340\n", - " Name: N/A\n", - "\n", - "144. ID: auto-labeling-model-1750663393108-597\n", - " Name: N/A\n", - "\n", - "145. ID: auto-labeling-model-1750664456347-683\n", - " Name: N/A\n", - "\n", - "146. 
ID: auto-labeling-model-1750664605893-618\n", - " Name: N/A\n", - "\n", - "147. ID: auto-labeling-model-1750665355708-8\n", - " Name: N/A\n", - "\n", - "148. ID: auto-labeling-model-1750673318125-535\n", - " Name: N/A\n", - "\n", - "149. ID: auto-labeling-model-1750673331433-642\n", - " Name: N/A\n", - "\n", - "150. ID: auto-labeling-model-1750709349430-630\n", - " Name: N/A\n", - "\n", - "151. ID: auto-labeling-model-1750719511542-531\n", - " Name: N/A\n", - "\n", - "152. ID: auto-labeling-model-1750744047556-446\n", - " Name: N/A\n", - "\n", - "153. ID: auto-labeling-model-1750755510472-120\n", - " Name: N/A\n", - "\n", - "154. ID: auto-labeling-model-1750784814399-27\n", - " Name: N/A\n", - "\n", - "155. ID: auto-labeling-model-1750788356545-200\n", - " Name: N/A\n", - "\n", - "156. ID: auto-labeling-model-1750789921864-730\n", - " Name: N/A\n", - "\n", - "157. ID: auto-labeling-model-1750836585070-913\n", - " Name: N/A\n", - "\n", - "158. ID: auto-labeling-model-1750842588854-962\n", - " Name: N/A\n", - "\n", - "159. ID: auto-labeling-model-1750842831795-314\n", - " Name: N/A\n", - "\n", - "160. ID: auto-labeling-model-1750842897183-394\n", - " Name: N/A\n", - "\n", - "161. ID: auto-labeling-model-1750842978258-136\n", - " Name: N/A\n", - "\n", - "162. ID: auto-labeling-model-1750843282949-512\n", - " Name: N/A\n", - "\n", - "163. ID: auto-labeling-model-1750843704909-216\n", - " Name: N/A\n", - "\n", - "164. ID: auto-labeling-model-1750843908445-174\n", - " Name: N/A\n", - "\n", - "165. ID: auto-labeling-model-1750844014408-330\n", - " Name: N/A\n", - "\n", - "166. ID: auto-labeling-model-1750844234138-988\n", - " Name: N/A\n", - "\n", - "167. ID: auto-labeling-model-1750844709672-320\n", - " Name: N/A\n", - "\n", - "168. ID: auto-labeling-model-1750845307517-940\n", - " Name: N/A\n", - "\n", - "169. ID: auto-labeling-model-1750846220484-837\n", - " Name: N/A\n", - "\n", - "170. ID: auto-labeling-model-1750846255005-395\n", - " Name: N/A\n", - "\n", - "171. 
ID: auto-labeling-model-1750847433984-311\n", - " Name: N/A\n", - "\n", - "172. ID: auto-labeling-model-1750853034834-460\n", - " Name: N/A\n", - "\n", - "173. ID: auto-labeling-model-1750919114419-408\n", - " Name: N/A\n", - "\n", - "174. ID: auto-labeling-model-1750920179010-279\n", - " Name: N/A\n", - "\n", - "175. ID: auto-labeling-model-1750920218343-518\n", - " Name: N/A\n", - "\n", - "176. ID: auto-labeling-model-1750920298701-557\n", - " Name: N/A\n", - "\n", - "177. ID: auto-labeling-model-1750920352617-62\n", - " Name: N/A\n", - "\n", - "178. ID: auto-labeling-model-1751052501474-178\n", - " Name: N/A\n", - "\n", - "179. ID: auto-labeling-model-1751069615217-264\n", - " Name: N/A\n", - "\n", - "180. ID: auto-labeling-model-1751270970103-549\n", - " Name: N/A\n", - "\n", - "181. ID: auto-labeling-model-1751272499140-268\n", - " Name: N/A\n", - "\n", - "182. ID: auto-labeling-model-1751272544250-613\n", - " Name: N/A\n", - "\n", - "183. ID: auto-labeling-model-1751273787498-265\n", - " Name: N/A\n", - "\n", - "184. ID: auto-labeling-model-1751273849331-220\n", - " Name: N/A\n", - "\n", - "185. ID: auto-labeling-model-1751273904647-201\n", - " Name: N/A\n", - "\n", - "186. ID: auto-labeling-model-1751273937246-448\n", - " Name: N/A\n", - "\n", - "187. ID: auto-labeling-model-1751273983364-401\n", - " Name: N/A\n", - "\n", - "188. ID: auto-labeling-model-1751336918679-904\n", - " Name: N/A\n", - "\n", - "189. ID: auto-labeling-model-1751349360361-963\n", - " Name: N/A\n", - "\n", - "190. ID: auto-labeling-model-1751427888199-459\n", - " Name: N/A\n", - "\n", - "191. ID: auto-labeling-model-1751427891721-940\n", - " Name: N/A\n", - "\n", - "192. ID: auto-labeling-model-1751441608096-967\n", - " Name: N/A\n", - "\n", - "193. ID: auto-labeling-model-1751441662962-402\n", - " Name: N/A\n", - "\n", - "194. ID: auto-labeling-model-1751444577624-169\n", - " Name: N/A\n", - "\n", - "195. ID: auto-labeling-model-1751446425406-566\n", - " Name: N/A\n", - "\n", - "196. 
ID: auto-labeling-model-1751446744627-904\n", - " Name: N/A\n", - "\n", - "197. ID: auto-labeling-model-1751447069922-153\n", - " Name: N/A\n", - "\n", - "198. ID: auto-labeling-model-1751447126141-210\n", - " Name: N/A\n", - "\n", - "199. ID: auto-labeling-model-1751450223362-323\n", - " Name: N/A\n", - "\n", - "200. ID: auto-labeling-model-1751619901375-912\n", - " Name: N/A\n", - "\n", - "201. ID: auto-labeling-model-1751621939880-824\n", - " Name: N/A\n", - "\n", - "202. ID: auto-labeling-model-1751622003371-912\n", - " Name: N/A\n", - "\n", - "203. ID: auto-labeling-model-1751622246359-22\n", - " Name: N/A\n", - "\n", - "204. ID: auto-labeling-model-1751622337847-185\n", - " Name: N/A\n", - "\n", - "205. ID: auto-labeling-model-1751630796222-228\n", - " Name: N/A\n", - "\n", - "206. ID: auto-labeling-model-1751630815948-351\n", - " Name: N/A\n", - "\n", - "207. ID: auto-labeling-model-1751998528557-924\n", - " Name: N/A\n", - "\n", - "208. ID: auto-labeling-model-1752025809239-846\n", - " Name: N/A\n", - "\n", - "209. ID: auto-labeling-model-1752034702114-180\n", - " Name: N/A\n", - "\n", - "210. ID: auto-labeling-model-1752098586840-747\n", - " Name: N/A\n", - "\n", - "211. ID: auto-labeling-model-1752180782600-490\n", - " Name: N/A\n", - "\n", - "212. ID: auto-labeling-model-1752271117113-156\n", - " Name: N/A\n", - "\n", - "213. ID: auto-labeling-model-1752523653762-595\n", - " Name: N/A\n", - "\n", - "214. ID: auto-labeling-model-1752600290738-67\n", - " Name: N/A\n", - "\n", - "215. ID: auto-labeling-model-1752625416686-81\n", - " Name: N/A\n", - "\n", - "216. ID: auto-labeling-model-1752625871649-767\n", - " Name: N/A\n", - "\n", - "217. ID: auto-labeling-model-1752693120005-346\n", - " Name: N/A\n", - "\n", - "218. ID: auto-labeling-model-1752697569506-376\n", - " Name: N/A\n", - "\n", - "219. ID: auto-labeling-model-1752697610504-950\n", - " Name: N/A\n", - "\n", - "220. ID: auto-labeling-model-1752700740555-590\n", - " Name: N/A\n", - "\n", - "221. 
ID: auto-labeling-model-1752708687132-939\n", - " Name: N/A\n", - "\n", - "222. ID: auto-labeling-model-1752741732428-578\n", - " Name: N/A\n", - "\n", - "223. ID: auto-labeling-model-1752780032715-66\n", - " Name: N/A\n", - "\n", - "224. ID: auto-labeling-model-1752780325289-573\n", - " Name: N/A\n", - "\n", - "225. ID: auto-labeling-model-1752795955082-603\n", - " Name: N/A\n", - "\n", - "226. ID: auto-labeling-model-1752796753555-462\n", - " Name: N/A\n", - "\n", - "227. ID: auto-labeling-model-1752797239305-251\n", - " Name: N/A\n", - "\n", - "228. ID: auto-labeling-model-1752800932971-876\n", - " Name: N/A\n", - "\n", - "229. ID: auto-labeling-model-1752803086727-971\n", - " Name: N/A\n", - "\n", - "230. ID: auto-labeling-model-1752803985621-193\n", - " Name: N/A\n", - "\n", - "231. ID: auto-labeling-model-1752806777300-862\n", - " Name: N/A\n", - "\n", - "232. ID: auto-labeling-model-1752884829621-441\n", - " Name: N/A\n", - "\n", - "233. ID: auto-labeling-model-1753083025779-103\n", - " Name: N/A\n", - "\n", - "234. ID: auto-labeling-model-1753083077531-666\n", - " Name: N/A\n", - "\n", - "235. ID: auto-labeling-model-1753083850816-29\n", - " Name: N/A\n", - "\n", - "236. ID: auto-labeling-model-1753083864041-58\n", - " Name: N/A\n", - "\n", - "237. ID: auto-labeling-model-1753086883459-951\n", - " Name: N/A\n", - "\n", - "238. ID: auto-labeling-model-1753089079279-222\n", - " Name: N/A\n", - "\n", - "239. ID: auto-labeling-model-1753150531096-410\n", - " Name: N/A\n", - "\n", - "240. ID: auto-labeling-model-1753151865515-394\n", - " Name: N/A\n", - "\n", - "241. ID: auto-labeling-model-1753168395318-507\n", - " Name: N/A\n", - "\n", - "242. ID: auto-labeling-model-1753169409334-912\n", - " Name: N/A\n", - "\n", - "243. ID: auto-labeling-model-1753173597967-303\n", - " Name: N/A\n", - "\n", - "244. ID: auto-labeling-model-1753177537439-711\n", - " Name: N/A\n", - "\n", - "245. ID: auto-labeling-model-1753205662320-583\n", - " Name: N/A\n", - "\n", - "246. 
ID: auto-labeling-model-1753207022483-913\n", - " Name: N/A\n", - "\n", - "247. ID: auto-labeling-model-1753207579262-276\n", - " Name: N/A\n", - "\n", - "248. ID: auto-labeling-model-1753208672240-981\n", - " Name: N/A\n", - "\n", - "249. ID: auto-labeling-model-1753209156822-298\n", - " Name: N/A\n", - "\n", - "250. ID: auto-labeling-model-1753209981617-818\n", - " Name: N/A\n", - "\n", - "251. ID: auto-labeling-model-1753236316137-300\n", - " Name: N/A\n", - "\n", - "252. ID: auto-labeling-model-1753237512820-249\n", - " Name: N/A\n", - "\n", - "253. ID: auto-labeling-model-1753250369127-625\n", - " Name: N/A\n", - "\n", - "254. ID: auto-labeling-model-1753255567341-610\n", - " Name: N/A\n", - "\n", - "255. ID: auto-labeling-model-1753259092944-226\n", - " Name: N/A\n", - "\n", - "256. ID: auto-labeling-model-1753287197755-783\n", - " Name: N/A\n", - "\n", - "257. ID: auto-labeling-model-1753321650913-823\n", - " Name: N/A\n", - "\n", - "258. ID: auto-labeling-model-1753325891996-80\n", - " Name: N/A\n", - "\n", - "259. ID: auto-labeling-model-1753334968241-706\n", - " Name: N/A\n", - "\n", - "260. ID: auto-labeling-model-1753335132165-512\n", - " Name: N/A\n", - "\n", - "261. ID: auto-labeling-model-1753335555914-390\n", - " Name: N/A\n", - "\n", - "262. ID: auto-labeling-model-1753335697157-843\n", - " Name: N/A\n", - "\n", - "263. ID: auto-labeling-model-1753340903345-139\n", - " Name: N/A\n", - "\n", - "264. ID: auto-labeling-model-1753344102782-140\n", - " Name: N/A\n", - "\n", - "265. ID: auto-labeling-model-1753344491064-431\n", - " Name: N/A\n", - "\n", - "266. ID: auto-labeling-model-1753344947435-154\n", - " Name: N/A\n", - "\n", - "267. ID: auto-labeling-model-1753346772842-804\n", - " Name: N/A\n", - "\n", - "268. ID: auto-labeling-model-1753420107017-420\n", - " Name: N/A\n", - "\n", - "269. ID: auto-labeling-model-1753420466410-256\n", - " Name: N/A\n", - "\n", - "270. ID: auto-labeling-model-1753423049391-214\n", - " Name: N/A\n", - "\n", - "271. 
ID: auto-labeling-model-1753430316648-188\n", - " Name: N/A\n", - "\n", - "272. ID: auto-labeling-model-1753431705642-795\n", - " Name: N/A\n", - "\n", - "273. ID: auto-labeling-model-1753432653890-622\n", - " Name: N/A\n", - "\n", - "274. ID: auto-labeling-model-1753433164146-455\n", - " Name: N/A\n", - "\n", - "275. ID: auto-labeling-model-1753434806213-833\n", - " Name: N/A\n", - "\n", - "276. ID: auto-labeling-model-1753670824352-493\n", - " Name: N/A\n", - "\n", - "277. ID: auto-labeling-model-1753680640396-566\n", - " Name: N/A\n", - "\n", - "278. ID: auto-labeling-model-1753681888155-667\n", - " Name: N/A\n", - "\n", - "279. ID: auto-labeling-model-1753682254644-331\n", - " Name: N/A\n", - "\n", - "280. ID: auto-labeling-model-1753683583061-323\n", - " Name: N/A\n", - "\n", - "281. ID: auto-labeling-model-1753684547670-475\n", - " Name: N/A\n", - "\n", - "282. ID: auto-labeling-model-1753684784064-358\n", - " Name: N/A\n", - "\n", - "283. ID: auto-labeling-model-1753686206798-898\n", - " Name: N/A\n", - "\n", - "284. ID: auto-labeling-model-1753686800552-354\n", - " Name: N/A\n", - "\n", - "285. ID: auto-labeling-model-1753691313133-192\n", - " Name: N/A\n", - "\n", - "286. ID: auto-labeling-model-1753755468942-82\n", - " Name: N/A\n", - "\n", - "287. ID: auto-labeling-model-1753765727024-37\n", - " Name: N/A\n", - "\n", - "288. ID: auto-labeling-model-1753766046014-152\n", - " Name: N/A\n", - "\n", - "289. ID: auto-labeling-model-1753767335342-370\n", - " Name: N/A\n", - "\n", - "290. ID: auto-labeling-model-1753767338325-621\n", - " Name: N/A\n", - "\n", - "291. ID: auto-labeling-model-1753773699582-540\n", - " Name: N/A\n", - "\n", - "292. ID: auto-labeling-model-1753774470271-985\n", - " Name: N/A\n", - "\n", - "293. ID: auto-labeling-model-1753775949221-151\n", - " Name: N/A\n", - "\n", - "294. ID: auto-labeling-model-1753777245479-372\n", - " Name: N/A\n", - "\n", - "295. ID: auto-labeling-model-1753777925896-803\n", - " Name: N/A\n", - "\n", - "296. 
ID: auto-labeling-model-1753780557881-855\n", - " Name: N/A\n", - "\n", - "297. ID: auto-labeling-model-1753841121952-979\n", - " Name: N/A\n", - "\n", - "298. ID: auto-labeling-model-1753841981886-902\n", - " Name: N/A\n", - "\n", - "299. ID: auto-labeling-model-1753843376936-643\n", - " Name: N/A\n", - "\n", - "300. ID: auto-labeling-model-1753844211334-641\n", - " Name: N/A\n", - "\n", - "301. ID: auto-labeling-model-1753853033274-214\n", - " Name: N/A\n", - "\n", - "302. ID: auto-labeling-model-1753855251911-309\n", - " Name: N/A\n", - "\n", - "303. ID: auto-labeling-model-1753855551724-866\n", - " Name: N/A\n", - "\n", - "304. ID: auto-labeling-model-1753857116602-791\n", - " Name: N/A\n", - "\n", - "305. ID: auto-labeling-model-1753857268920-608\n", - " Name: N/A\n", - "\n", - "306. ID: auto-labeling-model-1753857820246-647\n", - " Name: N/A\n", - "\n", - "307. ID: auto-labeling-model-1753857865813-554\n", - " Name: N/A\n", - "\n", - "308. ID: auto-labeling-model-1753858369469-249\n", - " Name: N/A\n", - "\n", - "309. ID: auto-labeling-model-1753859412803-605\n", - " Name: N/A\n", - "\n", - "310. ID: auto-labeling-model-1753860904131-872\n", - " Name: N/A\n", - "\n", - "311. ID: auto-labeling-model-1753861167980-954\n", - " Name: N/A\n", - "\n", - "312. ID: auto-labeling-model-1753861799127-664\n", - " Name: N/A\n", - "\n", - "313. ID: auto-labeling-model-1753862553873-905\n", - " Name: N/A\n", - "\n", - "314. ID: auto-labeling-model-1753862814119-255\n", - " Name: N/A\n", - "\n", - "315. ID: auto-labeling-model-1753863784180-612\n", - " Name: N/A\n", - "\n", - "316. ID: auto-labeling-model-1753863994987-510\n", - " Name: N/A\n", - "\n", - "317. ID: auto-labeling-model-1753864084656-697\n", - " Name: N/A\n", - "\n", - "318. ID: auto-labeling-model-1753865255601-417\n", - " Name: N/A\n", - "\n", - "319. ID: auto-labeling-model-1753888993477-912\n", - " Name: N/A\n", - "\n", - "320. ID: auto-labeling-model-1753936473158-979\n", - " Name: N/A\n", - "\n", - "321. 
ID: auto-labeling-model-1753939417926-903\n", - " Name: N/A\n", - "\n", - "322. ID: auto-labeling-model-1753941090969-886\n", - " Name: N/A\n", - "\n", - "323. ID: auto-labeling-model-1753941295803-93\n", - " Name: N/A\n", - "\n", - "324. ID: auto-labeling-model-1753943808756-255\n", - " Name: N/A\n", - "\n", - "325. ID: auto-labeling-model-1754012684592-887\n", - " Name: N/A\n", - "\n", - "326. ID: auto-labeling-model-1754015881192-443\n", - " Name: N/A\n", - "\n", - "327. ID: auto-labeling-model-1754016406351-97\n", - " Name: N/A\n", - "\n", - "328. ID: auto-labeling-model-1754016977082-211\n", - " Name: N/A\n", - "\n", - "329. ID: auto-labeling-model-1754017707931-428\n", - " Name: N/A\n", - "\n", - "330. ID: auto-labeling-model-1754024495010-992\n", - " Name: N/A\n", - "\n", - "331. ID: auto-labeling-model-1754025560953-192\n", - " Name: N/A\n", - "\n", - "332. ID: auto-labeling-model-1754026435557-853\n", - " Name: N/A\n", - "\n", - "333. ID: auto-labeling-model-1754037940196-869\n", - " Name: N/A\n", - "\n", - "334. ID: auto-labeling-model-1754082032616-607\n", - " Name: N/A\n", - "\n", - "335. ID: auto-labeling-model-1754082215077-482\n", - " Name: N/A\n", - "\n", - "336. ID: auto-labeling-model-1754082332437-629\n", - " Name: N/A\n", - "\n", - "337. ID: auto-labeling-model-1754082479343-224\n", - " Name: N/A\n", - "\n", - "338. ID: auto-labeling-model-1754082536526-914\n", - " Name: N/A\n", - "\n", - "339. ID: auto-labeling-model-1754082630700-302\n", - " Name: N/A\n", - "\n", - "340. ID: auto-labeling-model-1754082725263-83\n", - " Name: N/A\n", - "\n", - "341. ID: auto-labeling-model-1754082811382-584\n", - " Name: N/A\n", - "\n", - "342. ID: auto-labeling-model-1754082998761-352\n", - " Name: N/A\n", - "\n", - "343. ID: auto-labeling-model-1754083046825-203\n", - " Name: N/A\n", - "\n", - "344. ID: auto-labeling-model-1754083150278-445\n", - " Name: N/A\n", - "\n", - "345. ID: auto-labeling-model-1754083462284-222\n", - " Name: N/A\n", - "\n", - "346. 
ID: auto-labeling-model-1754083621516-367\n", - " Name: N/A\n", - "\n", - "347. ID: auto-labeling-model-1754083719163-272\n", - " Name: N/A\n", - "\n", - "348. ID: auto-labeling-model-1754083866374-41\n", - " Name: N/A\n", - "\n", - "349. ID: auto-labeling-model-1754084032708-231\n", - " Name: N/A\n", - "\n", - "350. ID: auto-labeling-model-1754084406835-168\n", - " Name: N/A\n", - "\n", - "351. ID: auto-labeling-model-1754084472348-188\n", - " Name: N/A\n", - "\n", - "352. ID: auto-labeling-model-1754084575001-916\n", - " Name: N/A\n", - "\n", - "353. ID: auto-labeling-model-1754084884148-481\n", - " Name: N/A\n", - "\n", - "354. ID: auto-labeling-model-1754088680537-743\n", - " Name: N/A\n", - "\n", - "355. ID: auto-labeling-model-1754277589373-867\n", - " Name: N/A\n", - "\n", - "356. ID: auto-labeling-model-1754327062412-76\n", - " Name: N/A\n", - "\n", - "357. ID: auto-labeling-model-1754361872613-844\n", - " Name: N/A\n", - "\n", - "358. ID: auto-labeling-model-1754442934624-187\n", - " Name: N/A\n", - "\n", - "359. ID: auto-labeling-model-1754443219339-17\n", - " Name: N/A\n", - "\n", - "360. ID: auto-labeling-model-1754448125079-528\n", - " Name: N/A\n", - "\n", - "361. ID: auto-labeling-model-1754448200938-6\n", - " Name: N/A\n", - "\n", - "362. ID: auto-labeling-model-1754448830534-215\n", - " Name: N/A\n", - "\n", - "363. ID: auto-labeling-model-1754448901751-597\n", - " Name: N/A\n", - "\n", - "364. ID: auto-labeling-model-1754449038080-472\n", - " Name: N/A\n", - "\n", - "365. ID: auto-labeling-model-1754449135369-901\n", - " Name: N/A\n", - "\n", - "366. ID: auto-labeling-model-1754449150398-162\n", - " Name: N/A\n", - "\n", - "367. ID: auto-labeling-model-1754449206123-981\n", - " Name: N/A\n", - "\n", - "368. ID: auto-labeling-model-1754449280061-594\n", - " Name: N/A\n", - "\n", - "369. ID: auto-labeling-model-1754449347580-776\n", - " Name: N/A\n", - "\n", - "370. ID: auto-labeling-model-1754449538829-202\n", - " Name: N/A\n", - "\n", - "371. 
ID: auto-labeling-model-1754449608449-502\n", - " Name: N/A\n", - "\n", - "372. ID: auto-labeling-model-1754449678933-461\n", - " Name: N/A\n", - "\n", - "373. ID: auto-labeling-model-1754449747782-122\n", - " Name: N/A\n", - "\n", - "374. ID: auto-labeling-model-1754449819030-776\n", - " Name: N/A\n", - "\n", - "375. ID: auto-labeling-model-1754454485024-346\n", - " Name: N/A\n", - "\n", - "376. ID: auto-labeling-model-1754456633663-795\n", - " Name: N/A\n", - "\n", - "377. ID: auto-labeling-model-1754457369864-749\n", - " Name: N/A\n", - "\n", - "378. ID: auto-labeling-model-1754457591929-484\n", - " Name: N/A\n", - "\n", - "379. ID: auto-labeling-model-1754460230719-575\n", - " Name: N/A\n", - "\n", - "380. ID: auto-labeling-model-1754460479500-36\n", - " Name: N/A\n", - "\n", - "381. ID: auto-labeling-model-1754460640349-364\n", - " Name: N/A\n", - "\n", - "382. ID: auto-labeling-model-1754669409054-428\n", - " Name: N/A\n", - "\n", - "383. ID: auto-labeling-model-1754951212582-203\n", - " Name: N/A\n", - "\n", - "384. ID: auto-labeling-model-1754965260794-576\n", - " Name: N/A\n", - "\n", - "385. ID: auto-labeling-model-1754965331102-485\n", - " Name: N/A\n", - "\n", - "386. ID: auto-labeling-model-1754965445643-161\n", - " Name: N/A\n", - "\n", - "387. ID: auto-labeling-model-1754965630031-820\n", - " Name: N/A\n", - "\n", - "388. ID: auto-labeling-model-1754965704606-779\n", - " Name: N/A\n", - "\n", - "389. ID: auto-labeling-model-1754965767126-499\n", - " Name: N/A\n", - "\n", - "390. ID: auto-labeling-model-1754965926600-215\n", - " Name: N/A\n", - "\n", - "391. ID: auto-labeling-model-1754965996281-810\n", - " Name: N/A\n", - "\n", - "392. ID: auto-labeling-model-1754966073913-92\n", - " Name: N/A\n", - "\n", - "393. ID: auto-labeling-model-1754966208584-396\n", - " Name: N/A\n", - "\n", - "394. ID: auto-labeling-model-1754966287090-692\n", - " Name: N/A\n", - "\n", - "395. ID: auto-labeling-model-1754966553579-724\n", - " Name: N/A\n", - "\n", - "396. 
ID: auto-labeling-model-1754966634261-409\n", - " Name: N/A\n", - "\n", - "397. ID: auto-labeling-model-1754966703678-7\n", - " Name: N/A\n", - "\n", - "398. ID: auto-labeling-model-1754966778721-225\n", - " Name: N/A\n", - "\n", - "399. ID: auto-labeling-model-1754966848977-806\n", - " Name: N/A\n", - "\n", - "400. ID: auto-labeling-model-1754966934481-980\n", - " Name: N/A\n", - "\n", - "401. ID: auto-labeling-model-1754967006745-602\n", - " Name: N/A\n", - "\n", - "402. ID: auto-labeling-model-1754967080546-450\n", - " Name: N/A\n", - "\n", - "403. ID: auto-labeling-model-1754967570056-479\n", - " Name: N/A\n", - "\n", - "404. ID: auto-labeling-model-1754967665781-18\n", - " Name: N/A\n", - "\n", - "405. ID: auto-labeling-model-1754967737902-258\n", - " Name: N/A\n", - "\n", - "406. ID: auto-labeling-model-1754967809639-969\n", - " Name: N/A\n", - "\n", - "407. ID: auto-labeling-model-1754967879833-46\n", - " Name: N/A\n", - "\n", - "408. ID: auto-labeling-model-1754967953160-263\n", - " Name: N/A\n", - "\n", - "409. ID: auto-labeling-model-1754968036672-249\n", - " Name: N/A\n", - "\n", - "410. ID: auto-labeling-model-1754968110963-400\n", - " Name: N/A\n", - "\n", - "411. ID: auto-labeling-model-1754968179908-761\n", - " Name: N/A\n", - "\n", - "412. ID: auto-labeling-model-1754974913641-913\n", - " Name: N/A\n", - "\n", - "413. ID: auto-labeling-model-1754975127019-903\n", - " Name: N/A\n", - "\n", - "414. ID: auto-labeling-model-1754975368613-717\n", - " Name: N/A\n", - "\n", - "415. ID: auto-labeling-model-1754975432901-90\n", - " Name: N/A\n", - "\n", - "416. ID: auto-labeling-model-1754975454687-707\n", - " Name: N/A\n", - "\n", - "417. ID: auto-labeling-model-1754975527897-708\n", - " Name: N/A\n", - "\n", - "418. ID: auto-labeling-model-1754975600064-524\n", - " Name: N/A\n", - "\n", - "419. ID: auto-labeling-model-1754975711179-28\n", - " Name: N/A\n", - "\n", - "420. ID: auto-labeling-model-1754975967653-203\n", - " Name: N/A\n", - "\n", - "421. 
ID: auto-labeling-model-1754976038813-381\n", - " Name: N/A\n", - "\n", - "422. ID: auto-labeling-model-1754976117940-973\n", - " Name: N/A\n", - "\n", - "423. ID: auto-labeling-model-1754976193933-189\n", - " Name: N/A\n", - "\n", - "424. ID: auto-labeling-model-1754976293724-520\n", - " Name: N/A\n", - "\n", - "425. ID: auto-labeling-model-1754976368518-509\n", - " Name: N/A\n", - "\n", - "426. ID: auto-labeling-model-1754976437096-539\n", - " Name: N/A\n", - "\n", - "427. ID: auto-labeling-model-1754976513472-952\n", - " Name: N/A\n", - "\n", - "428. ID: auto-labeling-model-1754976754715-501\n", - " Name: N/A\n", - "\n", - "429. ID: auto-labeling-model-1754976904752-710\n", - " Name: N/A\n", - "\n", - "430. ID: auto-labeling-model-1754976976653-350\n", - " Name: N/A\n", - "\n", - "431. ID: auto-labeling-model-1754977052535-217\n", - " Name: N/A\n", - "\n", - "432. ID: auto-labeling-model-1754977121829-706\n", - " Name: N/A\n", - "\n", - "433. ID: auto-labeling-model-1754977217214-291\n", - " Name: N/A\n", - "\n", - "434. ID: auto-labeling-model-1754977287574-575\n", - " Name: N/A\n", - "\n", - "435. ID: auto-labeling-model-1754977360553-264\n", - " Name: N/A\n", - "\n", - "436. ID: auto-labeling-model-1754977435968-198\n", - " Name: N/A\n", - "\n", - "437. ID: auto-labeling-model-1754977508312-429\n", - " Name: N/A\n", - "\n", - "438. ID: auto-labeling-model-1754977588026-221\n", - " Name: N/A\n", - "\n", - "439. ID: auto-labeling-model-1754977663056-797\n", - " Name: N/A\n", - "\n", - "440. ID: auto-labeling-model-1754978589858-924\n", - " Name: N/A\n", - "\n", - "441. ID: auto-labeling-model-1754978799780-511\n", - " Name: N/A\n", - "\n", - "442. ID: auto-labeling-model-1754980148754-523\n", - " Name: N/A\n", - "\n", - "443. ID: auto-labeling-model-1754980966501-518\n", - " Name: N/A\n", - "\n", - "444. ID: auto-labeling-model-1754981828125-533\n", - " Name: N/A\n", - "\n", - "445. ID: auto-labeling-model-1754983426916-774\n", - " Name: N/A\n", - "\n", - "446. 
ID: auto-labeling-model-1754984348089-313\n", - " Name: N/A\n", - "\n", - "447. ID: auto-labeling-model-1754984423463-874\n", - " Name: N/A\n", - "\n", - "448. ID: auto-labeling-model-1754984499501-967\n", - " Name: N/A\n", - "\n", - "449. ID: auto-labeling-model-1754984577453-603\n", - " Name: N/A\n", - "\n", - "450. ID: auto-labeling-model-1754984673348-39\n", - " Name: N/A\n", - "\n", - "451. ID: auto-labeling-model-1754984745908-988\n", - " Name: N/A\n", - "\n", - "452. ID: auto-labeling-model-1754984844230-121\n", - " Name: N/A\n", - "\n", - "453. ID: auto-labeling-model-1754985031421-137\n", - " Name: N/A\n", - "\n", - "454. ID: auto-labeling-model-1754985052679-764\n", - " Name: N/A\n", - "\n", - "455. ID: auto-labeling-model-1754985230207-884\n", - " Name: N/A\n", - "\n", - "456. ID: auto-labeling-model-1754993665797-458\n", - " Name: N/A\n", - "\n", - "457. ID: auto-labeling-model-1754993775398-308\n", - " Name: N/A\n", - "\n", - "458. ID: auto-labeling-model-1755021430602-389\n", - " Name: N/A\n", - "\n", - "459. ID: auto-labeling-model-1755021530633-576\n", - " Name: N/A\n", - "\n", - "460. ID: auto-labeling-model-1755034509086-812\n", - " Name: N/A\n", - "\n", - "461. ID: auto-labeling-model-1755036680421-274\n", - " Name: N/A\n", - "\n", - "462. ID: auto-labeling-model-1755036840212-13\n", - " Name: N/A\n", - "\n", - "463. ID: auto-labeling-model-1755037123033-737\n", - " Name: N/A\n", - "\n", - "464. ID: auto-labeling-model-1755041702234-29\n", - " Name: N/A\n", - "\n", - "465. ID: auto-labeling-model-1755041716845-12\n", - " Name: N/A\n", - "\n", - "466. ID: auto-labeling-model-1755043090900-677\n", - " Name: N/A\n", - "\n", - "467. ID: auto-labeling-model-1755044191218-796\n", - " Name: N/A\n", - "\n", - "468. ID: auto-labeling-model-1755044423164-353\n", - " Name: N/A\n", - "\n", - "469. ID: auto-labeling-model-1755048701795-244\n", - " Name: N/A\n", - "\n", - "470. ID: auto-labeling-model-1755048719130-947\n", - " Name: N/A\n", - "\n", - "471. 
ID: auto-labeling-model-1755048825616-336\n", - " Name: N/A\n", - "\n", - "472. ID: auto-labeling-model-1755048863902-319\n", - " Name: N/A\n", - "\n", - "473. ID: auto-labeling-model-1755048975788-30\n", - " Name: N/A\n", - "\n", - "474. ID: auto-labeling-model-1755049161847-499\n", - " Name: N/A\n", - "\n", - "475. ID: auto-labeling-model-1755061734445-540\n", - " Name: N/A\n", - "\n", - "476. ID: auto-labeling-model-1755061987015-686\n", - " Name: N/A\n", - "\n", - "477. ID: auto-labeling-model-1755062318015-752\n", - " Name: N/A\n", - "\n", - "478. ID: auto-labeling-model-1755062966345-99\n", - " Name: N/A\n", - "\n", - "479. ID: auto-labeling-model-1755063315485-717\n", - " Name: N/A\n", - "\n", - "480. ID: auto-labeling-model-1755063386013-936\n", - " Name: N/A\n", - "\n", - "481. ID: auto-labeling-model-1755069455912-277\n", - " Name: N/A\n", - "\n", - "482. ID: auto-labeling-model-1755069553935-338\n", - " Name: N/A\n", - "\n", - "483. ID: auto-labeling-model-1755069702068-412\n", - " Name: N/A\n", - "\n", - "484. ID: auto-labeling-model-1755069842876-922\n", - " Name: N/A\n", - "\n", - "485. ID: auto-labeling-model-1755072279253-390\n", - " Name: N/A\n", - "\n", - "486. ID: auto-labeling-model-1755076709324-342\n", - " Name: N/A\n", - "\n", - "487. ID: auto-labeling-model-1755077617558-667\n", - " Name: N/A\n", - "\n", - "488. ID: auto-labeling-model-1755077873604-810\n", - " Name: N/A\n", - "\n", - "489. ID: auto-labeling-model-1755078021426-256\n", - " Name: N/A\n", - "\n", - "490. ID: auto-labeling-model-1755134767049-985\n", - " Name: N/A\n", - "\n", - "491. ID: auto-labeling-model-1755135457748-675\n", - " Name: N/A\n", - "\n", - "492. ID: auto-labeling-model-1755220299075-866\n", - " Name: N/A\n", - "\n", - "493. ID: auto-labeling-model-1755221919898-254\n", - " Name: N/A\n", - "\n", - "494. ID: auto-labeling-model-1755222009716-189\n", - " Name: N/A\n", - "\n", - "495. ID: auto-labeling-model-1755222110837-250\n", - " Name: N/A\n", - "\n", - "496. 
ID: auto-labeling-model-1755222196939-944\n", - " Name: N/A\n", - "\n", - "497. ID: auto-labeling-model-1755222580985-811\n", - " Name: N/A\n", - "\n", - "498. ID: auto-labeling-model-1755224344739-857\n", - " Name: N/A\n", - "\n", - "499. ID: auto-labeling-model-1755224418333-237\n", - " Name: N/A\n", - "\n", - "500. ID: auto-labeling-model-1755224501846-126\n", - " Name: N/A\n", - "\n", - "501. ID: auto-labeling-model-1755224573788-830\n", - " Name: N/A\n", - "\n", - "502. ID: auto-labeling-model-1755274111236-815\n", - " Name: N/A\n", - "\n", - "503. ID: auto-labeling-model-1755546385161-718\n", - " Name: N/A\n", - "\n", - "504. ID: auto-labeling-model-1755564859753-49\n", - " Name: N/A\n", - "\n", - "505. ID: auto-labeling-model-1755571891436-24\n", - " Name: N/A\n", - "\n", - "506. ID: auto-labeling-model-1755575417648-956\n", - " Name: N/A\n", - "\n", - "507. ID: auto-labeling-model-1755589868572-105\n", - " Name: N/A\n", - "\n", - "508. ID: auto-labeling-model-1755623887267-687\n", - " Name: N/A\n", - "\n", - "509. ID: auto-labeling-model-1755657602248-443\n", - " Name: N/A\n", - "\n", - "510. ID: auto-labeling-model-1755671136055-108\n", - " Name: N/A\n", - "\n", - "511. ID: auto-labeling-model-1755673245801-744\n", - " Name: N/A\n", - "\n", - "512. ID: auto-labeling-model-1755675180889-142\n", - " Name: N/A\n", - "\n", - "513. ID: auto-labeling-model-1755678446620-988\n", - " Name: N/A\n", - "\n", - "514. ID: auto-labeling-model-1755738759590-405\n", - " Name: N/A\n", - "\n", - "515. ID: auto-labeling-model-1755741941138-610\n", - " Name: N/A\n", - "\n", - "516. ID: auto-labeling-model-1755745805348-731\n", - " Name: N/A\n", - "\n", - "517. ID: auto-labeling-model-1755753976159-223\n", - " Name: N/A\n", - "\n", - "518. ID: auto-labeling-model-1755756092896-628\n", - " Name: N/A\n", - "\n", - "519. ID: auto-labeling-model-1755761289894-657\n", - " Name: N/A\n", - "\n", - "520. ID: auto-labeling-model-1755824923780-82\n", - " Name: N/A\n", - "\n", - "521. 
ID: auto-labeling-model-1755839089591-320\n", - " Name: N/A\n", - "\n", - "522. ID: auto-labeling-model-1755840078392-806\n", - " Name: N/A\n", - "\n", - "523. ID: auto-labeling-model-1755843001974-210\n", - " Name: N/A\n", - "\n", - "524. ID: auto-labeling-model-1755844906709-250\n", - " Name: N/A\n", - "\n", - "525. ID: auto-labeling-model-1755846971954-69\n", - " Name: N/A\n", - "\n", - "526. ID: auto-labeling-model-1755847550122-149\n", - " Name: N/A\n", - "\n", - "527. ID: auto-labeling-model-1755849254781-355\n", - " Name: N/A\n", - "\n", - "528. ID: auto-labeling-model-1755854539631-293\n", - " Name: N/A\n", - "\n", - "529. ID: auto-labeling-model-1756087002299-72\n", - " Name: N/A\n", - "\n", - "530. ID: auto-labeling-model-1756087565828-132\n", - " Name: N/A\n", - "\n", - "531. ID: auto-labeling-model-1756087680461-719\n", - " Name: N/A\n", - "\n", - "532. ID: auto-labeling-model-1756087819774-813\n", - " Name: N/A\n", - "\n", - "533. ID: auto-labeling-model-1756087867761-583\n", - " Name: N/A\n", - "\n", - "534. ID: auto-labeling-model-1756112514075-201\n", - " Name: N/A\n", - "\n", - "535. ID: auto-labeling-model-1756137207447-376\n", - " Name: N/A\n", - "\n", - "536. ID: auto-labeling-model-1756137492728-788\n", - " Name: N/A\n", - "\n", - "537. ID: auto-labeling-model-1756138904093-804\n", - " Name: N/A\n", - "\n", - "538. ID: auto-labeling-model-1756193938984-510\n", - " Name: N/A\n", - "\n", - "539. ID: auto-labeling-model-1756279382223-424\n", - " Name: N/A\n", - "\n", - "540. ID: auto-labeling-model-1756281178604-829\n", - " Name: N/A\n", - "\n", - "541. ID: auto-labeling-model-1756347012781-494\n", - " Name: N/A\n", - "\n", - "542. ID: auto-labeling-model-1756348972897-103\n", - " Name: N/A\n", - "\n", - "543. ID: auto-labeling-model-1756349422839-305\n", - " Name: N/A\n", - "\n", - "544. ID: auto-labeling-model-1756349498730-552\n", - " Name: N/A\n", - "\n", - "545. ID: auto-labeling-model-1756360413351-308\n", - " Name: N/A\n", - "\n", - "546. 
ID: auto-labeling-model-1756363959156-20\n", - " Name: N/A\n", - "\n", - "547. ID: auto-labeling-model-1756369801529-118\n", - " Name: N/A\n", - "\n", - "548. ID: auto-labeling-model-1756430598758-905\n", - " Name: N/A\n", - "\n", - "549. ID: auto-labeling-model-1756440760505-307\n", - " Name: N/A\n", - "\n", - "550. ID: auto-labeling-model-1756460100800-668\n", - " Name: N/A\n", - "\n", - "551. ID: auto-labeling-model-1756460110544-559\n", - " Name: N/A\n", - "\n", - "552. ID: auto-labeling-model-1756693820728-76\n", - " Name: N/A\n", - "\n", - "553. ID: auto-labeling-model-1756912886736-101\n", - " Name: N/A\n", - "\n", - "554. ID: auto-labeling-model-1757497814136-763\n", - " Name: N/A\n", - "\n", - "555. ID: auto-labeling-model-1757663204666-122\n", - " Name: N/A\n", - "\n", - "556. ID: auto-labeling-model-1757995180429-664\n", - " Name: N/A\n", - "\n", - "557. ID: auto-labeling-model-1758045209157-220\n", - " Name: N/A\n", - "\n", - "558. ID: auto-labeling-model-1758045343765-419\n", - " Name: N/A\n", - "\n", - "559. ID: auto-labeling-model-1758182652735-580\n", - " Name: N/A\n", - "\n", - "560. ID: auto-labeling-model-1758551942230-384\n", - " Name: N/A\n", - "\n", - "561. ID: auto-labeling-model-1758693093755-157\n", - " Name: N/A\n", - "\n", - "562. ID: auto-labeling-model-1758703215086-912\n", - " Name: N/A\n", - "\n", - "563. ID: auto-labeling-model-1758742506653-803\n", - " Name: N/A\n", - "\n", - "564. ID: auto-labeling-model-1758859363470-900\n", - " Name: N/A\n", - "\n", - "565. ID: auto-labeling-model-1758861498544-317\n", - " Name: N/A\n", - "\n", - "566. ID: auto-labeling-model-1759166848691-35\n", - " Name: N/A\n", - "\n", - "567. ID: auto-labeling-model-1759310043204-41\n", - " Name: N/A\n", - "\n", - "568. ID: auto-labeling-model-1759334232768-397\n", - " Name: N/A\n", - "\n", - "569. ID: auto-labeling-model-1759817160138-569\n", - " Name: N/A\n", - "\n", - "570. ID: auto-labeling-model-1759956977266-516\n", - " Name: N/A\n", - "\n", - "571. 
ID: auto-labeling-model-1760426322250-908\n", - " Name: N/A\n", - "\n", - "572. ID: auto-labeling-model-1760479341007-491\n", - " Name: N/A\n", - "\n", - "573. ID: auto-labeling-model-1760479492039-631\n", - " Name: N/A\n", - "\n", - "574. ID: auto-labeling-model-1760479645658-613\n", - " Name: N/A\n", - "\n", - "575. ID: auto-labeling-model-1760479761056-497\n", - " Name: N/A\n", - "\n", - "576. ID: auto-labeling-model-1760479780527-626\n", - " Name: N/A\n", - "\n", - "577. ID: auto-labeling-model-1760479932099-212\n", - " Name: N/A\n", - "\n", - "578. ID: auto-labeling-model-1760479949487-358\n", - " Name: N/A\n", - "\n", - "579. ID: auto-labeling-model-1760480389179-217\n", - " Name: N/A\n", - "\n", - "580. ID: auto-labeling-model-1760490988143-30\n", - " Name: N/A\n", - "\n", - "581. ID: auto-labeling-model-1760499252646-774\n", - " Name: N/A\n", - "\n", - "582. ID: auto-labeling-model-1760539714171-740\n", - " Name: N/A\n", - "\n", - "583. ID: auto-labeling-model-1760540221082-518\n", - " Name: N/A\n", - "\n", - "584. ID: auto-labeling-model-1760566206649-192\n", - " Name: N/A\n", - "\n", - "585. ID: auto-labeling-model-1760649718443-469\n", - " Name: N/A\n", - "\n", - "586. ID: auto-labeling-model-1760974810245-633\n", - " Name: N/A\n", - "\n", - "587. ID: auto-labeling-model-1761060285537-410\n", - " Name: N/A\n", - "\n", - "588. ID: auto-labeling-model-1761072595965-766\n", - " Name: N/A\n", - "\n", - "589. ID: auto-labeling-model-1761170322608-61\n", - " Name: N/A\n", - "\n", - "590. ID: auto-labeling-model-1761170507108-187\n", - " Name: N/A\n", - "\n", - "591. ID: cu-eox\n", - " Name: N/A\n", - "\n", - "592. ID: cu-test-2\n", - " Name: N/A\n", - "\n", - "593. ID: cu-test-3\n", - " Name: N/A\n", - "\n", - "594. ID: cu-test\n", - " Name: N/A\n", - "\n", - "595. ID: cu-test3\n", - " Name: N/A\n", - "\n", - "596. ID: cu-trainig-debug\n", - " Name: N/A\n", - "\n", - "597. ID: cx-deloitte-all-items-good\n", - " Name: N/A\n", - "\n", - "598. 
ID: cx-deloitte-all-items-idex\n", - " Name: N/A\n", - "\n", - "599. ID: cx-deloitte-all-items-keep-one-label\n", - " Name: N/A\n", - "\n", - "600. ID: cx-deloitte-all-items\n", - " Name: N/A\n", - "\n", - "601. ID: cx-deloitte\n", - " Name: N/A\n", - "\n", - "602. ID: default\n", - " Name: N/A\n", - "\n", - "603. ID: document-test\n", - " Name: N/A\n", - "\n", - "604. ID: example\n", - " Name: N/A\n", - "\n", - "605. ID: excel\n", - " Name: N/A\n", - "\n", - "606. ID: highlight-analyzer-03673070-1755304831\n", - " Name: N/A\n", - "\n", - "607. ID: highlight-analyzer-1755112569\n", - " Name: N/A\n", - "\n", - "608. ID: highlight-analyzer-1755113090\n", - " Name: N/A\n", - "\n", - "609. ID: highlight-analyzer-1755117427\n", - " Name: N/A\n", - "\n", - "610. ID: highlight-analyzer-1755127191\n", - " Name: N/A\n", - "\n", - "611. ID: highlight-analyzer-1755128917\n", - " Name: N/A\n", - "\n", - "612. ID: highlight-analyzer-1755204485\n", - " Name: N/A\n", - "\n", - "613. ID: highlight-analyzer-1755205148\n", - " Name: N/A\n", - "\n", - "614. ID: highlight-analyzer-1755304423\n", - " Name: N/A\n", - "\n", - "615. ID: highlight-analyzer-49453d78-1755304719\n", - " Name: N/A\n", - "\n", - "616. ID: insurance-test\n", - " Name: N/A\n", - "\n", - "617. ID: invoiceLabeledData\n", - " Name: N/A\n", - "\n", - "618. ID: joann-insurance\n", - " Name: N/A\n", - "\n", - "619. ID: joann-tryout-invoice\n", - " Name: N/A\n", - "\n", - "620. ID: k\n", - " Name: N/A\n", - "\n", - "621. ID: minus\n", - " Name: N/A\n", - "\n", - "622. ID: mySampleAnalyzer\n", - " Name: N/A\n", - "\n", - "623. ID: pro-test\n", - " Name: N/A\n", - "\n", - "624. ID: proExample\n", - " Name: N/A\n", - "\n", - "625. ID: sampleAnalyzer273626\n", - " Name: N/A\n", - "\n", - "626. ID: sampleAnalyzer530775\n", - " Name: N/A\n", - "\n", - "627. ID: sampleAnalyzer679281\n", - " Name: N/A\n", - "\n", - "628. ID: shihw-insurance-0529\n", - " Name: N/A\n", - "\n", - "629. 
ID: shihw-video-test0528\n", - " Name: N/A\n", - "\n", - "630. ID: soccer-highlights-analyzer-v1\n", - " Name: N/A\n", - "\n", - "631. ID: soccer-highlights-analyzer-v2\n", - " Name: N/A\n", - "\n", - "632. ID: soccer-highlights-analyzer-v3\n", - " Name: N/A\n", - "\n", - "633. ID: soccer-highlights-analyzer-v4\n", - " Name: N/A\n", - "\n", - "634. ID: soccer-highlights-analyzer-v5\n", - " Name: N/A\n", - "\n", - "635. ID: soccer-highlights-analyzer1751301403\n", - " Name: N/A\n", - "\n", - "636. ID: soccer-highlights-analyzer1751301722\n", - " Name: N/A\n", - "\n", - "637. ID: soccer-highlights-analyzer2-v1\n", - " Name: N/A\n", - "\n", - "638. ID: soccer-highlights-analyzer5314167881751302137\n", - " Name: N/A\n", - "\n", - "639. ID: soccer-highlights-analyzer5314167881751302581\n", - " Name: N/A\n", - "\n", - "640. ID: soccer-highlights-analyzer5314167881751303949\n", - " Name: N/A\n", - "\n", - "641. ID: soccer-highlights-analyzer5314167881751306147\n", - " Name: N/A\n", - "\n", - "642. ID: soccer-highlights-analyzer5314167881751313349\n", - " Name: N/A\n", - "\n", - "643. ID: soccer-highlights-analyzer5314167881755019232\n", - " Name: N/A\n", - "\n", - "644. ID: soccer-highlights-analyzer5314167881755020564\n", - " Name: N/A\n", - "\n", - "645. ID: soccer-highlights-analyzer5314167881755023993\n", - " Name: N/A\n", - "\n", - "646. ID: soccer-highlights-analyzer5314167881755029594\n", - " Name: N/A\n", - "\n", - "647. ID: soccer-highlights-analyzer54167881751301841\n", - " Name: N/A\n", - "\n", - "648. ID: t\n", - " Name: N/A\n", - "\n", - "649. ID: tes\n", - " Name: N/A\n", - "\n", - "650. ID: test-bar-gap\n", - " Name: N/A\n", - "\n", - "651. ID: test\n", - " Name: N/A\n", - "\n", - "652. ID: testMeow\n", - " Name: N/A\n", - "\n", - "653. ID: tingwliu-invoice-test\n", - " Name: N/A\n", - "\n", - "654. ID: video-250808\n", - " Name: N/A\n", - "\n", - "655. ID: video\n", - " Name: N/A\n", - "\n", - "656. ID: videotest\n", - " Name: N/A\n", - "\n", - "657. 
ID: yahch-contract-0805-1\n", - " Name: N/A\n", - "\n", - "658. ID: yahch-document-HKinvoice-label-2\n", - " Name: N/A\n", - "\n", - "659. ID: yahch-document-HKinvoice-label-3\n", - " Name: N/A\n", - "\n", - "660. ID: yahch-document-HKinvoice-label-4\n", - " Name: N/A\n", - "\n", - "661. ID: yahch-document-HKinvoice-label-5\n", - " Name: N/A\n", - "\n", - "662. ID: yahch-document-HKinvoice-label-6\n", - " Name: N/A\n", - "\n", - "663. ID: yahch-document-HKinvoice-label\n", - " Name: N/A\n", - "\n", - "664. ID: yahch-document-HKinvoice-local-1\n", - " Name: N/A\n", - "\n", - "665. ID: yahch-document-HKinvoice-local-icl-1\n", - " Name: N/A\n", - "\n", - "666. ID: yahch-document-HKinvoice-local-icl-2\n", - " Name: N/A\n", - "\n", - "667. ID: yahch-invoice-HKinvoice-local-icl-1\n", - " Name: N/A\n", - "\n", - "668. ID: yahch-invoice-HKinvoice-local-zeroshot-1\n", - " Name: N/A\n", - "\n", - "669. ID: yahch-invoice-HKinvoice-local-zeroshot-2\n", - " Name: N/A\n", - "\n", - "670. ID: yiyun\n", - " Name: N/A\n", - "\n", - "671. ID: yiyun1223\n", - " Name: N/A\n", - "\n", - "672. ID: yiyun3333\n", - " Name: N/A\n", - "\n", - "673. ID: yiyun65656\n", - " Name: N/A\n", - "\n", - "674. ID: yiyunPromode\n", - " Name: N/A\n", - "\n", - "675. 
ID: yslin-2025-06-25-generative-date-fields\n", - " Name: N/A\n", - "\n" - ] - } - ], + "execution_count": null, + "id": "fcbc218a", + "metadata": {}, + "outputs": [], "source": [ "# Get all analyzers in your resource\n", "all_analyzers = client.get_all_analyzers()\n", @@ -2322,7 +287,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "id": "9772b0f5", "metadata": {}, "outputs": [ @@ -2336,8 +301,10 @@ ], "source": [ "# OPTION 1: Specify an existing analyzer ID that has training data\n", - "# Replace this with your actual analyzer ID\n", - "SOURCE_ANALYZER_ID = \"invoiceLabeledData\"\n", + "\n", + "# ⚠️ REQUIRED: Replace \"MyAnalyzer\" with your actual analyzer ID from the list above\n", + "# You can find available analyzer IDs in the output of the previous cell\n", + "SOURCE_ANALYZER_ID = \"MyAnalyzer\" # ← CHANGE THIS!\n", "\n", "# Uncomment to use the first analyzer from the list\n", "# if analyzers_list:\n", From a5ca66de2ca250f1f027181b8f3416a750eecfad Mon Sep 17 00:00:00 2001 From: Joe Filcik Date: Thu, 23 Oct 2025 15:15:21 -0400 Subject: [PATCH 3/8] Improving readability --- .../move_training_data_across_analyzers.ipynb | 219 +----------------- 1 file changed, 5 insertions(+), 214 deletions(-) diff --git a/notebooks/move_training_data_across_analyzers.ipynb b/notebooks/move_training_data_across_analyzers.ipynb index 6bc6195..428d9db 100644 --- a/notebooks/move_training_data_across_analyzers.ipynb +++ b/notebooks/move_training_data_across_analyzers.ipynb @@ -27,80 +27,10 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "2f76b866", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Defaulting to user installation because normal site-packages is not writeable\n", - "Requirement already satisfied: aiohttp in /home/vscode/.local/lib/python3.11/site-packages (from -r ../requirements.txt (line 1)) (3.12.15)\n", - "Requirement already satisfied: 
azure-identity in /home/vscode/.local/lib/python3.11/site-packages (from -r ../requirements.txt (line 2)) (1.25.0)\n", - "Requirement already satisfied: azure-storage-blob in /home/vscode/.local/lib/python3.11/site-packages (from -r ../requirements.txt (line 3)) (12.26.0)\n", - "Requirement already satisfied: python-dotenv in /home/vscode/.local/lib/python3.11/site-packages (from -r ../requirements.txt (line 4)) (1.1.1)\n", - "Requirement already satisfied: requests in /home/vscode/.local/lib/python3.11/site-packages (from -r ../requirements.txt (line 5)) (2.32.5)\n", - "Requirement already satisfied: Pillow in /home/vscode/.local/lib/python3.11/site-packages (from -r ../requirements.txt (line 6)) (11.3.0)\n", - "Requirement already satisfied: aiohappyeyeballs>=2.5.0 in /home/vscode/.local/lib/python3.11/site-packages (from aiohttp->-r ../requirements.txt (line 1)) (2.6.1)\n", - "Requirement already satisfied: aiosignal>=1.4.0 in /home/vscode/.local/lib/python3.11/site-packages (from aiohttp->-r ../requirements.txt (line 1)) (1.4.0)\n", - "Requirement already satisfied: attrs>=17.3.0 in /home/vscode/.local/lib/python3.11/site-packages (from aiohttp->-r ../requirements.txt (line 1)) (25.3.0)\n", - "Requirement already satisfied: frozenlist>=1.1.1 in /home/vscode/.local/lib/python3.11/site-packages (from aiohttp->-r ../requirements.txt (line 1)) (1.7.0)\n", - "Requirement already satisfied: multidict<7.0,>=4.5 in /home/vscode/.local/lib/python3.11/site-packages (from aiohttp->-r ../requirements.txt (line 1)) (6.6.4)\n", - "Requirement already satisfied: propcache>=0.2.0 in /home/vscode/.local/lib/python3.11/site-packages (from aiohttp->-r ../requirements.txt (line 1)) (0.3.2)\n", - "Requirement already satisfied: yarl<2.0,>=1.17.0 in /home/vscode/.local/lib/python3.11/site-packages (from aiohttp->-r ../requirements.txt (line 1)) (1.20.1)\n", - "Requirement already satisfied: aiohttp in /home/vscode/.local/lib/python3.11/site-packages (from -r ../requirements.txt 
(line 1)) (3.12.15)\n", - "Requirement already satisfied: azure-identity in /home/vscode/.local/lib/python3.11/site-packages (from -r ../requirements.txt (line 2)) (1.25.0)\n", - "Requirement already satisfied: azure-storage-blob in /home/vscode/.local/lib/python3.11/site-packages (from -r ../requirements.txt (line 3)) (12.26.0)\n", - "Requirement already satisfied: python-dotenv in /home/vscode/.local/lib/python3.11/site-packages (from -r ../requirements.txt (line 4)) (1.1.1)\n", - "Requirement already satisfied: requests in /home/vscode/.local/lib/python3.11/site-packages (from -r ../requirements.txt (line 5)) (2.32.5)\n", - "Requirement already satisfied: Pillow in /home/vscode/.local/lib/python3.11/site-packages (from -r ../requirements.txt (line 6)) (11.3.0)\n", - "Requirement already satisfied: aiohappyeyeballs>=2.5.0 in /home/vscode/.local/lib/python3.11/site-packages (from aiohttp->-r ../requirements.txt (line 1)) (2.6.1)\n", - "Requirement already satisfied: aiosignal>=1.4.0 in /home/vscode/.local/lib/python3.11/site-packages (from aiohttp->-r ../requirements.txt (line 1)) (1.4.0)\n", - "Requirement already satisfied: attrs>=17.3.0 in /home/vscode/.local/lib/python3.11/site-packages (from aiohttp->-r ../requirements.txt (line 1)) (25.3.0)\n", - "Requirement already satisfied: frozenlist>=1.1.1 in /home/vscode/.local/lib/python3.11/site-packages (from aiohttp->-r ../requirements.txt (line 1)) (1.7.0)\n", - "Requirement already satisfied: multidict<7.0,>=4.5 in /home/vscode/.local/lib/python3.11/site-packages (from aiohttp->-r ../requirements.txt (line 1)) (6.6.4)\n", - "Requirement already satisfied: propcache>=0.2.0 in /home/vscode/.local/lib/python3.11/site-packages (from aiohttp->-r ../requirements.txt (line 1)) (0.3.2)\n", - "Requirement already satisfied: yarl<2.0,>=1.17.0 in /home/vscode/.local/lib/python3.11/site-packages (from aiohttp->-r ../requirements.txt (line 1)) (1.20.1)\n", - "Requirement already satisfied: azure-core>=1.31.0 in 
/home/vscode/.local/lib/python3.11/site-packages (from azure-identity->-r ../requirements.txt (line 2)) (1.35.1)\n", - "Requirement already satisfied: cryptography>=2.5 in /home/vscode/.local/lib/python3.11/site-packages (from azure-identity->-r ../requirements.txt (line 2)) (46.0.2)\n", - "Requirement already satisfied: msal>=1.30.0 in /home/vscode/.local/lib/python3.11/site-packages (from azure-identity->-r ../requirements.txt (line 2)) (1.34.0)\n", - "Requirement already satisfied: msal-extensions>=1.2.0 in /home/vscode/.local/lib/python3.11/site-packages (from azure-identity->-r ../requirements.txt (line 2)) (1.3.1)\n", - "Requirement already satisfied: typing-extensions>=4.0.0 in /home/vscode/.local/lib/python3.11/site-packages (from azure-identity->-r ../requirements.txt (line 2)) (4.15.0)\n", - "Requirement already satisfied: isodate>=0.6.1 in /home/vscode/.local/lib/python3.11/site-packages (from azure-storage-blob->-r ../requirements.txt (line 3)) (0.7.2)\n", - "Requirement already satisfied: charset_normalizer<4,>=2 in /home/vscode/.local/lib/python3.11/site-packages (from requests->-r ../requirements.txt (line 5)) (3.4.3)\n", - "Requirement already satisfied: idna<4,>=2.5 in /home/vscode/.local/lib/python3.11/site-packages (from requests->-r ../requirements.txt (line 5)) (3.10)\n", - "Requirement already satisfied: urllib3<3,>=1.21.1 in /home/vscode/.local/lib/python3.11/site-packages (from requests->-r ../requirements.txt (line 5)) (2.5.0)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /home/vscode/.local/lib/python3.11/site-packages (from requests->-r ../requirements.txt (line 5)) (2025.8.3)\n", - "Requirement already satisfied: azure-core>=1.31.0 in /home/vscode/.local/lib/python3.11/site-packages (from azure-identity->-r ../requirements.txt (line 2)) (1.35.1)\n", - "Requirement already satisfied: cryptography>=2.5 in /home/vscode/.local/lib/python3.11/site-packages (from azure-identity->-r ../requirements.txt (line 2)) (46.0.2)\n", - 
"Requirement already satisfied: msal>=1.30.0 in /home/vscode/.local/lib/python3.11/site-packages (from azure-identity->-r ../requirements.txt (line 2)) (1.34.0)\n", - "Requirement already satisfied: msal-extensions>=1.2.0 in /home/vscode/.local/lib/python3.11/site-packages (from azure-identity->-r ../requirements.txt (line 2)) (1.3.1)\n", - "Requirement already satisfied: typing-extensions>=4.0.0 in /home/vscode/.local/lib/python3.11/site-packages (from azure-identity->-r ../requirements.txt (line 2)) (4.15.0)\n", - "Requirement already satisfied: isodate>=0.6.1 in /home/vscode/.local/lib/python3.11/site-packages (from azure-storage-blob->-r ../requirements.txt (line 3)) (0.7.2)\n", - "Requirement already satisfied: charset_normalizer<4,>=2 in /home/vscode/.local/lib/python3.11/site-packages (from requests->-r ../requirements.txt (line 5)) (3.4.3)\n", - "Requirement already satisfied: idna<4,>=2.5 in /home/vscode/.local/lib/python3.11/site-packages (from requests->-r ../requirements.txt (line 5)) (3.10)\n", - "Requirement already satisfied: urllib3<3,>=1.21.1 in /home/vscode/.local/lib/python3.11/site-packages (from requests->-r ../requirements.txt (line 5)) (2.5.0)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /home/vscode/.local/lib/python3.11/site-packages (from requests->-r ../requirements.txt (line 5)) (2025.8.3)\n", - "Requirement already satisfied: six>=1.11.0 in /home/vscode/.local/lib/python3.11/site-packages (from azure-core>=1.31.0->azure-identity->-r ../requirements.txt (line 2)) (1.17.0)\n", - "Requirement already satisfied: six>=1.11.0 in /home/vscode/.local/lib/python3.11/site-packages (from azure-core>=1.31.0->azure-identity->-r ../requirements.txt (line 2)) (1.17.0)\n", - "Requirement already satisfied: cffi>=2.0.0 in /home/vscode/.local/lib/python3.11/site-packages (from cryptography>=2.5->azure-identity->-r ../requirements.txt (line 2)) (2.0.0)\n", - "Requirement already satisfied: PyJWT<3,>=1.0.0 in 
/home/vscode/.local/lib/python3.11/site-packages (from PyJWT[crypto]<3,>=1.0.0->msal>=1.30.0->azure-identity->-r ../requirements.txt (line 2)) (2.10.1)\n", - "Requirement already satisfied: cffi>=2.0.0 in /home/vscode/.local/lib/python3.11/site-packages (from cryptography>=2.5->azure-identity->-r ../requirements.txt (line 2)) (2.0.0)\n", - "Requirement already satisfied: PyJWT<3,>=1.0.0 in /home/vscode/.local/lib/python3.11/site-packages (from PyJWT[crypto]<3,>=1.0.0->msal>=1.30.0->azure-identity->-r ../requirements.txt (line 2)) (2.10.1)\n", - "Requirement already satisfied: pycparser in /home/vscode/.local/lib/python3.11/site-packages (from cffi>=2.0.0->cryptography>=2.5->azure-identity->-r ../requirements.txt (line 2)) (2.23)\n", - "Requirement already satisfied: pycparser in /home/vscode/.local/lib/python3.11/site-packages (from cffi>=2.0.0->cryptography>=2.5->azure-identity->-r ../requirements.txt (line 2)) (2.23)\n", - "\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.2\u001b[0m\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n", - "\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.2\u001b[0m\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n", - "Note: you may need to restart the kernel to use updated packages.\n", - "Note: you may need to restart the kernel to use updated packages.\n" - ] - } - ], + "outputs": [], "source": [ "%pip install -r ../requirements.txt" ] @@ -121,79 +51,10 @@ }, { "cell_type": "code", - 
"execution_count": 4, + "execution_count": null, "id": "bcea7936", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:azure.identity._credentials.environment:No environment configuration found.\n", - "INFO:azure.identity._credentials.managed_identity:ManagedIdentityCredential will use IMDS\n", - "INFO:azure.core.pipeline.policies.http_logging_policy:Request URL: 'http://169.254.169.254/metadata/identity/oauth2/token?api-version=REDACTED&resource=REDACTED'\n", - "Request method: 'GET'\n", - "Request headers:\n", - " 'User-Agent': 'azsdk-python-identity/1.25.0 Python/3.11.13 (Linux-6.8.0-1030-azure-x86_64-with-glibc2.41)'\n", - "No body was attached to the request\n", - "INFO:azure.identity._credentials.managed_identity:ManagedIdentityCredential will use IMDS\n", - "INFO:azure.core.pipeline.policies.http_logging_policy:Request URL: 'http://169.254.169.254/metadata/identity/oauth2/token?api-version=REDACTED&resource=REDACTED'\n", - "Request method: 'GET'\n", - "Request headers:\n", - " 'User-Agent': 'azsdk-python-identity/1.25.0 Python/3.11.13 (Linux-6.8.0-1030-azure-x86_64-with-glibc2.41)'\n", - "No body was attached to the request\n", - "INFO:azure.core.pipeline.policies.http_logging_policy:Response status: 400\n", - "Response headers:\n", - " 'Content-Type': 'application/json; charset=utf-8'\n", - " 'Server': 'IMDS/150.870.65.1854'\n", - " 'x-ms-request-id': '7683a8fc-6110-4d17-ba92-e7986c8af8e0'\n", - " 'Date': 'Wed, 22 Oct 2025 22:06:40 GMT'\n", - " 'Content-Length': '88'\n", - "INFO:azure.core.pipeline.policies.http_logging_policy:Request URL: 'http://169.254.169.254/metadata/identity/oauth2/token?api-version=REDACTED&resource=REDACTED'\n", - "Request method: 'GET'\n", - "Request headers:\n", - " 'Metadata': 'REDACTED'\n", - " 'User-Agent': 'azsdk-python-identity/1.25.0 Python/3.11.13 (Linux-6.8.0-1030-azure-x86_64-with-glibc2.41)'\n", - "No body was attached to the request\n", - 
"INFO:azure.core.pipeline.policies.http_logging_policy:Response status: 400\n", - "Response headers:\n", - " 'Content-Type': 'application/json; charset=utf-8'\n", - " 'Server': 'IMDS/150.870.65.1854'\n", - " 'x-ms-request-id': '31ec0b5d-182f-4981-8624-34083dd1c063'\n", - " 'Date': 'Wed, 22 Oct 2025 22:06:40 GMT'\n", - " 'Content-Length': '68'\n", - "INFO:azure.core.pipeline.policies.http_logging_policy:Response status: 400\n", - "Response headers:\n", - " 'Content-Type': 'application/json; charset=utf-8'\n", - " 'Server': 'IMDS/150.870.65.1854'\n", - " 'x-ms-request-id': '7683a8fc-6110-4d17-ba92-e7986c8af8e0'\n", - " 'Date': 'Wed, 22 Oct 2025 22:06:40 GMT'\n", - " 'Content-Length': '88'\n", - "INFO:azure.core.pipeline.policies.http_logging_policy:Request URL: 'http://169.254.169.254/metadata/identity/oauth2/token?api-version=REDACTED&resource=REDACTED'\n", - "Request method: 'GET'\n", - "Request headers:\n", - " 'Metadata': 'REDACTED'\n", - " 'User-Agent': 'azsdk-python-identity/1.25.0 Python/3.11.13 (Linux-6.8.0-1030-azure-x86_64-with-glibc2.41)'\n", - "No body was attached to the request\n", - "INFO:azure.core.pipeline.policies.http_logging_policy:Response status: 400\n", - "Response headers:\n", - " 'Content-Type': 'application/json; charset=utf-8'\n", - " 'Server': 'IMDS/150.870.65.1854'\n", - " 'x-ms-request-id': '31ec0b5d-182f-4981-8624-34083dd1c063'\n", - " 'Date': 'Wed, 22 Oct 2025 22:06:40 GMT'\n", - " 'Content-Length': '68'\n", - "INFO:azure.identity._credentials.chained:DefaultAzureCredential acquired a token from AzureDeveloperCliCredential\n", - "INFO:azure.identity._credentials.chained:DefaultAzureCredential acquired a token from AzureDeveloperCliCredential\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "✅ Content Understanding client initialized successfully!\n" - ] - } - ], + "outputs": [], "source": [ "import logging\n", "import json\n", @@ -280,9 +141,7 @@ "\n", "Specify the ID of the analyzer whose training data you 
want to reuse.\n", "\n", - "**Option 1**: Set `SOURCE_ANALYZER_ID` to an existing analyzer ID from the list above.\n", - "\n", - "**Option 2**: If you don't have an analyzer with training data, uncomment and run the next cell to create one first." + "Set `SOURCE_ANALYZER_ID` to an existing analyzer ID from the list above" ] }, { @@ -314,74 +173,6 @@ "print(f\"Source Analyzer ID: {SOURCE_ANALYZER_ID}\")" ] }, - { - "cell_type": "markdown", - "id": "d7ceffda", - "metadata": {}, - "source": [ - "### Option 2: Create a Source Analyzer with Training Data (Optional)\n", - "\n", - "If you don't have an existing analyzer with training data, run this cell to create one first.\n", - "\n", - "**Prerequisites**:\n", - "- Set environment variables for training data (see [docs/set_env_for_training_data_and_reference_doc.md](../docs/set_env_for_training_data_and_reference_doc.md))\n", - "- Ensure you have labeled training data in `../data/document_training/`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1ce228bd", - "metadata": {}, - "outputs": [], - "source": [ - "# Uncomment this entire cell if you need to create a source analyzer first\n", - "\n", - "# from azure.storage.blob import ContainerSasPermissions\n", - "\n", - "# # Configure training data\n", - "# analyzer_template_path = \"../analyzer_templates/receipt.json\"\n", - "# training_docs_folder = \"../data/document_training\"\n", - "\n", - "# # Get or generate SAS URL\n", - "# training_data_sas_url = os.getenv(\"TRAINING_DATA_SAS_URL\")\n", - "# if not training_data_sas_url:\n", - "# TRAINING_DATA_STORAGE_ACCOUNT_NAME = os.getenv(\"TRAINING_DATA_STORAGE_ACCOUNT_NAME\")\n", - "# TRAINING_DATA_CONTAINER_NAME = os.getenv(\"TRAINING_DATA_CONTAINER_NAME\")\n", - "# if not TRAINING_DATA_STORAGE_ACCOUNT_NAME:\n", - "# raise ValueError(\n", - "# \"Please set either TRAINING_DATA_SAS_URL or both TRAINING_DATA_STORAGE_ACCOUNT_NAME \"\n", - "# \"and TRAINING_DATA_CONTAINER_NAME environment variables.\"\n", 
- "# )\n", - "# training_data_sas_url = AzureContentUnderstandingClient.generate_temp_container_sas_url(\n", - "# account_name=TRAINING_DATA_STORAGE_ACCOUNT_NAME,\n", - "# container_name=TRAINING_DATA_CONTAINER_NAME,\n", - "# permissions=ContainerSasPermissions(read=True, write=True, list=True),\n", - "# expiry_hours=1,\n", - "# )\n", - "\n", - "# training_data_path = os.getenv(\"TRAINING_DATA_PATH\")\n", - "\n", - "# # Upload training data to blob storage\n", - "# print(\"Uploading training data to blob storage...\")\n", - "# await client.generate_training_data_on_blob(training_docs_folder, training_data_sas_url, training_data_path)\n", - "# print(\"✅ Training data uploaded successfully!\")\n", - "\n", - "# # Create source analyzer\n", - "# SOURCE_ANALYZER_ID = \"source-analyzer-\" + str(uuid.uuid4())\n", - "# print(f\"Creating source analyzer: {SOURCE_ANALYZER_ID}\")\n", - "\n", - "# response = client.begin_create_analyzer(\n", - "# SOURCE_ANALYZER_ID,\n", - "# analyzer_template_path=analyzer_template_path,\n", - "# training_storage_container_sas_url=training_data_sas_url,\n", - "# training_storage_container_path_prefix=training_data_path,\n", - "# )\n", - "# result = client.poll_result(response)\n", - "# print(\"✅ Source analyzer created successfully!\")\n", - "# print(json.dumps(result, indent=2))" - ] - }, { "cell_type": "markdown", "id": "d9b1bc93", From 4a5ff1b988f86a492f9a77fa58dbb83946e7ab51 Mon Sep 17 00:00:00 2001 From: Joe Filcik Date: Tue, 16 Dec 2025 20:44:30 -0500 Subject: [PATCH 4/8] Update README and API testing guide for GA version; Update conversion scripts with "knowledge source" property from the GA API --- python/di_to_cu_migration_tool/README.md | 25 +- .../cu-ga-NoSecrets.http | 570 ++++++++++++++++++ .../cu_converter_generative.py | 19 +- .../cu_converter_neural.py | 19 +- .../di_to_cu_converter.py | 14 +- python/di_to_cu_migration_tool/get_ocr.py | 16 +- .../sample_documents/analyzer_result.json | 2 +- 7 files changed, 638 insertions(+), 
27 deletions(-) create mode 100644 python/di_to_cu_migration_tool/cu-ga-NoSecrets.http diff --git a/python/di_to_cu_migration_tool/README.md b/python/di_to_cu_migration_tool/README.md index e473ad0..d5548c8 100644 --- a/python/di_to_cu_migration_tool/README.md +++ b/python/di_to_cu_migration_tool/README.md @@ -1,13 +1,13 @@ # Document Intelligence to Content Understanding Migration Tool (Python) -Welcome! This tool helps convert your Document Intelligence (DI) datasets to the Content Understanding (CU) **Preview.2** 2025-05-01-preview format, as used in AI Foundry. The following DI versions are supported: +Welcome! This tool helps convert your Document Intelligence (DI) datasets to the Content Understanding (CU) **GA** 2025-11-01 format, as used in AI Foundry. The following DI versions are supported: - Custom Extraction Model DI 3.1 GA (2023-07-31) to DI 4.0 GA (2024-11-30) (Document Intelligence Studio) → DI-version = neural - Document Field Extraction Model 4.0 Preview (2024-07-31-preview) (AI Foundry / AI Services / Vision + Document / Document Field Extraction) → DI-version = generative To identify the version of your Document Intelligence dataset, please consult the sample documents in this folder to match your format. You can also verify the version by reviewing your DI project's user experience. For instance, Custom Extraction DI 3.1/4.0 GA appears in Document Intelligence Studio (https://documentintelligence.ai.azure.com/studio), whereas Document Field Extraction DI 4.0 Preview is only available on Azure AI Foundry's preview service (https://ai.azure.com/explore/aiservices/vision/document/extraction). -For migrating from these DI versions to Content Understanding Preview.2, this tool first converts the DI dataset into a CU-compatible format. After conversion, you can create a Content Understanding Analyzer trained on your converted CU dataset. Additionally, you have the option to test its quality against any sample documents. 
+For migrating from these DI versions to Content Understanding GA (2025-11-01), this tool first converts the DI dataset into a CU-compatible format. After conversion, you can create a Content Understanding Analyzer trained on your converted CU dataset. Additionally, you have the option to test its quality against any sample documents.
 
 ## Details About the Tools
@@ -43,7 +43,7 @@ Please follow these steps to set up the tool:
   - **SUBSCRIPTION_KEY:** Update to your Azure AI Service API Key or Subscription ID to authenticate the API requests.
     - Locate your API Key here: ![Azure AI Service Endpoints With Keys](assets/endpoint-with-keys.png)
     - If using Azure Active Directory (AAD), please refer to your Subscription ID: ![Azure AI Service Subscription ID](assets/subscription-id.png)
-  - **API_VERSION:** This is preset to the CU Preview.2 version; no changes are needed.
+  - **API_VERSION:** This is preset to the CU GA version (2025-11-01); no changes are needed.
 
 ## How to Locate Your Document Field Extraction Dataset for Migration
 
@@ -73,8 +73,12 @@ To obtain SAS URLs for a file or folder for any container URL arguments, please
 3. Configure permissions and expiry for your SAS URL as follows:
     - For the **DI source dataset**, please select permissions: _**Read & List**_
+https://<storage-account>.blob.core.windows.net/<source-container>?<source-sas-token>
+
     - For the **CU target dataset**, please select permissions: _**Read, Add, Create, & Write**_
+https://<storage-account>.blob.core.windows.net/<target-container>?<target-sas-token>
+
 After configuring, click **Generate SAS Token and URL** and copy the URL shown under **Blob SAS URL**.
![Generate SAS Pop-Up](assets/generate-sas-pop-up.png)
 
@@ -98,6 +102,9 @@ If migrating a _DI 3.1/4.0 GA Custom Extraction_ dataset, please run:
 python ./di_to_cu_converter.py --DI-version neural --analyzer-prefix mySampleAnalyzer \
 --source-container-sas-url "https://sourceStorageAccount.blob.core.windows.net/sourceContainer?sourceSASToken" --source-blob-folder diDatasetFolderName \
 --target-container-sas-url "https://targetStorageAccount.blob.core.windows.net/targetContainer?targetSASToken" --target-blob-folder cuDatasetFolderName
+
+python ./di_to_cu_converter.py --DI-version neural --analyzer-prefix mySampleAnalyzer --source-container-sas-url "https://<storage-account>.blob.core.windows.net/<source-container>?<source-sas-token>" --source-blob-folder diDatasetFolderName \
+--target-container-sas-url "https://<storage-account>.blob.core.windows.net/<target-container>?<target-sas-token>" --target-blob-folder cuDatasetFolderName
 ```
 
 For this migration, specifying an analyzer prefix is crucial for creating a CU analyzer. Since the fields.json does not define a "doc_type" for identification, the created analyzer ID will be the specified analyzer prefix.
@@ -120,9 +127,9 @@ After converting the CU analyzer.json, please run:
 ```
 python ./create_analyzer.py \
---analyzer-sas-url "https://targetStorageAccount.blob.core.windows.net/targetContainer/cuDatasetFolderName/analyzer.json?targetSASToken" \
---target-container-sas-url "https://targetStorageAccount.blob.core.windows.net/targetContainer?targetSASToken" \
---target-blob-folder cuDatasetFolderName
+--analyzer-sas-url "https://<storage-account>.blob.core.windows.net/<target-container>/cuDatasetFolderName/analyzer.json?<target-sas-token>" \
+--target-container-sas-url "https://<storage-account>.blob.core.windows.net/<target-container>?<target-sas-token>" \
+--target-blob-folder cuDatasetFolderName
 ```
 
 The `analyzer.json` file is located in the specified target blob container and folder. Please obtain the SAS URL for `analyzer.json` from there.
 
@@ -155,7 +162,7 @@ Below are common issues you might encounter when creating an analyzer or running
 - **400 Bad Request** errors: Please validate the following:
     - The endpoint URL is valid. Example:
-      `https://yourEndpoint/contentunderstanding/analyzers/yourAnalyzerID?api-version=2025-05-01-preview`
+      `https://yourEndpoint/contentunderstanding/analyzers/yourAnalyzerID?api-version=2025-11-01`
     - Your converted CU dataset respects the naming constraints below. If needed, please manually correct the `analyzer.json` fields:
       - Field names start with a letter or underscore
       - Field name length must be between 1 and 64 characters
@@ -174,7 +181,7 @@ Below are common issues you might encounter when creating an analyzer or running
 - **400 Bad Request**: This implies that you might have an incorrect endpoint or SAS URL.
Please ensure that your endpoint is valid and that you are using the correct SAS URL for the document: - `https://yourendpoint/contentunderstanding/analyzers/yourAnalyzerID:analyze?api-version=2025-05-01-preview` + `https://yourendpoint/contentunderstanding/analyzers/yourAnalyzerID:analyze?api-version=2025-11-01` Confirm you are using the correct SAS URL for the document. - **401 Unauthorized**: @@ -189,4 +196,4 @@ Below are common issues you might encounter when creating an analyzer or running 2. Signature field types (e.g., in previous DI versions) are not yet supported in Content Understanding. These will be ignored during migration when creating the analyzer. 3. The content of your training documents is retained in the CU model's metadata, under storage specifically. You can find more details at: https://learn.microsoft.com/en-us/legal/cognitive-services/content-understanding/transparency-note?toc=%2Fazure%2Fai-services%2Fcontent-understanding%2Ftoc.json&bc=%2Fazure%2Fai-services%2Fcontent-understanding%2Fbreadcrumb%2Ftoc.json -4. All conversions are for Content Understanding preview.2 version only. \ No newline at end of file +4. All conversions are for Content Understanding GA (2025-11-01) version. \ No newline at end of file diff --git a/python/di_to_cu_migration_tool/cu-ga-NoSecrets.http b/python/di_to_cu_migration_tool/cu-ga-NoSecrets.http new file mode 100644 index 0000000..f48ec55 --- /dev/null +++ b/python/di_to_cu_migration_tool/cu-ga-NoSecrets.http @@ -0,0 +1,570 @@ +# ==================================================================== +# Azure AI Content Understanding - API Testing Guide +# ==================================================================== +# +# This file demonstrates how to use Azure AI Content Understanding APIs +# to analyze documents, images, videos, and audio using prebuilt and +# custom analyzers. +# +# ⚠️ SETUP REQUIRED - BEFORE RUNNING ANY REQUESTS: +# +# 1. 
Edit the .env file in this same directory (tools/BugBashSample/.env) +# 2. Set your API_KEY and ENDPOINT_URL values: +# API_KEY="your-subscription-key-here" +# ENDPOINT_URL="https://your-resource.services.ai.azure.com" +# 3. Save the .env file +# 4. Run any request using the "Send Request" link above each HTTP request +# +# Variables will be automatically loaded from .env - do not edit this file. +# +# DOCUMENTATION: +# - Models & Deployments: https://review.learn.microsoft.com/en-us/azure/ai-services/content-understanding/concepts/models-deployments?branch=main +# - Migration Guide (Preview to GA): https://review.learn.microsoft.com/en-us/azure/ai-services/content-understanding/how-to/migration-preview-to-ga +# - Analyzer Reference: https://learn.microsoft.com/en-us/azure/ai-services/content-understanding/concepts/analyzer-reference +# +# ==================================================================== + +# Variables - Automatically loaded from .env file in the same directory +@subscriptionKey = {{$dotenv API_KEY}} +@endpoint = {{$dotenv ENDPOINT_URL}} +@apiVersion = 2025-11-01 + +# ==================================================================== +# SECTION 1: CONFIGURE DEFAULT MODEL DEPLOYMENTS (BYOC) +# ==================================================================== +# +# Configure default model deployments for your resource. This allows you to +# use custom Azure OpenAI deployments (Bring Your Own Compute) instead of +# the default shared models. 
+# +# Learn more: https://review.learn.microsoft.com/en-us/azure/ai-services/content-understanding/concepts/models-deployments?branch=main +# + +### Set default model deployments for the resource +# This configures which model deployments to use by default for completion and embedding tasks +# Alternatively, if you setup a resource using https://aka.ms/cu-studio then you can skip this step +PATCH {{endpoint}}/contentunderstanding/defaults?api-version={{apiVersion}} +Ocp-Apim-Subscription-Key: {{subscriptionKey}} +Content-Type: application/json + +{ + "modelDeployments": { + "gpt-4.1": "gpt-4.1-datazone", + "gpt-4.1-mini": "gpt-4.1-mini", + "text-embedding-ada-002": "text-embedding-ada-002" + } +} + +### Get current default settings +GET {{endpoint}}/contentunderstanding/defaults?api-version={{apiVersion}} +Ocp-Apim-Subscription-Key: {{subscriptionKey}} + +# ==================================================================== +# SECTION 2: LIST AND EXPLORE ANALYZERS +# ==================================================================== +# +# Analyzers are the core components that process your content. Azure provides +# prebuilt analyzers for common scenarios and you can create custom analyzers. 
+# +# Prebuilt analyzers include: +# - prebuilt-document: Extract text, tables, and structure from documents +# - prebuilt-invoice: Extract invoice-specific fields +# - prebuilt-video: Analyze video content with scene detection and transcription +# - prebuilt-audio: Transcribe and analyze audio content +# - prebuilt-imageSearch: Analyze and search within images +# +# Learn more: https://learn.microsoft.com/en-us/azure/ai-services/content-understanding/concepts/analyzer-reference +# + +### List all available analyzers +GET {{endpoint}}/contentunderstanding/analyzers?api-version={{apiVersion}} +Content-Type: application/json +Ocp-Apim-Subscription-Key: {{subscriptionKey}} + +### Get details of a specific analyzer +GET {{endpoint}}/contentunderstanding/analyzers/prebuilt-invoice?api-version={{apiVersion}} +Ocp-Apim-Subscription-Key: {{subscriptionKey}} + +# ==================================================================== +# SECTION 3: TEST PREBUILT ANALYZERS +# ==================================================================== +# +# Test the prebuilt analyzers with sample documents to understand their +# capabilities before creating custom analyzers. 
+# + +### 3.1 Analyze a document with prebuilt-document analyzer +# This analyzer extracts text, tables, key-value pairs, and document structure +# @name documentAnalysis +POST {{endpoint}}/contentunderstanding/analyzers/prebuilt-document:analyze?api-version={{apiVersion}} +Content-Type: application/json +Ocp-Apim-Subscription-Key: {{subscriptionKey}} + +{ + "inputs": [ + { + "url": "https://github.com/Azure-Samples/azure-ai-content-understanding-python/raw/refs/heads/main/data/invoice.pdf" + } + ], + "modelDeployments": { + "gpt-4.1": "gpt-4.1-datazone" + } +} + +### Get document analysis results (uses operation ID from previous request) +GET {{endpoint}}/contentunderstanding/analyzerResults/{{documentAnalysis.response.body.id}}?api-version={{apiVersion}} +Ocp-Apim-Subscription-Key: {{subscriptionKey}} + +### 3.2 Analyze an invoice with prebuilt-invoice analyzer +# Specialized analyzer for invoices that extracts vendor, items, totals, etc. +# @name invoiceAnalysis +POST {{endpoint}}/contentunderstanding/analyzers/prebuilt-invoice:analyze?api-version={{apiVersion}} +Content-Type: application/json +Ocp-Apim-Subscription-Key: {{subscriptionKey}} + +{ + "inputs": [ + { + "url": "https://github.com/Azure-Samples/azure-ai-content-understanding-python/raw/refs/heads/main/data/invoice.pdf" + } + ] +} + +### Get invoice analysis results (uses operation ID from previous request) +GET {{endpoint}}/contentunderstanding/analyzerResults/{{invoiceAnalysis.response.body.id}}?api-version={{apiVersion}} +Ocp-Apim-Subscription-Key: {{subscriptionKey}} + +### 3.3 Analyze an image with prebuilt-imageSearch analyzer +# Extract visual content, text, and enable semantic search within images +# @name imageAnalysis +POST {{endpoint}}/contentunderstanding/analyzers/prebuilt-imageSearch:analyze?api-version={{apiVersion}} +Content-Type: application/json +Ocp-Apim-Subscription-Key: {{subscriptionKey}} + +{ + "inputs": [ + { + "url": 
"https://github.com/Azure-Samples/azure-ai-content-understanding-python/raw/refs/heads/main/data/pieChart.jpg" + } + ], + "modelDeployments": { + "gpt-4.1": "gpt-4.1-datazone", + "text-embedding-3-large": "text-embedding-3-large" + } +} + +### Get image analysis results (uses operation ID from previous request) +GET {{endpoint}}/contentunderstanding/analyzerResults/{{imageAnalysis.response.body.id}}?api-version={{apiVersion}} +Ocp-Apim-Subscription-Key: {{subscriptionKey}} + +# ==================================================================== +# SECTION 4: CREATE CUSTOM DOCUMENT ANALYZER +# ==================================================================== +# +# Create custom analyzers to extract specific fields from your documents. +# Custom analyzers build on top of prebuilt analyzers and add field schemas +# to define what information to extract. +# +# Learn more: https://review.learn.microsoft.com/en-us/azure/ai-services/content-understanding/how-to/migration-preview-to-ga +# + +### 4.1 Create a custom insurance claim form analyzer +# This analyzer extracts specific fields from insurance claim forms +PUT {{endpoint}}/contentunderstanding/analyzers/claimForm?api-version={{apiVersion}} +Ocp-Apim-Subscription-Key: {{subscriptionKey}} +Content-Type: application/json + +{ + "baseAnalyzerId": "prebuilt-document", + "analyzerId": "claimForm", + "models": { + "completion": "gpt-4.1", + "embedding": "text-embedding-ada-002" + }, + "fieldSchema": { + "fields": { + "PolicyNumber": { + "type": "string", + "method": "extract", + "description": "The insurance policy number associated with this claim." + }, + "ClaimNumber": { + "type": "string", + "method": "extract", + "description": "The unique claim number assigned to this claim." + }, + "TotalClaimAmount": { + "type": "number", + "method": "extract", + "description": "The total amount being claimed." 
+ }, + "AccidentDate": { + "type": "string", + "method": "extract", + "description": "The date when the accident occurred." + }, + "LossType": { + "type": "string", + "method": "classify", + "description": "The type of loss (e.g., collision, theft, fire).", + "enum": ["collision", "theft", "fire", "natural disaster", "vandalism"] + } + }, + "definitions": {} + }, + "omitContent": true +} + +### Get the custom analyzer details +GET {{endpoint}}/contentunderstanding/analyzers/claimForm?api-version={{apiVersion}} +Ocp-Apim-Subscription-Key: {{subscriptionKey}} + +### 4.2 Create a custom invoice analyzer +# Example of a custom invoice analyzer with specific field extraction +PUT {{endpoint}}/contentunderstanding/analyzers/invoice_custom?api-version={{apiVersion}} +Ocp-Apim-Subscription-Key: {{subscriptionKey}} +Content-Type: application/json + +{ + "analyzerId": "invoice_custom", + "baseAnalyzerId": "prebuilt-document", + "description": "Custom invoice analyzer with specific fields", + "scenario": "document", + "models": { + "completion": "gpt-4.1" + }, + "fieldSchema": { + "fields": { + "VendorName": { + "type": "string", + "method": "extract", + "description": "Vendor issuing the invoice" + }, + "Items": { + "type": "array", + "method": "extract", + "items": { + "type": "object", + "properties": { + "Description": { + "type": "string", + "method": "extract", + "description": "Description of the item" + }, + "Amount": { + "type": "number", + "method": "extract", + "description": "Amount of the item" + } + } + } + } + } + } +} + +### Test the custom invoice analyzer +# @name customInvoiceAnalysis +POST {{endpoint}}/contentunderstanding/analyzers/invoice_custom:analyze?api-version={{apiVersion}} +Content-Type: application/json +Ocp-Apim-Subscription-Key: {{subscriptionKey}} + +{ + "inputs": [ + { + "url": "https://documentintelligence.ai.azure.com/documents/samples/read/read-healthcare.png" + } + ], + "modelDeployments": { + "gpt-4.1": "gpt-4.1-datazone" + } +} + +### 
Get custom invoice results (uses operation ID from previous request) +GET {{endpoint}}/contentunderstanding/analyzerResults/{{customInvoiceAnalysis.response.body.id}}?api-version={{apiVersion}} +Ocp-Apim-Subscription-Key: {{subscriptionKey}} + +### 4.3 Create a complex multi-document analyzer with content categories +# This analyzer can classify and route different document types to specialized analyzers +PUT {{endpoint}}/contentunderstanding/analyzers/insuranceClaim?api-version={{apiVersion}} +Ocp-Apim-Subscription-Key: {{subscriptionKey}} +Content-Type: application/json + +{ + "baseAnalyzerId": "prebuilt-document", + "analyzerId": "insuranceClaim", + "models": { + "completion": "gpt-4.1", + "embedding": "text-embedding-ada-002" + }, + "config": { + // Enable splitting of the input into segments for multi-document files + "enableSegment": true, + "contentCategories": { + "claimForm": { + "description": "The claim form for Zava Insurance", + "analyzerId": "claimForm" + }, + "estimate": { + "description": "The body shop estimate or contractor estimate to fix the property damage.", + "analyzerId": "prebuilt-invoice" + }, + "medicalReport": { + "description": "A doctors assessment or medical report related to injury suffered.", + "analyzerId": "prebuilt-document" + }, + "policeReport": { + "description": "A police or law enforcement report detailing the events that lead to the loss." + } + }, + "omitContent": true + } +} + +### Delete an analyzer (if needed) +DELETE {{endpoint}}/contentunderstanding/analyzers/insuranceClaim?api-version={{apiVersion}} +Ocp-Apim-Subscription-Key: {{subscriptionKey}} + +# ==================================================================== +# SECTION 5: VIDEO ANALYZER EXAMPLES +# ==================================================================== +# +# Video analyzers can transcribe, detect scenes, extract entities, and +# perform custom analysis on video content. 
+# +# Learn more: https://learn.microsoft.com/en-us/azure/ai-services/content-understanding/concepts/analyzer-reference +# + +### 5.1 Test prebuilt video analyzer +# @name videoAnalysis +POST {{endpoint}}/contentunderstanding/analyzers/prebuilt-video:analyze?api-version={{apiVersion}} +Content-Type: application/json +Ocp-Apim-Subscription-Key: {{subscriptionKey}} + +{ + "inputs": [ + { + "url": "https://github.com/Azure-Samples/azure-ai-content-understanding-python/raw/refs/heads/main/data/FlightSimulator.mp4" + } + ], + "modelDeployments": { + "gpt-4.1": "gpt-4.1-datazone", + "text-embedding-3-large": "text-embedding-3-large" + } +} + +### Get video analysis results (uses operation ID from previous request) +GET {{endpoint}}/contentunderstanding/analyzerResults/{{videoAnalysis.response.body.id}}?api-version={{apiVersion}} +Ocp-Apim-Subscription-Key: {{subscriptionKey}} + +### 5.2 Create custom video analyzer with dynamic chaptering +# This analyzer segments videos into chapters/stories with scene detection +PUT {{endpoint}}/contentunderstanding/analyzers/video_chaptering?api-version={{apiVersion}} +Ocp-Apim-Subscription-Key: {{subscriptionKey}} +Content-Type: application/json + +{ + "description": "Dynamic video chaptering with scene detection", + "scenario": "videoShot", + "baseAnalyzerId": "prebuilt-video", + "models": { + "completion": "gpt-4.1" + }, + "config": { + "returnDetails": true, + "enableSegmentation": true, + "segmentationMode": "custom", + "segmentationDefinition": "Segment the video into stories or chapters. A story (chapter) in a video is a self-contained portion of the program dedicated to a specific news story, topic, or theme. 
Each segment typically includes a distinct introduction, development, and (sometimes) a conclusion, and can feature a combination of elements such as reporter narration, interviews, sound bites, relevant footage (B-roll), and graphics.", + "locales": ["en-US"] + }, + "fieldSchema": { + "name": "Content Understanding - Dynamic Chaptering", + "fields": { + "Segments": { + "type": "array", + "items": { + "type": "object", + "properties": { + "SegmentId": { + "type": "string" + }, + "SegmentType": { + "type": "string", + "method": "generate", + "description": "The short title or a short summary of the story or chapter." + }, + "Scenes": { + "type": "array", + "items": { + "type": "object", + "properties": { + "Description": { + "type": "string", + "method": "generate", + "description": "A five-word description of the scene. A scene is a smaller segment of the segment where a continous block for storytelling unfolds within a specific time, place, and set of characters. A scene can only belong to a single chapter, and cannot overlap with other scenes. Scenes are sequential across the video." 
+ }, + "StartTimestamp": { + "type": "string", + "description": "the start timestamp of the scene" + }, + "EndTimestamp": { + "type": "string", + "description": "the end timestamp of the scene" + } + } + } + } + } + } + } + } + } +} + +### Get the video analyzer details +GET {{endpoint}}/contentunderstanding/analyzers/video_chaptering?api-version={{apiVersion}} +Ocp-Apim-Subscription-Key: {{subscriptionKey}} + +### Test the custom video chaptering analyzer +# @name videoChapteringAnalysis +POST {{endpoint}}/contentunderstanding/analyzers/video_chaptering:analyze?api-version={{apiVersion}} +Content-Type: application/json +Ocp-Apim-Subscription-Key: {{subscriptionKey}} + +{ + "inputs": [ + { + "url": "https://github.com/Azure-Samples/azure-ai-content-understanding-python/blob/402ec1bf337d54b438581c69dbfb784da74ded38/data/video.mp4" + } + ] +} + +### Get video chaptering results (uses operation ID from previous request) +GET {{endpoint}}/contentunderstanding/analyzerResults/{{videoChapteringAnalysis.response.body.id}}?api-version={{apiVersion}} +Ocp-Apim-Subscription-Key: {{subscriptionKey}} + +# ==================================================================== +# SECTION 6: AUDIO ANALYZER EXAMPLES +# ==================================================================== +# +# Audio analyzers can transcribe speech, detect speakers, and extract +# insights from audio content. 
+# + +### Test prebuilt audio analyzer (coming soon - placeholder) +# POST {{endpoint}}/contentunderstanding/analyzers/prebuilt-audio:analyze?api-version={{apiVersion}} +# Content-Type: application/json +# Ocp-Apim-Subscription-Key: {{subscriptionKey}} +# +# { +# "inputs": [ +# { +# "url": "https://example.com/audio-sample.mp3" +# } +# ] +# } + +# ==================================================================== +# SECTION 7: ANALYZER MANAGEMENT OPERATIONS +# ==================================================================== +# +# Copy analyzers between resources or regions for deployment purposes. +# + +### Copy analyzer within same resource +POST {{endpoint}}/contentunderstanding/analyzers/insuranceClaimv3:copy?api-version={{apiVersion}} +Content-Type: application/json +Ocp-Apim-Subscription-Key: {{subscriptionKey}} + +{ + "sourceAnalyzerId": "insuranceClaim" +} + +### Grant copy authorization for cross-resource copy +POST {{endpoint}}/contentunderstanding/analyzers/insuranceClaim:grantCopyAuthorization?api-version={{apiVersion}} +Content-Type: application/json +Ocp-Apim-Subscription-Key: {{subscriptionKey}} + +{ + "targetAzureResourceId": "/subscriptions/3b393ccb-47e3-4dea-9004-0c1085b5aba6/resourceGroups/mmi-sea-eft/providers/Microsoft.CognitiveServices/accounts/foundry-sea-eft", + "targetRegion": "southeastasia" +} + +### Copy analyzer to different resource (use authorization from above) +# POST https://foundry-sea-eft.cognitiveservices.azure.com/contentunderstanding/analyzers/insuranceClaimCopy:copy?api-version={{apiVersion}} +# Content-Type: application/json +# Ocp-Apim-Subscription-Key: {{targetSubscriptionKey}} +# +# { +# "targetAzureResourceId": "/subscriptions/3b393ccb-47e3-4dea-9004-0c1085b5aba6/resourceGroups/mmi-sea-eft/providers/Microsoft.CognitiveServices/accounts/foundry-sea-eft", +# "targetRegion": "southeastasia", +# "expiresAt": "2025-11-01T19:18:11.095328+00:00" +# } + +# ==================================================================== 
+# END OF GUIDE
+# ====================================================================
+
+
+# Test Create a DataSnipper Repro
+
+
+
+### 4.4 Create a custom analyzer reproducing the DataSnipper schema
+# This analyzer extracts an array of employee records (repro of a reported issue)
+PUT {{endpoint}}/contentunderstanding/analyzers/dsTest?api-version={{apiVersion}}
+Ocp-Apim-Subscription-Key: {{subscriptionKey}}
+Content-Type: application/json
+
+{
+    "baseAnalyzerId": "prebuilt-document",
+    "analyzerId": "dsTest",
+    "models": {
+        "completion": "gpt-4o",
+        "embedding": "text-embedding-ada-002"
+    },
+    "fieldSchema": {
+        "fields": {
+            "employees": {
+                "type": "array",
+                "items": {
+                    "type": "object",
+                    "properties": {
+                        "employee_name": {
+                            "type": "string",
+                            "method": "extract",
+                            "description": "The name of the employee",
+                            "estimateSourceAndConfidence": true
+                        }
+                    },
+                    "description": "The employee of the company"
+                },
+                "description": "The employees of the company"
+            }
+        }
+    }
+}
+
+### Get the custom analyzer details
+GET {{endpoint}}/contentunderstanding/analyzers/dsTest?api-version={{apiVersion}}
+Ocp-Apim-Subscription-Key: {{subscriptionKey}}
+
+
+
+### Test dsTest analyzer with document
+# @name dsTestAnalysis
+POST {{endpoint}}/contentunderstanding/analyzers/dsTest:analyze?api-version={{apiVersion}}
+Content-Type: application/json
+Ocp-Apim-Subscription-Key: {{subscriptionKey}}
+
+{
+    "inputs": [
+        {
+            "url": "https://<storage-account>.blob.core.windows.net/<container>/<sample-document>.pdf?<sas-token>"
+        }
+    ]
+}
+
+### Get dsTest analysis results (uses operation ID from previous request)
+GET
{{endpoint}}/contentunderstanding/analyzerResults/{{dsTestAnalysis.response.body.id}}?api-version={{apiVersion}}
+Ocp-Apim-Subscription-Key: {{subscriptionKey}}
\ No newline at end of file
diff --git a/python/di_to_cu_migration_tool/cu_converter_generative.py b/python/di_to_cu_migration_tool/cu_converter_generative.py
index f27938d..f384dc7 100644
--- a/python/di_to_cu_migration_tool/cu_converter_generative.py
+++ b/python/di_to_cu_migration_tool/cu_converter_generative.py
@@ -48,7 +48,7 @@ def format_angle(angle: float) -> float:
     formatted_num = f"{rounded_angle:.7f}".rstrip('0') # Remove trailing zeros
     return float(formatted_num)
 
-def convert_fields_to_analyzer(fields_json_path: Path, analyzer_prefix: Optional[str], target_dir: Path, field_definitions: FieldDefinitions) -> dict:
+def convert_fields_to_analyzer(fields_json_path: Path, analyzer_prefix: Optional[str], target_dir: Path, field_definitions: FieldDefinitions, target_container_sas_url: Optional[str] = None, target_blob_folder: Optional[str] = None) -> dict:
     """
     Convert DI 4.0 preview Custom Document fields.json to analyzer.json format.
     Args:
@@ -79,7 +79,11 @@ def convert_fields_to_analyzer(fields_json_path: Path, analyzer_prefix: Optional
     # build analyzer.json appropriately
     analyzer_data = {
         "analyzerId": analyzer_id,
-        "baseAnalyzerId": "prebuilt-documentAnalyzer",
+        "baseAnalyzerId": "prebuilt-document",
+        "models": {
+            "completion": "gpt-4.1",
+            "embedding": "text-embedding-3-large"
+        },
         "config": {
             "returnDetails": True,
             # Add the following line as a temp workaround before service issue is fixed.
@@ -121,6 +125,17 @@ def convert_fields_to_analyzer(fields_json_path: Path, analyzer_prefix: Optional
     else:
         analyzer_json_path = fields_json_path.parent / 'analyzer.json'
 
+    # Add knowledgeSources section if container info is provided
+    if target_container_sas_url and target_blob_folder:
+        analyzer_data["knowledgeSources"] = [
+            {
+                "kind": "labeledData",
+                "containerUrl": target_container_sas_url,
+                "prefix": target_blob_folder,
+                "fileListPath": ""
+            }
+        ]
+
     # Ensure target directory exists
     analyzer_json_path.parent.mkdir(parents=True, exist_ok=True)
 
diff --git a/python/di_to_cu_migration_tool/cu_converter_neural.py b/python/di_to_cu_migration_tool/cu_converter_neural.py
index d825f10..64d4d33 100644
--- a/python/di_to_cu_migration_tool/cu_converter_neural.py
+++ b/python/di_to_cu_migration_tool/cu_converter_neural.py
@@ -37,7 +37,7 @@ def convert_bounding_regions_to_source(page_number: int, polygon: list) -> str:
     source = f"D({page_number},{polygon_str})"
     return source
 
-def convert_fields_to_analyzer_neural(fields_json_path: Path, analyzer_prefix: Optional[str], target_dir: Optional[Path], field_definitions: FieldDefinitions) -> Tuple[dict, dict]:
+def convert_fields_to_analyzer_neural(fields_json_path: Path, analyzer_prefix: Optional[str], target_dir: Optional[Path], field_definitions: FieldDefinitions, target_container_sas_url: Optional[str] = None, target_blob_folder: Optional[str] = None) -> Tuple[dict, dict]:
     """
     Convert DI 3.1/4.0GA Custom Neural fields.json to analyzer.json format.
     Args:
@@ -67,7 +67,11 @@ def convert_fields_to_analyzer_neural(fields_json_path: Path, analyzer_prefix: O
     # Build analyzer.json content
     analyzer_data = {
         "analyzerId": analyzer_prefix,
-        "baseAnalyzerId": "prebuilt-documentAnalyzer",
+        "baseAnalyzerId": "prebuilt-document",
+        "models": {
+            "completion": "gpt-4.1",
+            "embedding": "text-embedding-3-large"
+        },
         "config": {
             "returnDetails": True,
             # Add the following line as a temp workaround before service issue is fixed.
@@ -132,6 +136,17 @@ def convert_fields_to_analyzer_neural(fields_json_path: Path, analyzer_prefix: O else: analyzer_json_path = fields_json_path.parent / 'analyzer.json' + # Add knowledgeSources section if container info is provided + if target_container_sas_url and target_blob_folder: + analyzer_data["knowledgeSources"] = [ + { + "kind": "labeledData", + "containerUrl": target_container_sas_url, + "prefix": target_blob_folder, + "fileListPath": "" + } + ] + # Ensure target directory exists analyzer_json_path.parent.mkdir(parents=True, exist_ok=True) diff --git a/python/di_to_cu_migration_tool/di_to_cu_converter.py b/python/di_to_cu_migration_tool/di_to_cu_converter.py index 5de14d9..c84111b 100644 --- a/python/di_to_cu_migration_tool/di_to_cu_converter.py +++ b/python/di_to_cu_migration_tool/di_to_cu_converter.py @@ -8,7 +8,7 @@ import shutil import tempfile import typer -from typing import Tuple +from typing import Optional, Tuple # imports from external packages (in requirements.txt) from rich import print # For colored output @@ -161,7 +161,7 @@ def main( print(f"[yellow]WARNING: The following signatures were removed from the dataset: {removed_signatures}[/yellow]\n") print("Second: Running DI to CU dataset conversion...") - analyzer_data, ocr_files = running_cu_conversion(temp_dir, temp_target_dir, DI_version, analyzer_prefix, removed_signatures) + analyzer_data, ocr_files = running_cu_conversion(temp_dir, temp_target_dir, DI_version, analyzer_prefix, removed_signatures, target_container_sas_url, target_blob_folder) # Run OCR on the pdf files run_cu_layout_ocr(ocr_files, temp_target_dir, subscription_key) @@ -232,15 +232,17 @@ def running_field_type_conversion(temp_source_dir: Path, temp_dir: Path, DI_vers return removed_signatures -def running_cu_conversion(temp_dir: Path, temp_target_dir: Path, DI_version: str, analyzer_prefix: str, removed_signatures: list) -> Tuple[dict, list]: +def running_cu_conversion(temp_dir: Path, temp_target_dir: Path, DI_version: 
str, analyzer_prefix: Optional[str], removed_signatures: list, target_container_sas_url: str, target_blob_folder: str) -> Tuple[dict, list]: """ - Function to run the DI to CU conversion + Function to run the CU conversion Args: temp_dir (Path): The path to the source directory temp_target_dir (Path): The path to the target directory DI_version (str): The version of DI being used analyzer_prefix (str): The prefix for the analyzer name removed_signatures (list): The list of removed signatures that will not be used in the CU converter + target_container_sas_url (str): The target container SAS URL for training data + target_blob_folder (str): The target blob folder prefix for training data """ # Creating a FieldDefinitons object to handle the converison of definitions in the fields.json field_definitions = FieldDefinitions() @@ -251,9 +253,9 @@ def running_cu_conversion(temp_dir: Path, temp_target_dir: Path, DI_version: str assert fields_path.exists(), "fields.json is needed. Fields.json is missing from the given dataset." 
if DI_version == "generative": - analyzer_data = cu_converter_generative.convert_fields_to_analyzer(fields_path, analyzer_prefix, temp_target_dir, field_definitions) + analyzer_data = cu_converter_generative.convert_fields_to_analyzer(fields_path, analyzer_prefix, temp_target_dir, field_definitions, target_container_sas_url, target_blob_folder) elif DI_version == "neural": - analyzer_data, fields_dict = cu_converter_neural.convert_fields_to_analyzer_neural(fields_path, analyzer_prefix, temp_target_dir, field_definitions) + analyzer_data, fields_dict = cu_converter_neural.convert_fields_to_analyzer_neural(fields_path, analyzer_prefix, temp_target_dir, field_definitions, target_container_sas_url, target_blob_folder) ocr_files = [] # List to store paths to pdf files to get OCR results from later for file in files: diff --git a/python/di_to_cu_migration_tool/get_ocr.py b/python/di_to_cu_migration_tool/get_ocr.py index a1b849b..32c0584 100644 --- a/python/di_to_cu_migration_tool/get_ocr.py +++ b/python/di_to_cu_migration_tool/get_ocr.py @@ -70,7 +70,11 @@ def build_analyzer(credential, current_token, host, api_version, subscriptionKey request_body = { "analyzerId": analyzer_id, "description": "Sample analyzer", - "baseAnalyzerId": "prebuilt-documentAnalyzer", + "baseAnalyzerId": "prebuilt-document", + "models": { + "completion": "gpt-4.1", + "embedding": "text-embedding-3-large" + }, "config": { "returnDetails": True, "enableOcr": True, @@ -82,8 +86,7 @@ def build_analyzer(credential, current_token, host, api_version, subscriptionKey "fieldSchema": {}, "warnings": [], "status": "ready", - "processingLocation": "geography", - "mode": "standard" + "processingLocation": "geography" } endpoint = f"{host}/contentunderstanding/analyzers/{analyzer_id}?api-version={api_version}" print("[yellow]Creating sample analyzer to attain CU Layout results...[/yellow]") @@ -138,9 +141,8 @@ def run_cu_layout_ocr(input_files: list, output_dir_string: str, subscription_ke output_dir = 
Path(output_dir_string) output_dir.mkdir(parents=True, exist_ok=True) - # Need to create analyzer with empty schema - analyzer_id = build_analyzer(credential, current_token, host, api_version, subscription_key) - url = f"{host}/contentunderstanding/analyzers/{analyzer_id}:analyze?api-version={api_version}" + # Use prebuilt-read analyzer directly - no need to create a custom analyzer + url = f"{host}/contentunderstanding/analyzers/prebuilt-read:analyze?api-version={api_version}" for file in input_files: try: @@ -150,7 +152,7 @@ def run_cu_layout_ocr(input_files: list, output_dir_string: str, subscription_ke current_token = get_token(credential, current_token) headers = { "Authorization": f"Bearer {current_token.token}", - "Apim-Subscription-id": f"{subscription_key}", + "Ocp-Apim-Subscription-Key": f"{subscription_key}", "Content-Type": "application/pdf", } diff --git a/python/di_to_cu_migration_tool/sample_documents/analyzer_result.json b/python/di_to_cu_migration_tool/sample_documents/analyzer_result.json index bfa151f..f1507dc 100644 --- a/python/di_to_cu_migration_tool/sample_documents/analyzer_result.json +++ b/python/di_to_cu_migration_tool/sample_documents/analyzer_result.json @@ -3,7 +3,7 @@ "status": "Succeeded", "result": { "analyzerId": "mySampleAnalyzer", - "apiVersion": "2025-05-01-preview", + "apiVersion": "2025-11-01", "createdAt": "2025-05-30T15:47:15Z", "warnings": [], "contents": [ From 4669a5d44e61c8fc19469e5b15c85fdbf7d697da Mon Sep 17 00:00:00 2001 From: Joe Filcik Date: Tue, 16 Dec 2025 20:49:20 -0500 Subject: [PATCH 5/8] Remove http test file --- .../cu-ga-NoSecrets.http | 570 ------------------ 1 file changed, 570 deletions(-) delete mode 100644 python/di_to_cu_migration_tool/cu-ga-NoSecrets.http diff --git a/python/di_to_cu_migration_tool/cu-ga-NoSecrets.http b/python/di_to_cu_migration_tool/cu-ga-NoSecrets.http deleted file mode 100644 index f48ec55..0000000 --- a/python/di_to_cu_migration_tool/cu-ga-NoSecrets.http +++ /dev/null @@ -1,570 
+0,0 @@ -# ==================================================================== -# Azure AI Content Understanding - API Testing Guide -# ==================================================================== -# -# This file demonstrates how to use Azure AI Content Understanding APIs -# to analyze documents, images, videos, and audio using prebuilt and -# custom analyzers. -# -# ⚠️ SETUP REQUIRED - BEFORE RUNNING ANY REQUESTS: -# -# 1. Edit the .env file in this same directory (tools/BugBashSample/.env) -# 2. Set your API_KEY and ENDPOINT_URL values: -# API_KEY="your-subscription-key-here" -# ENDPOINT_URL="https://your-resource.services.ai.azure.com" -# 3. Save the .env file -# 4. Run any request using the "Send Request" link above each HTTP request -# -# Variables will be automatically loaded from .env - do not edit this file. -# -# DOCUMENTATION: -# - Models & Deployments: https://review.learn.microsoft.com/en-us/azure/ai-services/content-understanding/concepts/models-deployments?branch=main -# - Migration Guide (Preview to GA): https://review.learn.microsoft.com/en-us/azure/ai-services/content-understanding/how-to/migration-preview-to-ga -# - Analyzer Reference: https://learn.microsoft.com/en-us/azure/ai-services/content-understanding/concepts/analyzer-reference -# -# ==================================================================== - -# Variables - Automatically loaded from .env file in the same directory -@subscriptionKey = {{$dotenv API_KEY}} -@endpoint = {{$dotenv ENDPOINT_URL}} -@apiVersion = 2025-11-01 - -# ==================================================================== -# SECTION 1: CONFIGURE DEFAULT MODEL DEPLOYMENTS (BYOC) -# ==================================================================== -# -# Configure default model deployments for your resource. This allows you to -# use custom Azure OpenAI deployments (Bring Your Own Compute) instead of -# the default shared models. 
-# -# Learn more: https://review.learn.microsoft.com/en-us/azure/ai-services/content-understanding/concepts/models-deployments?branch=main -# - -### Set default model deployments for the resource -# This configures which model deployments to use by default for completion and embedding tasks -# Alternatively, if you setup a resource using https://aka.ms/cu-studio then you can skip this step -PATCH {{endpoint}}/contentunderstanding/defaults?api-version={{apiVersion}} -Ocp-Apim-Subscription-Key: {{subscriptionKey}} -Content-Type: application/json - -{ - "modelDeployments": { - "gpt-4.1": "gpt-4.1-datazone", - "gpt-4.1-mini": "gpt-4.1-mini", - "text-embedding-ada-002": "text-embedding-ada-002" - } -} - -### Get current default settings -GET {{endpoint}}/contentunderstanding/defaults?api-version={{apiVersion}} -Ocp-Apim-Subscription-Key: {{subscriptionKey}} - -# ==================================================================== -# SECTION 2: LIST AND EXPLORE ANALYZERS -# ==================================================================== -# -# Analyzers are the core components that process your content. Azure provides -# prebuilt analyzers for common scenarios and you can create custom analyzers. 
-# -# Prebuilt analyzers include: -# - prebuilt-document: Extract text, tables, and structure from documents -# - prebuilt-invoice: Extract invoice-specific fields -# - prebuilt-video: Analyze video content with scene detection and transcription -# - prebuilt-audio: Transcribe and analyze audio content -# - prebuilt-imageSearch: Analyze and search within images -# -# Learn more: https://learn.microsoft.com/en-us/azure/ai-services/content-understanding/concepts/analyzer-reference -# - -### List all available analyzers -GET {{endpoint}}/contentunderstanding/analyzers?api-version={{apiVersion}} -Content-Type: application/json -Ocp-Apim-Subscription-Key: {{subscriptionKey}} - -### Get details of a specific analyzer -GET {{endpoint}}/contentunderstanding/analyzers/prebuilt-invoice?api-version={{apiVersion}} -Ocp-Apim-Subscription-Key: {{subscriptionKey}} - -# ==================================================================== -# SECTION 3: TEST PREBUILT ANALYZERS -# ==================================================================== -# -# Test the prebuilt analyzers with sample documents to understand their -# capabilities before creating custom analyzers. 
-# - -### 3.1 Analyze a document with prebuilt-document analyzer -# This analyzer extracts text, tables, key-value pairs, and document structure -# @name documentAnalysis -POST {{endpoint}}/contentunderstanding/analyzers/prebuilt-document:analyze?api-version={{apiVersion}} -Content-Type: application/json -Ocp-Apim-Subscription-Key: {{subscriptionKey}} - -{ - "inputs": [ - { - "url": "https://github.com/Azure-Samples/azure-ai-content-understanding-python/raw/refs/heads/main/data/invoice.pdf" - } - ], - "modelDeployments": { - "gpt-4.1": "gpt-4.1-datazone" - } -} - -### Get document analysis results (uses operation ID from previous request) -GET {{endpoint}}/contentunderstanding/analyzerResults/{{documentAnalysis.response.body.id}}?api-version={{apiVersion}} -Ocp-Apim-Subscription-Key: {{subscriptionKey}} - -### 3.2 Analyze an invoice with prebuilt-invoice analyzer -# Specialized analyzer for invoices that extracts vendor, items, totals, etc. -# @name invoiceAnalysis -POST {{endpoint}}/contentunderstanding/analyzers/prebuilt-invoice:analyze?api-version={{apiVersion}} -Content-Type: application/json -Ocp-Apim-Subscription-Key: {{subscriptionKey}} - -{ - "inputs": [ - { - "url": "https://github.com/Azure-Samples/azure-ai-content-understanding-python/raw/refs/heads/main/data/invoice.pdf" - } - ] -} - -### Get invoice analysis results (uses operation ID from previous request) -GET {{endpoint}}/contentunderstanding/analyzerResults/{{invoiceAnalysis.response.body.id}}?api-version={{apiVersion}} -Ocp-Apim-Subscription-Key: {{subscriptionKey}} - -### 3.3 Analyze an image with prebuilt-imageSearch analyzer -# Extract visual content, text, and enable semantic search within images -# @name imageAnalysis -POST {{endpoint}}/contentunderstanding/analyzers/prebuilt-imageSearch:analyze?api-version={{apiVersion}} -Content-Type: application/json -Ocp-Apim-Subscription-Key: {{subscriptionKey}} - -{ - "inputs": [ - { - "url": 
"https://github.com/Azure-Samples/azure-ai-content-understanding-python/raw/refs/heads/main/data/pieChart.jpg" - } - ], - "modelDeployments": { - "gpt-4.1": "gpt-4.1-datazone", - "text-embedding-3-large": "text-embedding-3-large" - } -} - -### Get image analysis results (uses operation ID from previous request) -GET {{endpoint}}/contentunderstanding/analyzerResults/{{imageAnalysis.response.body.id}}?api-version={{apiVersion}} -Ocp-Apim-Subscription-Key: {{subscriptionKey}} - -# ==================================================================== -# SECTION 4: CREATE CUSTOM DOCUMENT ANALYZER -# ==================================================================== -# -# Create custom analyzers to extract specific fields from your documents. -# Custom analyzers build on top of prebuilt analyzers and add field schemas -# to define what information to extract. -# -# Learn more: https://review.learn.microsoft.com/en-us/azure/ai-services/content-understanding/how-to/migration-preview-to-ga -# - -### 4.1 Create a custom insurance claim form analyzer -# This analyzer extracts specific fields from insurance claim forms -PUT {{endpoint}}/contentunderstanding/analyzers/claimForm?api-version={{apiVersion}} -Ocp-Apim-Subscription-Key: {{subscriptionKey}} -Content-Type: application/json - -{ - "baseAnalyzerId": "prebuilt-document", - "analyzerId": "claimForm", - "models": { - "completion": "gpt-4.1", - "embedding": "text-embedding-ada-002" - }, - "fieldSchema": { - "fields": { - "PolicyNumber": { - "type": "string", - "method": "extract", - "description": "The insurance policy number associated with this claim." - }, - "ClaimNumber": { - "type": "string", - "method": "extract", - "description": "The unique claim number assigned to this claim." - }, - "TotalClaimAmount": { - "type": "number", - "method": "extract", - "description": "The total amount being claimed." 
- }, - "AccidentDate": { - "type": "string", - "method": "extract", - "description": "The date when the accident occurred." - }, - "LossType": { - "type": "string", - "method": "classify", - "description": "The type of loss (e.g., collision, theft, fire).", - "enum": ["collision", "theft", "fire", "natural disaster", "vandalism"] - } - }, - "definitions": {} - }, - "omitContent": true -} - -### Get the custom analyzer details -GET {{endpoint}}/contentunderstanding/analyzers/claimForm?api-version={{apiVersion}} -Ocp-Apim-Subscription-Key: {{subscriptionKey}} - -### 4.2 Create a custom invoice analyzer -# Example of a custom invoice analyzer with specific field extraction -PUT {{endpoint}}/contentunderstanding/analyzers/invoice_custom?api-version={{apiVersion}} -Ocp-Apim-Subscription-Key: {{subscriptionKey}} -Content-Type: application/json - -{ - "analyzerId": "invoice_custom", - "baseAnalyzerId": "prebuilt-document", - "description": "Custom invoice analyzer with specific fields", - "scenario": "document", - "models": { - "completion": "gpt-4.1" - }, - "fieldSchema": { - "fields": { - "VendorName": { - "type": "string", - "method": "extract", - "description": "Vendor issuing the invoice" - }, - "Items": { - "type": "array", - "method": "extract", - "items": { - "type": "object", - "properties": { - "Description": { - "type": "string", - "method": "extract", - "description": "Description of the item" - }, - "Amount": { - "type": "number", - "method": "extract", - "description": "Amount of the item" - } - } - } - } - } - } -} - -### Test the custom invoice analyzer -# @name customInvoiceAnalysis -POST {{endpoint}}/contentunderstanding/analyzers/invoice_custom:analyze?api-version={{apiVersion}} -Content-Type: application/json -Ocp-Apim-Subscription-Key: {{subscriptionKey}} - -{ - "inputs": [ - { - "url": "https://documentintelligence.ai.azure.com/documents/samples/read/read-healthcare.png" - } - ], - "modelDeployments": { - "gpt-4.1": "gpt-4.1-datazone" - } -} - -### 
Get custom invoice results (uses operation ID from previous request) -GET {{endpoint}}/contentunderstanding/analyzerResults/{{customInvoiceAnalysis.response.body.id}}?api-version={{apiVersion}} -Ocp-Apim-Subscription-Key: {{subscriptionKey}} - -### 4.3 Create a complex multi-document analyzer with content categories -# This analyzer can classify and route different document types to specialized analyzers -PUT {{endpoint}}/contentunderstanding/analyzers/insuranceClaim?api-version={{apiVersion}} -Ocp-Apim-Subscription-Key: {{subscriptionKey}} -Content-Type: application/json - -{ - "baseAnalyzerId": "prebuilt-document", - "analyzerId": "insuranceClaim", - "models": { - "completion": "gpt-4.1", - "embedding": "text-embedding-ada-002" - }, - "config": { - // Enable splitting of the input into segments for multi-document files - "enableSegment": true, - "contentCategories": { - "claimForm": { - "description": "The claim form for Zava Insurance", - "analyzerId": "claimForm" - }, - "estimate": { - "description": "The body shop estimate or contractor estimate to fix the property damage.", - "analyzerId": "prebuilt-invoice" - }, - "medicalReport": { - "description": "A doctors assessment or medical report related to injury suffered.", - "analyzerId": "prebuilt-document" - }, - "policeReport": { - "description": "A police or law enforcement report detailing the events that lead to the loss." - } - }, - "omitContent": true - } -} - -### Delete an analyzer (if needed) -DELETE {{endpoint}}/contentunderstanding/analyzers/insuranceClaim?api-version={{apiVersion}} -Ocp-Apim-Subscription-Key: {{subscriptionKey}} - -# ==================================================================== -# SECTION 5: VIDEO ANALYZER EXAMPLES -# ==================================================================== -# -# Video analyzers can transcribe, detect scenes, extract entities, and -# perform custom analysis on video content. 
-# -# Learn more: https://learn.microsoft.com/en-us/azure/ai-services/content-understanding/concepts/analyzer-reference -# - -### 5.1 Test prebuilt video analyzer -# @name videoAnalysis -POST {{endpoint}}/contentunderstanding/analyzers/prebuilt-video:analyze?api-version={{apiVersion}} -Content-Type: application/json -Ocp-Apim-Subscription-Key: {{subscriptionKey}} - -{ - "inputs": [ - { - "url": "https://github.com/Azure-Samples/azure-ai-content-understanding-python/raw/refs/heads/main/data/FlightSimulator.mp4" - } - ], - "modelDeployments": { - "gpt-4.1": "gpt-4.1-datazone", - "text-embedding-3-large": "text-embedding-3-large" - } -} - -### Get video analysis results (uses operation ID from previous request) -GET {{endpoint}}/contentunderstanding/analyzerResults/{{videoAnalysis.response.body.id}}?api-version={{apiVersion}} -Ocp-Apim-Subscription-Key: {{subscriptionKey}} - -### 5.2 Create custom video analyzer with dynamic chaptering -# This analyzer segments videos into chapters/stories with scene detection -PUT {{endpoint}}/contentunderstanding/analyzers/video_chaptering?api-version={{apiVersion}} -Ocp-Apim-Subscription-Key: {{subscriptionKey}} -Content-Type: application/json - -{ - "description": "Dynamic video chaptering with scene detection", - "scenario": "videoShot", - "baseAnalyzerId": "prebuilt-video", - "models": { - "completion": "gpt-4.1" - }, - "config": { - "returnDetails": true, - "enableSegmentation": true, - "segmentationMode": "custom", - "segmentationDefinition": "Segment the video into stories or chapters. A story (chapter) in a video is a self-contained portion of the program dedicated to a specific news story, topic, or theme. 
Each segment typically includes a distinct introduction, development, and (sometimes) a conclusion, and can feature a combination of elements such as reporter narration, interviews, sound bites, relevant footage (B-roll), and graphics.", - "locales": ["en-US"] - }, - "fieldSchema": { - "name": "Content Understanding - Dynamic Chaptering", - "fields": { - "Segments": { - "type": "array", - "items": { - "type": "object", - "properties": { - "SegmentId": { - "type": "string" - }, - "SegmentType": { - "type": "string", - "method": "generate", - "description": "The short title or a short summary of the story or chapter." - }, - "Scenes": { - "type": "array", - "items": { - "type": "object", - "properties": { - "Description": { - "type": "string", - "method": "generate", - "description": "A five-word description of the scene. A scene is a smaller segment of the segment where a continous block for storytelling unfolds within a specific time, place, and set of characters. A scene can only belong to a single chapter, and cannot overlap with other scenes. Scenes are sequential across the video." 
- }, - "StartTimestamp": { - "type": "string", - "description": "the start timestamp of the scene" - }, - "EndTimestamp": { - "type": "string", - "description": "the end timestamp of the scene" - } - } - } - } - } - } - } - } - } -} - -### Get the video analyzer details -GET {{endpoint}}/contentunderstanding/analyzers/video_chaptering?api-version={{apiVersion}} -Ocp-Apim-Subscription-Key: {{subscriptionKey}} - -### Test the custom video chaptering analyzer -# @name videoChapteringAnalysis -POST {{endpoint}}/contentunderstanding/analyzers/video_chaptering:analyze?api-version={{apiVersion}} -Content-Type: application/json -Ocp-Apim-Subscription-Key: {{subscriptionKey}} - -{ - "inputs": [ - { - "url": "https://github.com/Azure-Samples/azure-ai-content-understanding-python/blob/402ec1bf337d54b438581c69dbfb784da74ded38/data/video.mp4" - } - ] -} - -### Get video chaptering results (uses operation ID from previous request) -GET {{endpoint}}/contentunderstanding/analyzerResults/{{videoChapteringAnalysis.response.body.id}}?api-version={{apiVersion}} -Ocp-Apim-Subscription-Key: {{subscriptionKey}} - -# ==================================================================== -# SECTION 6: AUDIO ANALYZER EXAMPLES -# ==================================================================== -# -# Audio analyzers can transcribe speech, detect speakers, and extract -# insights from audio content. 
-# - -### Test prebuilt audio analyzer (coming soon - placeholder) -# POST {{endpoint}}/contentunderstanding/analyzers/prebuilt-audio:analyze?api-version={{apiVersion}} -# Content-Type: application/json -# Ocp-Apim-Subscription-Key: {{subscriptionKey}} -# -# { -# "inputs": [ -# { -# "url": "https://example.com/audio-sample.mp3" -# } -# ] -# } - -# ==================================================================== -# SECTION 7: ANALYZER MANAGEMENT OPERATIONS -# ==================================================================== -# -# Copy analyzers between resources or regions for deployment purposes. -# - -### Copy analyzer within same resource -POST {{endpoint}}/contentunderstanding/analyzers/insuranceClaimv3:copy?api-version={{apiVersion}} -Content-Type: application/json -Ocp-Apim-Subscription-Key: {{subscriptionKey}} - -{ - "sourceAnalyzerId": "insuranceClaim" -} - -### Grant copy authorization for cross-resource copy -POST {{endpoint}}/contentunderstanding/analyzers/insuranceClaim:grantCopyAuthorization?api-version={{apiVersion}} -Content-Type: application/json -Ocp-Apim-Subscription-Key: {{subscriptionKey}} - -{ - "targetAzureResourceId": "/subscriptions/3b393ccb-47e3-4dea-9004-0c1085b5aba6/resourceGroups/mmi-sea-eft/providers/Microsoft.CognitiveServices/accounts/foundry-sea-eft", - "targetRegion": "southeastasia" -} - -### Copy analyzer to different resource (use authorization from above) -# POST https://foundry-sea-eft.cognitiveservices.azure.com/contentunderstanding/analyzers/insuranceClaimCopy:copy?api-version={{apiVersion}} -# Content-Type: application/json -# Ocp-Apim-Subscription-Key: {{targetSubscriptionKey}} -# -# { -# "targetAzureResourceId": "/subscriptions/3b393ccb-47e3-4dea-9004-0c1085b5aba6/resourceGroups/mmi-sea-eft/providers/Microsoft.CognitiveServices/accounts/foundry-sea-eft", -# "targetRegion": "southeastasia", -# "expiresAt": "2025-11-01T19:18:11.095328+00:00" -# } - -# ==================================================================== 
-# END OF GUIDE -# ==================================================================== - - -# Test Create a DataSnipper Repro - - - -### 4.1 Create a custom insurance claim form analyzer -# This analyzer extracts specific fields from insurance claim forms -PUT {{endpoint}}/contentunderstanding/analyzers/dsTest?api-version={{apiVersion}} -Ocp-Apim-Subscription-Key: {{subscriptionKey}} -Content-Type: application/json - -{ - "baseAnalyzerId": "prebuilt-document", - "analyzerId": "dsTest", - "models": { - "completion": "gpt-4o", - "embedding": "text-embedding-ada-002" - }, - "fieldSchema": { - "fields": { - "employees": { - "type": "array", - "items": { - "type": "object", - "properties": { - "employee_name": { - "type": "string", - "method": "extract", - "description": "The name of the employee", - "estimateSourceAndConfidence": true - } - }, - "description": "The employee of the company" - }, - "description": "The employees of the company" - } - } - } -} - -### Get the custom analyzer details -GET {{endpoint}}/contentunderstanding/analyzers/dsTest?api-version={{apiVersion}} -Ocp-Apim-Subscription-Key: {{subscriptionKey}} - - - -### Test dsTest analyzer with document -# @name dsTestAnalysis -POST {{endpoint}}/contentunderstanding/analyzers/dsTest:analyze?api-version={{apiVersion}} -Content-Type: application/json -Ocp-Apim-Subscription-Key: {{subscriptionKey}} - -{ - "inputs": [ - { - "url": "https://mmiusw3bbstore.blob.core.windows.net/bugbash-20251020/jfilcik/ADP%20Journals.pdf?sv=2025-07-05&spr=https&st=2025-12-15T17%3A06%3A24Z&se=2025-12-16T17%3A06%3A24Z&skoid=e48ae032-21b6-418b-963c-3129b6a130d3&sktid=72f988bf-86f1-41af-91ab-2d7cd011db47&skt=2025-12-15T17%3A06%3A24Z&ske=2025-12-16T17%3A06%3A24Z&sks=b&skv=2025-07-05&sr=b&sp=r&sig=fX9biRmkc2IPA8BGUrgyZZ651BnB0WPiU54YIyplE2Q%3D" - } - ] -} - -### Get dsTest analysis results (uses operation ID from previous request) -GET 
{{endpoint}}/contentunderstanding/analyzerResults/{{dsTestAnalysis.response.body.id}}?api-version={{apiVersion}} -Ocp-Apim-Subscription-Key: {{subscriptionKey}} \ No newline at end of file From e2f5b16255cedbba6b7e5404417e76be220aabde Mon Sep 17 00:00:00 2001 From: Joe Filcik Date: Tue, 16 Dec 2025 20:57:13 -0500 Subject: [PATCH 6/8] Update contstants and readme --- python/di_to_cu_migration_tool/README.md | 27 ++++++++++++++++----- python/di_to_cu_migration_tool/constants.py | 2 +- 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/python/di_to_cu_migration_tool/README.md b/python/di_to_cu_migration_tool/README.md index d5548c8..737a4ba 100644 --- a/python/di_to_cu_migration_tool/README.md +++ b/python/di_to_cu_migration_tool/README.md @@ -27,8 +27,26 @@ Here is a detailed breakdown of the three CLI tools and their functionality: * **call_analyze.py** * This CLI tool verifies that the migration completed successfully and assesses the quality of the created analyzer. + ## Setup +## Prerequisites + +⚠️ **IMPORTANT: Before using this migration tool**, ensure your Azure AI Foundry resource is properly configured for Content Understanding: + +1. **Configure Default Model Deployments**: You must set default model deployments in your Content Understanding in your Foundry Resource before creating or running analyzers. + + To do this walk through the prerequisites here: + - [REST API Quickstart Guide](https://learn.microsoft.com/en-us/azure/ai-services/content-understanding/quickstart/use-rest-api?tabs=portal%2Cdocument) + + For more details about defaults checkout this documentation: + - [Models and Deployments Documentation](https://learn.microsoft.com/en-us/azure/ai-services/content-understanding/concepts/models-deployments) + +2. **Verify you can create and use a basic Content Understanding analyzer** in your Azure AI Foundry resource before attempting migration. This ensures all prerequisites are met. + +3. 
Complete all setup steps outlined in the REST API documentation above, including authentication and model deployment configuration. + +### Tool Setup Please follow these steps to set up the tool: 1. Install dependencies by running: @@ -102,9 +120,6 @@ If migrating a _DI 3.1/4.0 GA Custom Extraction_ dataset, please run: python ./di_to_cu_converter.py --DI-version neural --analyzer-prefix mySampleAnalyzer \ --source-container-sas-url "https://sourceStorageAccount.blob.core.windows.net/sourceContainer?sourceSASToken" --source-blob-folder diDatasetFolderName \ --target-container-sas-url "https://targetStorageAccount.blob.core.windows.net/targetContainer?targetSASToken" --target-blob-folder cuDatasetFolderName - -python ./di_to_cu_converter.py --DI-version neural --analyzer-prefix mySampleAnalyzer --source-container-sas-url "https://jfilcikditestdata.blob.core.windows.net/didata?sv=2025-07-05&spr=https&st=2025-12-16T22%3A17%3A06Z&se=2025-12-17T22%3A17%3A06Z&sr=c&sp=rl&sig=nvUIelZQ9yWEJx3jA%2FjUOIdHn6OVnp5gvKSJ3zgzwvE%3D" --source-blob-folder diDatasetFolderName \ ---target-container-sas-url "https://jfilcikditestdata.blob.core.windows.net/cudata?sv=2025-07-05&spr=https&st=2025-12-16T22%3A19%3A39Z&se=2025-12-17T22%3A19%3A39Z&sr=c&sp=racwl&sig=K82dxEFNpYhuf5JRq3xJ4vc5SYE8A7FfsBnTJbB1VJY%3D" --target-blob-folder cuDatasetFolderName ``` For this migration, specifying an analyzer prefix is crucial for creating a CU analyzer. Since the fields.json does not define a "doc_type" for identification, the created analyzer ID will be the specified analyzer prefix. 
@@ -127,9 +142,9 @@ After converting the CU analyzer.json, please run: ``` python ./create_analyzer.py \ ---analyzer-sas-url "https://jfilcikditestdata.blob.core.windows.net/cudata?sv=2025-07-05&spr=https&st=2025-12-16T22%3A19%3A39Z&se=2025-12-17T22%3A19%3A39Z&sr=c&sp=racwl&sig=K82dxEFNpYhuf5JRq3xJ4vc5SYE8A7FfsBnTJbB1VJY%3D" \ ---target-container-sas-url "https://jfilcikditestdata.blob.core.windows.net/cudata?sv=2025-07-05&spr=https&st=2025-12-16T22%3A19%3A39Z&se=2025-12-17T22%3A19%3A39Z&sr=c&sp=racwl&sig=K82dxEFNpYhuf5JRq3xJ4vc5SYE8A7FfsBnTJbB1VJY%3D" \ ---target-blob-folder "di_convert" +--analyzer-sas-url "https://targetStorageAccount.blob.core.windows.net/targetContainer/cuDatasetFolderName/analyzer.json?targetSASToken" \ +--target-container-sas-url "https://targetStorageAccount.blob.core.windows.net/targetContainer?targetSASToken" \ +--target-blob-folder cuDatasetFolderName ``` The `analyzer.json` file is located in the specified target blob container and folder. Please obtain the SAS URL for `analyzer.json` from there. diff --git a/python/di_to_cu_migration_tool/constants.py b/python/di_to_cu_migration_tool/constants.py index 09dc972..73f9e0c 100644 --- a/python/di_to_cu_migration_tool/constants.py +++ b/python/di_to_cu_migration_tool/constants.py @@ -1,6 +1,6 @@ # Supported DI versions DI_VERSIONS = ["generative", "neural"] -CU_API_VERSION = "2025-05-01-preview" +CU_API_VERSION = "2025-11-01" # constants MAX_FIELD_COUNT = 100 From 806b737e976f21ab645ec4813465c7dfb36c783a Mon Sep 17 00:00:00 2001 From: Joe Filcik Date: Wed, 21 Jan 2026 17:41:54 -0500 Subject: [PATCH 7/8] Add legal transcript example and reflow notebook for inline line numbers. - Created a Jupyter notebook to demonstrate processing legal documents with Azure Content Understanding. - Implemented functionality to reflow output to include inline line numbers for better citation and reference. 
- Included detailed markdown explanations and code for loading, analyzing, and reflowing legal transcripts. --- .../reflow_markdown_with_line_numbers.py | 332 ++++++++ data/legal_examples/Trascript Example.pdf | Bin 0 -> 132 bytes notebooks/legal_transcript_reflow.ipynb | 746 ++++++++++++++++++ 3 files changed, 1078 insertions(+) create mode 100644 Customers/Eve Legal/reflow_markdown_with_line_numbers.py create mode 100644 data/legal_examples/Trascript Example.pdf create mode 100644 notebooks/legal_transcript_reflow.ipynb diff --git a/Customers/Eve Legal/reflow_markdown_with_line_numbers.py b/Customers/Eve Legal/reflow_markdown_with_line_numbers.py new file mode 100644 index 0000000..8f0cdb2 --- /dev/null +++ b/Customers/Eve Legal/reflow_markdown_with_line_numbers.py @@ -0,0 +1,332 @@ +""" +Reflow Content Understanding JSON output to include line numbers inline with text. + +This script reads the JSON output from Azure Content Understanding and generates +a new markdown output where line numbers (commonly found in legal documents, +depositions, and transcripts) are included inline with the corresponding text. + +Content Understanding's default behavior groups line numbers (which appear on the +left margin of pages) separately from the main text content. This script uses +the bounding box coordinates from the 'source' field to determine vertical position +and match line numbers with their corresponding text lines. 
+ +Usage: + python reflow_markdown_with_line_numbers.py [--output ] [--page ] + +Example: + python reflow_markdown_with_line_numbers.py test_output/document.json --page 1 --output reflowed.md +""" + +import argparse +import json +import re +from dataclasses import dataclass +from pathlib import Path +from typing import Optional + + +@dataclass +class LineElement: + """Represents a line or element from the document with its position.""" + content: str + y_position: float # Top Y coordinate + x_position: float # Left X coordinate + page_number: int + is_line_number: bool = False + + def __repr__(self): + return f"LineElement(content='{self.content[:30]}...', y={self.y_position:.2f}, x={self.x_position:.2f}, is_num={self.is_line_number})" + + +def parse_source_coordinates(source: str) -> tuple[int, float, float, float, float]: + """ + Parse the source coordinate string from Content Understanding. + + The source format is: D(pageNumber,x1,y1,x2,y2,x3,y3,x4,y4) + where the points represent a quadrilateral (upper-left, upper-right, lower-right, lower-left) + + Args: + source: The source string from Content Understanding JSON + + Returns: + Tuple of (page_number, left_x, top_y, right_x, bottom_y) + """ + # Match the D(...) 
pattern and extract values + match = re.match(r'D\((\d+),([^)]+)\)', source) + if not match: + raise ValueError(f"Invalid source format: {source}") + + page_number = int(match.group(1)) + coords = [float(x) for x in match.group(2).split(',')] + + if len(coords) == 8: + # Bounding polygon format: x1,y1,x2,y2,x3,y3,x4,y4 + # Points are: upper-left, upper-right, lower-right, lower-left + x1, y1, x2, y2, x3, y3, x4, y4 = coords + left_x = min(x1, x4) + top_y = min(y1, y2) + right_x = max(x2, x3) + bottom_y = max(y3, y4) + elif len(coords) == 4: + # Axis-aligned bounding box format: left, top, width, height + left_x, top_y, width, height = coords + right_x = left_x + width + bottom_y = top_y + height + else: + raise ValueError(f"Unexpected coordinate count in source: {source}") + + return page_number, left_x, top_y, right_x, bottom_y + + +def is_line_number(content: str) -> bool: + """Check if the content appears to be a line number (1-99).""" + return content.strip().isdigit() and 1 <= int(content.strip()) <= 99 + + +def is_noise_element(content: str) -> bool: + """Check if content is noise (bullets, single dots) that should be filtered.""" + content = content.strip() + # Filter out single bullets/dots that CU uses for structure + return content in ['·', '•', '∙'] or (len(content) == 1 and not content.isalnum()) + + +def extract_lines_from_page(page_data: dict) -> list[LineElement]: + """ + Extract all lines from a page and categorize them. 
+ + Args: + page_data: The page object from Content Understanding JSON + + Returns: + List of LineElement objects with position information + """ + elements = [] + page_number = page_data.get('pageNumber', 1) + + # Process lines array which contains the text content + for line in page_data.get('lines', []): + content = line.get('content', '').strip() + source = line.get('source', '') + + if not source or not content: + continue + + try: + parsed_page, left_x, top_y, right_x, bottom_y = parse_source_coordinates(source) + + # Skip noise elements (bullets, etc.) + if is_noise_element(content): + continue + + element = LineElement( + content=content, + y_position=top_y, + x_position=left_x, + page_number=parsed_page, + is_line_number=is_line_number(content) + ) + elements.append(element) + + except ValueError as e: + print(f"Warning: Could not parse source for line '{content[:30]}...': {e}") + continue + + return elements + + +def group_lines_by_vertical_position(elements: list[LineElement], + y_tolerance: float = 0.15) -> list[list[LineElement]]: + """ + Group elements that appear on the same horizontal line (same Y position). 
+ + Args: + elements: List of LineElement objects + y_tolerance: Tolerance for considering elements on the same line (in inches for PDFs) + + Returns: + List of groups, where each group contains elements on the same line + """ + if not elements: + return [] + + # Sort by Y position (top to bottom) + sorted_elements = sorted(elements, key=lambda e: e.y_position) + + groups = [] + current_group = [sorted_elements[0]] + current_y = sorted_elements[0].y_position + + for element in sorted_elements[1:]: + if abs(element.y_position - current_y) <= y_tolerance: + # Same line + current_group.append(element) + else: + # New line + groups.append(current_group) + current_group = [element] + current_y = element.y_position + + # Don't forget the last group + if current_group: + groups.append(current_group) + + return groups + + +def reflow_page_with_line_numbers(page_data: dict, + separator: str = " | ") -> str: + """ + Reflow a single page's content to include line numbers inline. + + Args: + page_data: The page object from Content Understanding JSON + separator: String to separate line number from content + + Returns: + Reflowed markdown string for this page + """ + elements = extract_lines_from_page(page_data) + + if not elements: + return "" + + # Group by vertical position + line_groups = group_lines_by_vertical_position(elements) + + output_lines = [] + + for group in line_groups: + # Sort elements within the group by X position (left to right) + group.sort(key=lambda e: e.x_position) + + # Separate line numbers from content + line_numbers = [e for e in group if e.is_line_number] + content_elements = [e for e in group if not e.is_line_number] + + if not content_elements: + # Skip lines with only line numbers (shouldn't happen but safety check) + continue + + # Combine content elements + combined_content = ' '.join(e.content for e in content_elements) + + # Prepend line number if found + if line_numbers: + # Use the first (leftmost) line number + line_num = 
line_numbers[0].content + output_lines.append(f"{line_num}{separator}{combined_content}") + else: + # No line number for this line (e.g., headers, footers) + output_lines.append(combined_content) + + return '\n'.join(output_lines) + + +def reflow_document(json_data: dict, + target_page: Optional[int] = None, + separator: str = " | ") -> str: + """ + Reflow an entire document or specific page with line numbers inline. + + Args: + json_data: The full JSON response from Content Understanding + target_page: If specified, only process this page number (1-indexed) + separator: String to separate line number from content + + Returns: + Reflowed markdown string + """ + contents = json_data.get('result', {}).get('contents', []) + + if not contents: + raise ValueError("No contents found in JSON data") + + # Get the first content (document) + content = contents[0] + + if content.get('kind') != 'document': + print(f"Warning: Content kind is '{content.get('kind')}', expected 'document'") + + pages = content.get('pages', []) + + if not pages: + raise ValueError("No pages found in document content") + + output_parts = [] + + for page in pages: + page_number = page.get('pageNumber', 0) + + if target_page is not None and page_number != target_page: + continue + + page_output = reflow_page_with_line_numbers(page, separator) + + if page_output: + if target_page is None: + output_parts.append(f"\n") + output_parts.append(page_output) + output_parts.append("") # Blank line between pages + + return '\n'.join(output_parts) + + +def main(): + parser = argparse.ArgumentParser( + description='Reflow Content Understanding JSON to include line numbers inline with text.', + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + Process all pages: + python reflow_markdown_with_line_numbers.py document.json + + Process specific page: + python reflow_markdown_with_line_numbers.py document.json --page 30 + + Custom output file and separator: + python 
reflow_markdown_with_line_numbers.py document.json --output reflowed.md --separator " | " +""" + ) + + parser.add_argument('input_json', type=str, + help='Path to the Content Understanding JSON output file') + parser.add_argument('--output', '-o', type=str, default=None, + help='Output file path (default: print to stdout)') + parser.add_argument('--page', '-p', type=int, default=None, + help='Process only this page number (1-indexed)') + parser.add_argument('--separator', '-s', type=str, default=' | ', + help='Separator between line number and content (default: " | ")') + + args = parser.parse_args() + + # Read input JSON + input_path = Path(args.input_json) + if not input_path.exists(): + print(f"Error: Input file not found: {input_path}") + return 1 + + with open(input_path, 'r', encoding='utf-8') as f: + json_data = json.load(f) + + # Process document + try: + result = reflow_document(json_data, args.page, args.separator) + except ValueError as e: + print(f"Error: {e}") + return 1 + + # Output result + if args.output: + output_path = Path(args.output) + output_path.parent.mkdir(parents=True, exist_ok=True) + with open(output_path, 'w', encoding='utf-8') as f: + f.write(result) + print(f"Output written to: {output_path}") + else: + print(result) + + return 0 + + +if __name__ == '__main__': + exit(main()) diff --git a/data/legal_examples/Trascript Example.pdf b/data/legal_examples/Trascript Example.pdf new file mode 100644 index 0000000000000000000000000000000000000000..6036de6bf9a9e667955504b394c621fb721a86fd GIT binary patch literal 132 zcmWN?OA^8$3;@u5Pr(H&gc3fzO_P8yqtX%V!qe;9yeq#)>n~O3Jjbr}z0KRBjP-y0 z(8~R($C+7Qu=FNs)Wf3R5o}>c20|j_Q~+a4R_ta1TRB|G;MkA=B5lAVF*8v1k`kB{ OX&LRC6?pHB3(F5N$0zpy literal 0 HcmV?d00001 diff --git a/notebooks/legal_transcript_reflow.ipynb b/notebooks/legal_transcript_reflow.ipynb new file mode 100644 index 0000000..3eedb9c --- /dev/null +++ b/notebooks/legal_transcript_reflow.ipynb @@ -0,0 +1,746 @@ +{ + "cells": [ + { + "cell_type": 
"markdown", + "id": "84ae1183", + "metadata": {}, + "source": [ + "# Legal Transcript Line Number Reflow\n", + "\n", + "This notebook demonstrates how to process legal documents (depositions, court transcripts, trial records) with Azure Content Understanding and reflow the output to include inline line numbers.\n", + "\n", + "## The Challenge\n", + "\n", + "Legal transcripts have a standardized format with **line numbers in the left margin** (typically 1-25 per page). These line numbers are critical for:\n", + "- Citing specific testimony in legal briefs\n", + "- Cross-referencing during depositions and trials\n", + "- Creating accurate legal summaries\n", + "\n", + "By default, Content Understanding's markdown output groups these margin line numbers separately from the main text content. This notebook shows how to **reflow the output** to include line numbers inline with each text line.\n", + "\n", + "## Workflow\n", + "1. **Load PDF** - Read the local legal transcript file\n", + "2. **Content Extraction** - Use Azure Content Understanding to extract text with position data\n", + "3. **Reflow** - Match line numbers with text using bounding box coordinates\n", + "4. **Output** - Generate markdown with inline line numbers (e.g., `1 | witness testimony...`)" + ] + }, + { + "cell_type": "markdown", + "id": "9fae8a24", + "metadata": {}, + "source": [ + "## Prerequisites\n", + "1. Ensure your Azure AI service is configured by following the [configuration steps](../README.md#configure-azure-ai-service-resource).\n", + "2. Install the required packages to run this sample." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "1756b078", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: aiohttp in c:\\src\\azure-ai-content-understanding-python\\.venv\\lib\\site-packages (from -r ../requirements.txt (line 1)) (3.13.2)\n", + "Requirement already satisfied: azure-identity in c:\\src\\azure-ai-content-understanding-python\\.venv\\lib\\site-packages (from -r ../requirements.txt (line 2)) (1.20.0)\n", + "Requirement already satisfied: azure-storage-blob in c:\\src\\azure-ai-content-understanding-python\\.venv\\lib\\site-packages (from -r ../requirements.txt (line 3)) (12.25.1)\n", + "Requirement already satisfied: python-dotenv in c:\\src\\azure-ai-content-understanding-python\\.venv\\lib\\site-packages (from -r ../requirements.txt (line 4)) (1.2.1)\n", + "Requirement already satisfied: requests in c:\\src\\azure-ai-content-understanding-python\\.venv\\lib\\site-packages (from -r ../requirements.txt (line 5)) (2.32.3)\n", + "Requirement already satisfied: Pillow in c:\\src\\azure-ai-content-understanding-python\\.venv\\lib\\site-packages (from -r ../requirements.txt (line 6)) (12.0.0)\n", + "Requirement already satisfied: aiohappyeyeballs>=2.5.0 in c:\\src\\azure-ai-content-understanding-python\\.venv\\lib\\site-packages (from aiohttp->-r ../requirements.txt (line 1)) (2.6.1)\n", + "Requirement already satisfied: aiosignal>=1.4.0 in c:\\src\\azure-ai-content-understanding-python\\.venv\\lib\\site-packages (from aiohttp->-r ../requirements.txt (line 1)) (1.4.0)\n", + "Requirement already satisfied: attrs>=17.3.0 in c:\\src\\azure-ai-content-understanding-python\\.venv\\lib\\site-packages (from aiohttp->-r ../requirements.txt (line 1)) (25.4.0)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in c:\\src\\azure-ai-content-understanding-python\\.venv\\lib\\site-packages (from aiohttp->-r ../requirements.txt (line 1)) (1.8.0)\n", + "Requirement 
already satisfied: multidict<7.0,>=4.5 in c:\\src\\azure-ai-content-understanding-python\\.venv\\lib\\site-packages (from aiohttp->-r ../requirements.txt (line 1)) (6.7.0)\n", + "Requirement already satisfied: propcache>=0.2.0 in c:\\src\\azure-ai-content-understanding-python\\.venv\\lib\\site-packages (from aiohttp->-r ../requirements.txt (line 1)) (0.4.1)\n", + "Requirement already satisfied: yarl<2.0,>=1.17.0 in c:\\src\\azure-ai-content-understanding-python\\.venv\\lib\\site-packages (from aiohttp->-r ../requirements.txt (line 1)) (1.22.0)\n", + "Requirement already satisfied: idna>=2.0 in c:\\src\\azure-ai-content-understanding-python\\.venv\\lib\\site-packages (from yarl<2.0,>=1.17.0->aiohttp->-r ../requirements.txt (line 1)) (3.11)\n", + "Requirement already satisfied: azure-core>=1.31.0 in c:\\src\\azure-ai-content-understanding-python\\.venv\\lib\\site-packages (from azure-identity->-r ../requirements.txt (line 2)) (1.37.0)\n", + "Requirement already satisfied: cryptography>=2.5 in c:\\src\\azure-ai-content-understanding-python\\.venv\\lib\\site-packages (from azure-identity->-r ../requirements.txt (line 2)) (46.0.3)\n", + "Requirement already satisfied: msal>=1.30.0 in c:\\src\\azure-ai-content-understanding-python\\.venv\\lib\\site-packages (from azure-identity->-r ../requirements.txt (line 2)) (1.34.0)\n", + "Requirement already satisfied: msal-extensions>=1.2.0 in c:\\src\\azure-ai-content-understanding-python\\.venv\\lib\\site-packages (from azure-identity->-r ../requirements.txt (line 2)) (1.3.1)\n", + "Requirement already satisfied: typing-extensions>=4.0.0 in c:\\src\\azure-ai-content-understanding-python\\.venv\\lib\\site-packages (from azure-identity->-r ../requirements.txt (line 2)) (4.15.0)\n", + "Requirement already satisfied: isodate>=0.6.1 in c:\\src\\azure-ai-content-understanding-python\\.venv\\lib\\site-packages (from azure-storage-blob->-r ../requirements.txt (line 3)) (0.7.2)\n", + "Requirement already satisfied: 
charset-normalizer<4,>=2 in c:\\src\\azure-ai-content-understanding-python\\.venv\\lib\\site-packages (from requests->-r ../requirements.txt (line 5)) (3.4.4)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in c:\\src\\azure-ai-content-understanding-python\\.venv\\lib\\site-packages (from requests->-r ../requirements.txt (line 5)) (2.6.2)\n", + "Requirement already satisfied: certifi>=2017.4.17 in c:\\src\\azure-ai-content-understanding-python\\.venv\\lib\\site-packages (from requests->-r ../requirements.txt (line 5)) (2025.11.12)\n", + "Requirement already satisfied: cffi>=2.0.0 in c:\\src\\azure-ai-content-understanding-python\\.venv\\lib\\site-packages (from cryptography>=2.5->azure-identity->-r ../requirements.txt (line 2)) (2.0.0)\n", + "Requirement already satisfied: pycparser in c:\\src\\azure-ai-content-understanding-python\\.venv\\lib\\site-packages (from cffi>=2.0.0->cryptography>=2.5->azure-identity->-r ../requirements.txt (line 2)) (2.23)\n", + "Requirement already satisfied: PyJWT<3,>=1.0.0 in c:\\src\\azure-ai-content-understanding-python\\.venv\\lib\\site-packages (from PyJWT[crypto]<3,>=1.0.0->msal>=1.30.0->azure-identity->-r ../requirements.txt (line 2)) (2.10.1)\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "%pip install -r ../requirements.txt" + ] + }, + { + "cell_type": "markdown", + "id": "6c13e310", + "metadata": {}, + "source": [ + "## Create Azure AI Content Understanding Client" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "6480fbb8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Client created successfully\n", + " Endpoint: https://mmi-usw3-eft-foundry.services.ai.azure.com/\n", + " Credential: Subscription Key\n", + " API Version: 2025-11-01\n" + ] + } + ], + "source": [ + "from datetime import datetime\n", + "import logging\n", + "import os\n", + "import sys\n", + "from typing import Any, 
Optional\n", + "from dotenv import find_dotenv, load_dotenv\n", + "\n", + "# Add the parent directory to the Python path to import the helper modules\n", + "sys.path.append(os.path.join(os.path.dirname(os.getcwd()), 'python'))\n", + "from content_understanding_client import AzureContentUnderstandingClient\n", + "from extension.sample_helper import save_json_to_file \n", + "from azure.identity import DefaultAzureCredential\n", + "\n", + "load_dotenv(find_dotenv())\n", + "logging.basicConfig(level=logging.INFO)\n", + "\n", + "# For authentication, you can use either token-based auth or subscription key\n", + "AZURE_AI_ENDPOINT = os.getenv(\"AZURE_AI_ENDPOINT\")\n", + "AZURE_AI_API_KEY = os.getenv(\"AZURE_AI_API_KEY\")\n", + "API_VERSION = \"2025-11-01\"\n", + "\n", + "# Create token provider for Azure AD authentication\n", + "def token_provider():\n", + " credential = DefaultAzureCredential()\n", + " token = credential.get_token(\"https://cognitiveservices.azure.com/.default\")\n", + " return token.token\n", + "\n", + "# Create the Content Understanding client\n", + "try:\n", + " client = AzureContentUnderstandingClient(\n", + " endpoint=AZURE_AI_ENDPOINT,\n", + " api_version=API_VERSION,\n", + " subscription_key=AZURE_AI_API_KEY,\n", + " token_provider=token_provider if not AZURE_AI_API_KEY else None,\n", + " x_ms_useragent=\"azure-ai-content-understanding-python-sample-legal-reflow\"\n", + " )\n", + " credential_type = \"Subscription Key\" if AZURE_AI_API_KEY else \"Azure AD Token\"\n", + " print(f\"✅ Client created successfully\")\n", + " print(f\" Endpoint: {AZURE_AI_ENDPOINT}\")\n", + " print(f\" Credential: {credential_type}\")\n", + " print(f\" API Version: {API_VERSION}\")\n", + "except Exception as e:\n", + " print(f\"❌ Failed to create client: {e}\")\n", + " raise" + ] + }, + { + "cell_type": "markdown", + "id": "4bb5f26a", + "metadata": {}, + "source": [ + "## Configure Model Deployments\n", + "\n", + "> **💡 Note:** This step is only required **once per 
Azure Content Understanding resource**." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "f4941027", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "📋 Configuring default model deployments...\n", + "✅ Default model deployments configured successfully\n" + ] + } + ], + "source": [ + "# Get model deployment names from environment variables\n", + "GPT_4_1_DEPLOYMENT = os.getenv(\"GPT_4_1_DEPLOYMENT\")\n", + "GPT_4_1_MINI_DEPLOYMENT = os.getenv(\"GPT_4_1_MINI_DEPLOYMENT\")\n", + "TEXT_EMBEDDING_3_LARGE_DEPLOYMENT = os.getenv(\"TEXT_EMBEDDING_3_LARGE_DEPLOYMENT\")\n", + "\n", + "# Check if required deployments are configured\n", + "missing_deployments = []\n", + "if not GPT_4_1_DEPLOYMENT:\n", + " missing_deployments.append(\"GPT_4_1_DEPLOYMENT\")\n", + "if not GPT_4_1_MINI_DEPLOYMENT:\n", + " missing_deployments.append(\"GPT_4_1_MINI_DEPLOYMENT\")\n", + "if not TEXT_EMBEDDING_3_LARGE_DEPLOYMENT:\n", + " missing_deployments.append(\"TEXT_EMBEDDING_3_LARGE_DEPLOYMENT\")\n", + "\n", + "if missing_deployments:\n", + " print(f\"⚠️ Warning: Missing model deployment configuration(s): {missing_deployments}\")\n", + " print(\" Add these to your .env file and restart the kernel.\")\n", + "else:\n", + " print(f\"📋 Configuring default model deployments...\")\n", + " try:\n", + " result = client.update_defaults({\n", + " \"gpt-4.1\": GPT_4_1_DEPLOYMENT,\n", + " \"gpt-4.1-mini\": GPT_4_1_MINI_DEPLOYMENT,\n", + " \"text-embedding-3-large\": TEXT_EMBEDDING_3_LARGE_DEPLOYMENT\n", + " })\n", + " print(f\"✅ Default model deployments configured successfully\")\n", + " except Exception as e:\n", + " print(f\"❌ Failed to configure defaults: {e}\")\n", + " raise" + ] + }, + { + "cell_type": "markdown", + "id": "b8544696", + "metadata": {}, + "source": [ + "## Analyze Legal Transcript\n", + "\n", + "We'll use a publicly available deposition transcript from the Internet Archive. 
This is a real legal document with the standard line-numbered format used in depositions.\n", + "\n", + "**Sample Document:** [Farr Deposition Transcript](https://archive.org/details/799436-farr-deposition-transcript) (15 pages, Public Domain)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "18932e9f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "🔍 Analyzing legal transcript from local file...\n", + " Document: c:\\src\\azure-ai-content-understanding-python\\data\\legal_examples\\Trascript Example.pdf\n", + " Analyzer: prebuilt-documentSearch\n", + " File size: 1,666,047 bytes\n" + ] + }, + { + "ename": "TypeError", + "evalue": "AzureContentUnderstandingClient.begin_analyze_binary() got an unexpected keyword argument 'binary_data'", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mTypeError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[6]\u001b[39m\u001b[32m, line 17\u001b[39m\n\u001b[32m 14\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33m File size: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(document_bytes)\u001b[38;5;132;01m:\u001b[39;00m\u001b[33m,\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m bytes\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 16\u001b[39m \u001b[38;5;66;03m# Analyze the document using binary content\u001b[39;00m\n\u001b[32m---> \u001b[39m\u001b[32m17\u001b[39m response = \u001b[43mclient\u001b[49m\u001b[43m.\u001b[49m\u001b[43mbegin_analyze_binary\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 18\u001b[39m \u001b[43m \u001b[49m\u001b[43manalyzer_id\u001b[49m\u001b[43m=\u001b[49m\u001b[43manalyzer_id\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 19\u001b[39m \u001b[43m 
\u001b[49m\u001b[43mbinary_data\u001b[49m\u001b[43m=\u001b[49m\u001b[43mdocument_bytes\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 20\u001b[39m \u001b[43m \u001b[49m\u001b[43mcontent_type\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mapplication/pdf\u001b[39;49m\u001b[33;43m'\u001b[39;49m\n\u001b[32m 21\u001b[39m \u001b[43m)\u001b[49m\n\u001b[32m 23\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33m⏳ Waiting for analysis to complete...\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 24\u001b[39m result = client.poll_result(response)\n", + "\u001b[31mTypeError\u001b[39m: AzureContentUnderstandingClient.begin_analyze_binary() got an unexpected keyword argument 'binary_data'" + ] + } + ], + "source": [ + "# Analyze legal transcript from local file\n", + "# Using the transcript example from the data/legal_examples folder\n", + "document_path = os.path.join(os.path.dirname(os.getcwd()), 'data', 'legal_examples', 'Trascript Example.pdf')\n", + "analyzer_id = 'prebuilt-documentSearch'\n", + "\n", + "print(f\"🔍 Analyzing legal transcript from local file...\")\n", + "print(f\" Document: {document_path}\")\n", + "print(f\" Analyzer: {analyzer_id}\")\n", + "\n", + "# Verify file exists\n", + "if not os.path.exists(document_path):\n", + " raise FileNotFoundError(f\"Document not found: {document_path}\")\n", + "\n", + "file_size = os.path.getsize(document_path)\n", + "print(f\" File size: {file_size:,} bytes\")\n", + "\n", + "# Analyze the document using binary file path\n", + "response = client.begin_analyze_binary(\n", + " analyzer_id=analyzer_id,\n", + " file_location=document_path\n", + ")\n", + "\n", + "print(f\"⏳ Waiting for analysis to complete...\")\n", + "result = client.poll_result(response)\n", + "print(f\"✅ Analysis completed!\")\n", + "\n", + "# Get document info\n", + "contents = result.get(\"result\", {}).get(\"contents\", [])\n", + "if contents:\n", + " content = contents[0]\n", + " if 
content.get(\"kind\") == \"document\":\n", + " print(f\"\\n📄 Document Information:\")\n", + " print(f\" Pages: {content.get('startPageNumber')} - {content.get('endPageNumber')}\")\n", + " print(f\" Total pages: {content.get('endPageNumber') - content.get('startPageNumber') + 1}\")\n", + "\n", + "# Save the full result for processing\n", + "saved_json_path = save_json_to_file(result, filename_prefix=\"legal_transcript_analysis\")\n", + "print(f\"\\n💾 Full analysis saved to: {saved_json_path}\")" + ] + }, + { + "cell_type": "markdown", + "id": "c45dcbdb", + "metadata": {}, + "source": [ + "## View Default Markdown Output\n", + "\n", + "Let's first look at Content Understanding's default markdown output. Notice how the **line numbers are grouped separately** at the bottom of each page's content rather than inline with the text." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b5b60915", + "metadata": {}, + "outputs": [], + "source": [ + "# Show the default markdown output (first 2000 characters)\n", + "markdown = content.get(\"markdown\", \"\")\n", + "\n", + "print(\"📄 Default Markdown Output (first 2000 chars):\")\n", + "print(\"=\" * 60)\n", + "print(markdown[:2000])\n", + "print(\"=\" * 60)\n", + "print(f\"\\n... 
(Total length: {len(markdown)} characters)\")" + ] + }, + { + "cell_type": "markdown", + "id": "eb101887", + "metadata": {}, + "source": [ + "## How Reflow Works\n", + "\n", + "The reflow algorithm uses **bounding box coordinates** from the JSON output to match line numbers with their corresponding text:\n", + "\n", + "### Step 1: Parse Coordinates\n", + "Every element in CU's JSON has a `source` field with position data:\n", + "```\n", + "\"source\": \"D(1,1.0309,1.1277,1.131,1.1277,1.131,1.2711,1.0309,1.2711)\"\n", + " D(page, x1,y1, x2,y2, x3,y3, x4,y4)\n", + "```\n", + "\n", + "### Step 2: Group by Vertical Position\n", + "Elements with similar Y values (within ~0.15 inches) are on the same horizontal line.\n", + "\n", + "### Step 3: Sort Left-to-Right\n", + "Within each group, sort by X coordinate. Line numbers (X ≈ 1.0\") come before text content (X ≈ 1.3\"+).\n", + "\n", + "### Step 4: Combine\n", + "Pair line numbers with their corresponding text and output as `N | text content`." 
import re
from dataclasses import dataclass
from typing import Optional


@dataclass
class LineElement:
    """A positioned text element (a line of text or a margin line number) on a page."""
    content: str                   # stripped text content of the element
    y_position: float              # top Y coordinate (inches, per the CU source field)
    x_position: float              # left X coordinate (inches)
    page_number: int               # 1-based page number the element belongs to
    is_line_number: bool = False   # True if content looks like a margin line number


def parse_source_coordinates(source: str) -> tuple[int, float, float, float, float]:
    """
    Parse the ``source`` coordinate string from Content Understanding.

    Supported formats:
      * ``D(pageNumber,x1,y1,x2,y2,x3,y3,x4,y4)`` - bounding quadrilateral
        (upper-left, upper-right, lower-right, lower-left corners)
      * ``D(pageNumber,left,top,width,height)`` - axis-aligned bounding box

    Returns:
        Tuple of (page_number, left_x, top_y, right_x, bottom_y).

    Raises:
        ValueError: if the string matches neither format.
    """
    match = re.match(r'D\((\d+),([^)]+)\)', source)
    if not match:
        raise ValueError(f"Invalid source format: {source}")

    page_number = int(match.group(1))
    coords = [float(x) for x in match.group(2).split(',')]

    if len(coords) == 8:
        # Bounding polygon: x1,y1,x2,y2,x3,y3,x4,y4
        x1, y1, x2, y2, x3, y3, x4, y4 = coords
        left_x = min(x1, x4)
        top_y = min(y1, y2)
        # Fix: these were previously hardcoded to 0 despite the documented contract.
        right_x = max(x2, x3)
        bottom_y = max(y3, y4)
    elif len(coords) == 4:
        # Axis-aligned bounding box: left, top, width, height
        left_x, top_y, width, height = coords
        right_x = left_x + width
        bottom_y = top_y + height
    else:
        raise ValueError(f"Unexpected coordinate count: {source}")

    return page_number, left_x, top_y, right_x, bottom_y


def is_line_number(content: str) -> bool:
    """Return True if ``content`` is a transcript margin line number (1-99)."""
    stripped = content.strip()
    return stripped.isdigit() and 1 <= int(stripped) <= 99


def is_noise_element(content: str) -> bool:
    """Return True for noise (bullets, stray punctuation) that should be filtered out."""
    content = content.strip()
    # The explicit bullet list is kept for readability; the general single-char
    # non-alphanumeric rule below also covers those characters.
    return content in ['·', '•', '∙'] or (len(content) == 1 and not content.isalnum())


def extract_lines_from_page(page_data: dict) -> list[LineElement]:
    """
    Extract all usable lines from one CU page dict as positioned elements.

    Lines without a ``source`` field, with empty content, or classified as
    noise are skipped; lines whose coordinates fail to parse are skipped too
    (best-effort extraction rather than failing the whole page).
    """
    elements = []
    page_number = page_data.get('pageNumber', 1)

    for line in page_data.get('lines', []):
        content = line.get('content', '').strip()
        source = line.get('source', '')

        if not source or not content or is_noise_element(content):
            continue

        try:
            parsed_page, left_x, top_y, _, _ = parse_source_coordinates(source)
            element = LineElement(
                content=content,
                y_position=top_y,
                x_position=left_x,
                page_number=parsed_page,
                is_line_number=is_line_number(content)
            )
            elements.append(element)
        except ValueError:
            # Unparseable coordinates: drop this line, keep the rest of the page.
            continue

    return elements


def group_lines_by_vertical_position(elements: list[LineElement],
                                     y_tolerance: float = 0.15) -> list[list[LineElement]]:
    """
    Group elements that sit on the same horizontal line of the page.

    Elements are sorted by top Y; each group is anchored at its first
    element's Y, and subsequent elements join the group while they fall
    within ``y_tolerance`` (inches) of that anchor.
    """
    if not elements:
        return []

    sorted_elements = sorted(elements, key=lambda e: e.y_position)
    groups = []
    current_group = [sorted_elements[0]]
    current_y = sorted_elements[0].y_position

    for element in sorted_elements[1:]:
        if abs(element.y_position - current_y) <= y_tolerance:
            current_group.append(element)
        else:
            groups.append(current_group)
            current_group = [element]
            current_y = element.y_position

    if current_group:
        groups.append(current_group)

    return groups


def reflow_page_with_line_numbers(page_data: dict, separator: str = " | ") -> str:
    """
    Reflow a single page's content so margin line numbers appear inline.

    Each horizontal group is sorted left-to-right; the first line-number
    element (if any) becomes the ``N{separator}text`` prefix and the
    remaining elements are joined as the line's text. Groups containing
    only line numbers are dropped.
    """
    elements = extract_lines_from_page(page_data)
    if not elements:
        return ""

    line_groups = group_lines_by_vertical_position(elements)
    output_lines = []

    for group in line_groups:
        # Sort by X position (left to right); line numbers sit in the left margin.
        group.sort(key=lambda e: e.x_position)

        line_numbers = [e for e in group if e.is_line_number]
        content_elements = [e for e in group if not e.is_line_number]

        if not content_elements:
            continue

        combined_content = ' '.join(e.content for e in content_elements)

        if line_numbers:
            line_num = line_numbers[0].content
            output_lines.append(f"{line_num}{separator}{combined_content}")
        else:
            output_lines.append(combined_content)

    return '\n'.join(output_lines)


def reflow_document(json_data: dict, target_page: Optional[int] = None,
                    separator: str = " | ") -> str:
    """
    Reflow an entire CU analysis result (or one page of it) with inline line numbers.

    Args:
        json_data: full analysis JSON (expects result.contents[0].pages).
        target_page: if given, only that page number is emitted.
        separator: string placed between the line number and its text.

    Raises:
        ValueError: if the JSON contains no contents or no pages.
    """
    contents = json_data.get('result', {}).get('contents', [])
    if not contents:
        raise ValueError("No contents found in JSON data")

    content = contents[0]
    pages = content.get('pages', [])
    if not pages:
        raise ValueError("No pages found in document content")

    output_parts = []

    for page in pages:
        page_number = page.get('pageNumber', 0)
        if target_page is not None and page_number != target_page:
            continue

        page_output = reflow_page_with_line_numbers(page, separator)
        if page_output:
            if target_page is None:
                # Blank separator between pages in full-document mode.
                # Fix: was a pointless f-string (f"\n\n") with no placeholder.
                output_parts.append("\n\n")
            output_parts.append(page_output)

    return '\n'.join(output_parts)

print("✅ Reflow functions loaded successfully!")
Preview (first 3000 characters):\")\n", + "print(\"=\" * 60)\n", + "print(reflowed_document[:3000])\n", + "print(\"=\" * 60)" + ] + }, + { + "cell_type": "markdown", + "id": "28ae13e7", + "metadata": {}, + "source": [ + "## Compare: Before vs After\n", + "\n", + "Let's compare the default output with the reflowed output for a specific page:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2d5dd7e7", + "metadata": {}, + "outputs": [], + "source": [ + "# Compare default vs reflowed for page 3\n", + "page_number = 3\n", + "\n", + "# Get the original markdown (extract just page 3 content - approximation)\n", + "original_lines = markdown.split('\\n')\n", + "\n", + "print(\"📊 COMPARISON: Default vs Reflowed Output\")\n", + "print(\"\\n\" + \"=\" * 30 + \" DEFAULT OUTPUT \" + \"=\" * 30)\n", + "print(\"(Line numbers grouped separately at page bottom)\")\n", + "print(\"-\" * 76)\n", + "\n", + "# Show a sample of the default output\n", + "sample_start = 200\n", + "sample_end = 800\n", + "print(markdown[sample_start:sample_end])\n", + "print(\"...\")\n", + "\n", + "print(\"\\n\" + \"=\" * 30 + \" REFLOWED OUTPUT \" + \"=\" * 29)\n", + "print(\"(Line numbers inline with text)\")\n", + "print(\"-\" * 76)\n", + "\n", + "# Show the reflowed output for page 3\n", + "reflowed_page = reflow_document(result, target_page=page_number)\n", + "print(reflowed_page[:800])\n", + "print(\"...\")" + ] + }, + { + "cell_type": "markdown", + "id": "a0fc1e27", + "metadata": {}, + "source": [ + "## Using the Standalone Script\n", + "\n", + "For batch processing or command-line usage, you can use the standalone script located at `python/reflow_markdown_with_line_numbers.py`:\n", + "\n", + "```bash\n", + "# Process a specific page\n", + "python python/reflow_markdown_with_line_numbers.py analysis.json --page 3\n", + "\n", + "# Process all pages and save to file\n", + "python python/reflow_markdown_with_line_numbers.py analysis.json --output reflowed.md\n", + "\n", + "# 
# Example: Run the standalone script on our saved JSON
import subprocess
import sys

script_path = os.path.join(os.path.dirname(os.getcwd()), 'python', 'reflow_markdown_with_line_numbers.py')
output_file = os.path.join(os.getcwd(), 'test_output', 'legal_transcript_reflowed_script.md')

print(f"🔧 Running standalone reflow script...")
print(f"   Input: {saved_json_path}")
print(f"   Output: {output_file}")

# Fix: use sys.executable instead of the bare 'python' so the script runs under
# the same interpreter as the notebook kernel. A bare 'python' resolves via PATH
# and may pick a different environment (or nothing at all, e.g. on Windows).
# Renamed result_code -> proc: subprocess.run returns a CompletedProcess, not a code.
proc = subprocess.run(
    [sys.executable, script_path, saved_json_path, '--output', output_file],
    capture_output=True,
    text=True,
)

if proc.returncode == 0:
    print("✅ Script completed successfully!")
    print(proc.stdout)
else:
    print("❌ Script failed:")
    print(proc.stderr)
**Use the standalone script** for batch processing\n", + "\n", + "### Use Cases\n", + "\n", + "This technique is valuable for:\n", + "- **Legal document processing** - Depositions, trial transcripts, court records\n", + "- **Academic citations** - Line-numbered source materials\n", + "- **Content indexing** - Building searchable databases with line-level citations\n", + "- **AI-powered legal research** - RAG applications that need line-accurate references\n", + "\n", + "### Learn More\n", + "\n", + "- [Content Understanding Document Elements](https://learn.microsoft.com/en-us/azure/ai-services/content-understanding/document/elements)\n", + "- [Document Markdown Representation](https://learn.microsoft.com/en-us/azure/ai-services/content-understanding/document/markdown)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From b46417fad7bafdfba8e3f54524d0532061e609d4 Mon Sep 17 00:00:00 2001 From: Joe Filcik Date: Thu, 22 Jan 2026 11:29:48 -0500 Subject: [PATCH 8/8] Fixing up output and switching to using prebuilt-layout --- notebooks/legal_transcript_reflow.ipynb | 438 +++++++++++++++++++++--- 1 file changed, 381 insertions(+), 57 deletions(-) diff --git a/notebooks/legal_transcript_reflow.ipynb b/notebooks/legal_transcript_reflow.ipynb index 3eedb9c..685dab2 100644 --- a/notebooks/legal_transcript_reflow.ipynb +++ b/notebooks/legal_transcript_reflow.ipynb @@ -37,44 +37,10 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "1756b078", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: 
aiohttp in c:\\src\\azure-ai-content-understanding-python\\.venv\\lib\\site-packages (from -r ../requirements.txt (line 1)) (3.13.2)\n", - "Requirement already satisfied: azure-identity in c:\\src\\azure-ai-content-understanding-python\\.venv\\lib\\site-packages (from -r ../requirements.txt (line 2)) (1.20.0)\n", - "Requirement already satisfied: azure-storage-blob in c:\\src\\azure-ai-content-understanding-python\\.venv\\lib\\site-packages (from -r ../requirements.txt (line 3)) (12.25.1)\n", - "Requirement already satisfied: python-dotenv in c:\\src\\azure-ai-content-understanding-python\\.venv\\lib\\site-packages (from -r ../requirements.txt (line 4)) (1.2.1)\n", - "Requirement already satisfied: requests in c:\\src\\azure-ai-content-understanding-python\\.venv\\lib\\site-packages (from -r ../requirements.txt (line 5)) (2.32.3)\n", - "Requirement already satisfied: Pillow in c:\\src\\azure-ai-content-understanding-python\\.venv\\lib\\site-packages (from -r ../requirements.txt (line 6)) (12.0.0)\n", - "Requirement already satisfied: aiohappyeyeballs>=2.5.0 in c:\\src\\azure-ai-content-understanding-python\\.venv\\lib\\site-packages (from aiohttp->-r ../requirements.txt (line 1)) (2.6.1)\n", - "Requirement already satisfied: aiosignal>=1.4.0 in c:\\src\\azure-ai-content-understanding-python\\.venv\\lib\\site-packages (from aiohttp->-r ../requirements.txt (line 1)) (1.4.0)\n", - "Requirement already satisfied: attrs>=17.3.0 in c:\\src\\azure-ai-content-understanding-python\\.venv\\lib\\site-packages (from aiohttp->-r ../requirements.txt (line 1)) (25.4.0)\n", - "Requirement already satisfied: frozenlist>=1.1.1 in c:\\src\\azure-ai-content-understanding-python\\.venv\\lib\\site-packages (from aiohttp->-r ../requirements.txt (line 1)) (1.8.0)\n", - "Requirement already satisfied: multidict<7.0,>=4.5 in c:\\src\\azure-ai-content-understanding-python\\.venv\\lib\\site-packages (from aiohttp->-r ../requirements.txt (line 1)) (6.7.0)\n", - "Requirement already satisfied: 
propcache>=0.2.0 in c:\\src\\azure-ai-content-understanding-python\\.venv\\lib\\site-packages (from aiohttp->-r ../requirements.txt (line 1)) (0.4.1)\n", - "Requirement already satisfied: yarl<2.0,>=1.17.0 in c:\\src\\azure-ai-content-understanding-python\\.venv\\lib\\site-packages (from aiohttp->-r ../requirements.txt (line 1)) (1.22.0)\n", - "Requirement already satisfied: idna>=2.0 in c:\\src\\azure-ai-content-understanding-python\\.venv\\lib\\site-packages (from yarl<2.0,>=1.17.0->aiohttp->-r ../requirements.txt (line 1)) (3.11)\n", - "Requirement already satisfied: azure-core>=1.31.0 in c:\\src\\azure-ai-content-understanding-python\\.venv\\lib\\site-packages (from azure-identity->-r ../requirements.txt (line 2)) (1.37.0)\n", - "Requirement already satisfied: cryptography>=2.5 in c:\\src\\azure-ai-content-understanding-python\\.venv\\lib\\site-packages (from azure-identity->-r ../requirements.txt (line 2)) (46.0.3)\n", - "Requirement already satisfied: msal>=1.30.0 in c:\\src\\azure-ai-content-understanding-python\\.venv\\lib\\site-packages (from azure-identity->-r ../requirements.txt (line 2)) (1.34.0)\n", - "Requirement already satisfied: msal-extensions>=1.2.0 in c:\\src\\azure-ai-content-understanding-python\\.venv\\lib\\site-packages (from azure-identity->-r ../requirements.txt (line 2)) (1.3.1)\n", - "Requirement already satisfied: typing-extensions>=4.0.0 in c:\\src\\azure-ai-content-understanding-python\\.venv\\lib\\site-packages (from azure-identity->-r ../requirements.txt (line 2)) (4.15.0)\n", - "Requirement already satisfied: isodate>=0.6.1 in c:\\src\\azure-ai-content-understanding-python\\.venv\\lib\\site-packages (from azure-storage-blob->-r ../requirements.txt (line 3)) (0.7.2)\n", - "Requirement already satisfied: charset-normalizer<4,>=2 in c:\\src\\azure-ai-content-understanding-python\\.venv\\lib\\site-packages (from requests->-r ../requirements.txt (line 5)) (3.4.4)\n", - "Requirement already satisfied: urllib3<3,>=1.21.1 in 
c:\\src\\azure-ai-content-understanding-python\\.venv\\lib\\site-packages (from requests->-r ../requirements.txt (line 5)) (2.6.2)\n", - "Requirement already satisfied: certifi>=2017.4.17 in c:\\src\\azure-ai-content-understanding-python\\.venv\\lib\\site-packages (from requests->-r ../requirements.txt (line 5)) (2025.11.12)\n", - "Requirement already satisfied: cffi>=2.0.0 in c:\\src\\azure-ai-content-understanding-python\\.venv\\lib\\site-packages (from cryptography>=2.5->azure-identity->-r ../requirements.txt (line 2)) (2.0.0)\n", - "Requirement already satisfied: pycparser in c:\\src\\azure-ai-content-understanding-python\\.venv\\lib\\site-packages (from cffi>=2.0.0->cryptography>=2.5->azure-identity->-r ../requirements.txt (line 2)) (2.23)\n", - "Requirement already satisfied: PyJWT<3,>=1.0.0 in c:\\src\\azure-ai-content-understanding-python\\.venv\\lib\\site-packages (from PyJWT[crypto]<3,>=1.0.0->msal>=1.30.0->azure-identity->-r ../requirements.txt (line 2)) (2.10.1)\n", - "Note: you may need to restart the kernel to use updated packages.\n" - ] - } - ], + "outputs": [], "source": [ "%pip install -r ../requirements.txt" ] @@ -163,7 +129,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "id": "f4941027", "metadata": {}, "outputs": [ @@ -222,7 +188,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "18932e9f", "metadata": {}, "outputs": [ @@ -232,19 +198,46 @@ "text": [ "🔍 Analyzing legal transcript from local file...\n", " Document: c:\\src\\azure-ai-content-understanding-python\\data\\legal_examples\\Trascript Example.pdf\n", - " Analyzer: prebuilt-documentSearch\n", + " Analyzer: prebuilt-layout\n", " File size: 1,666,047 bytes\n" ] }, { - "ename": "TypeError", - "evalue": "AzureContentUnderstandingClient.begin_analyze_binary() got an unexpected keyword argument 'binary_data'", - "output_type": "error", - "traceback": [ - 
"\u001b[31m---------------------------------------------------------------------------\u001b[39m", - "\u001b[31mTypeError\u001b[39m Traceback (most recent call last)", - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[6]\u001b[39m\u001b[32m, line 17\u001b[39m\n\u001b[32m 14\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33m File size: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(document_bytes)\u001b[38;5;132;01m:\u001b[39;00m\u001b[33m,\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m bytes\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 16\u001b[39m \u001b[38;5;66;03m# Analyze the document using binary content\u001b[39;00m\n\u001b[32m---> \u001b[39m\u001b[32m17\u001b[39m response = \u001b[43mclient\u001b[49m\u001b[43m.\u001b[49m\u001b[43mbegin_analyze_binary\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 18\u001b[39m \u001b[43m \u001b[49m\u001b[43manalyzer_id\u001b[49m\u001b[43m=\u001b[49m\u001b[43manalyzer_id\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 19\u001b[39m \u001b[43m \u001b[49m\u001b[43mbinary_data\u001b[49m\u001b[43m=\u001b[49m\u001b[43mdocument_bytes\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 20\u001b[39m \u001b[43m \u001b[49m\u001b[43mcontent_type\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mapplication/pdf\u001b[39;49m\u001b[33;43m'\u001b[39;49m\n\u001b[32m 21\u001b[39m \u001b[43m)\u001b[49m\n\u001b[32m 23\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33m⏳ Waiting for analysis to complete...\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 24\u001b[39m result = client.poll_result(response)\n", - "\u001b[31mTypeError\u001b[39m: AzureContentUnderstandingClient.begin_analyze_binary() got an unexpected keyword argument 'binary_data'" + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:content_understanding_client:Analyzing binary file 
c:\\src\\azure-ai-content-understanding-python\\data\\legal_examples\\Trascript Example.pdf with analyzer: prebuilt-layout\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⏳ Waiting for analysis to complete...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:content_understanding_client:Request 9e72763b-ecdf-4b8d-8f9b-a6733d30b6b1 in progress ...\n", + "INFO:content_understanding_client:Request 9e72763b-ecdf-4b8d-8f9b-a6733d30b6b1 in progress ...\n", + "INFO:content_understanding_client:Request 9e72763b-ecdf-4b8d-8f9b-a6733d30b6b1 in progress ...\n", + "INFO:content_understanding_client:Request result is ready after 7.11 seconds.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Analysis completed!\n", + "\n", + "📄 Document Information:\n", + " Pages: 1 - 52\n", + " Total pages: 52\n", + "💾 Analysis result saved to: test_output\\legal_transcript_analysis_20260122_162606.json\n", + "\n", + "💾 Full analysis saved to: test_output\\legal_transcript_analysis_20260122_162606.json\n" ] } ], @@ -252,7 +245,7 @@ "# Analyze legal transcript from local file\n", "# Using the transcript example from the data/legal_examples folder\n", "document_path = os.path.join(os.path.dirname(os.getcwd()), 'data', 'legal_examples', 'Trascript Example.pdf')\n", - "analyzer_id = 'prebuilt-documentSearch'\n", + "analyzer_id = 'prebuilt-layout'\n", "\n", "print(f\"🔍 Analyzing legal transcript from local file...\")\n", "print(f\" Document: {document_path}\")\n", @@ -301,10 +294,204 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "b5b60915", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "📄 Default Markdown Output (first 2000 chars):\n", + "============================================================\n", + "# (B&W) PROTECTED BY MINNESOTA TOBACCO LITIGATION PROTECTIVE ORDER\n", + "\n", + "SUPERIOR COURT 
OF NEW JERSEY\n", + "MERCER COUNTY-LAW DIVISION,\n", + "DOCKET NO. L-90-2940\n", + "\n", + ":\n", + "\n", + ":\n", + "\n", + "IN RE:\n", + "IN THE MATTER OF\n", + "SUSAN MICHAUD\n", + "\n", + ":\n", + "\n", + "DEPOSITION OF:\n", + "\n", + ":\n", + "\n", + "Susan Michaud\n", + "\n", + ":\n", + "\n", + ":\n", + "\n", + "Transcript of proceedings taken on July 13, 1990,\n", + "at 1 pm, at the office of Mason, Griffin & Pierson, 101 Poor\n", + "Farm Road, Princeton, NJ 08540.\n", + "\n", + "682499390\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "# (B&W) PROTECTED BY MINNESOTA TOBACCO LITIGATION PROTECTIVE ORDER\n", + "\n", + "1\n", + "2\n", + "3\n", + "4\n", + "5\n", + "6\n", + "7\n", + "8\n", + "9\n", + "10\n", + "11\n", + "12\n", + "13\n", + "14\n", + "15\n", + "16\n", + "17\n", + "18\n", + "19\n", + "20\n", + "21\n", + "22\n", + "23\n", + "24\n", + "25\n", + "\n", + "2\n", + "\n", + "APPEARANCES\n", + "\n", + "On behalf of\n", + "Susan Michaud:\n", + "\n", + "MASON, GRIFFIN & PIERSON\n", + "BY: Stephanie J. Briody, Esq.\n", + "101 Poor Farm Road\n", + "Princton, NJ 08540\n", + "\n", + "On behalf of Dr. Alfred\n", + "Cook, Dr. Charles Howard &\n", + "Princeton Radiology Assoc.\n", + "\n", + "JACKSON & VAURIO\n", + "BY: John Zen Jackson, Esq.\n", + "1000 Herrontown Road\n", + "Princeton, NJ 08540\n", + "\n", + "682499391\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "## (B&W) PROTECTED BY MINNESOTA TOBACCO LITIGATION PROTECTIVE ORDER\n", + "\n", + "3\n", + "\n", + "1\n", + "2\n", + "3\n", + "4\n", + "5\n", + "6\n", + "7\n", + "8\n", + "9\n", + "10\n", + "11\n", + "12\n", + "13\n", + "14\n", + "15\n", + "16\n", + "17\n", + "18\n", + "19\n", + "20\n", + "21\n", + "22\n", + "23\n", + "24\n", + "25\n", + "\n", + "INDEX\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
WITNESS:DIRECTCROSSREDIRECTRECROSS
Susan Michaud4404444
\n", + "\n", + "\n", + "EXHIBITS:\n", + "Diagram (P-1)\n", + "\n", + "EVIDENCE\n", + "\n", + "IDENTIFICATION\n", + "\n", + "23\n", + "\n", + "682499392\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "### (B&W) PROTECTED BY MINNESOTA TOBACCO LITIGATION PROTECTIVE ORDER\n", + "\n", + "4\n", + "\n", + "Susan Michaud, M-I-C-H-A-U-D, sworn by the Notary Public,\n", + "testified as follows.\n", + "\n", + "DIRECT EXAMINATION BY\n", + "\n", + "MS. BRIODY:\n", + "\n", + "Q.\n", + "Susan, how old are you at the present time?\n", + "\n", + "A.\n", + "Just turned thirty-eight.\n", + "\n", + "Q.\n", + "And are you married?\n", + "\n", + "A. Yes, I am.\n", + "\n", + "Q.\n", + "And for how many years have you been married?\n", + "\n", + "============================================================\n", + "\n", + "... (Total length: 66709 characters)\n" + ] + } + ], "source": [ "# Show the default markdown output (first 2000 characters)\n", "markdown = content.get(\"markdown\", \"\")\n", @@ -354,10 +541,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "4a421c01", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Reflow functions loaded successfully!\n" + ] + } + ], "source": [ "import re\n", "from dataclasses import dataclass\n", @@ -541,10 +736,28 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "7ed1b4ab", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "📄 Reflowed Output for Page 3:\n", + "============================================================\n", + "(B&W) PROTECTED BY MINNESOTA TOBACCO LITIGATION PROTECTIVE ORDER\n", + "1 | INDEX\n", + "2 | WITNESS: DIRECT CROSS REDIRECT RECROSS\n", + "3 | Susan Michaud\n", + "6 | EXHIBITS: EVIDENCE IDENTIFICATION\n", + "7 | Diagram (P-1)\n", + "682499392\n", + 
"http://legacy.library.ucsf.e6u/tid/fuq07a00/pdf.industrydocuments.ucsf.edu/docs/khhl0001\n", + "============================================================\n" + ] + } + ], "source": [ "# Reflow a single page (page 3)\n", "page_to_reflow = 3\n", @@ -568,10 +781,121 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "id": "cc9a0906", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "📄 Reflowing entire document...\n", + "✅ Reflowed document saved to: c:\\src\\azure-ai-content-understanding-python\\notebooks\\test_output\\legal_transcript_reflowed.md\n", + " Total characters: 65678\n", + "\n", + "📄 Preview (first 3000 characters):\n", + "============================================================\n", + "\n", + "\n", + "\n", + "(B&W) PROTECTED BY MINNESOTA TOBACCO LITIGATION PROTECTIVE ORDER\n", + "SUPERIOR COURT OF NEW JERSEY\n", + "MERCER COUNTY-LAW DIVISION,\n", + "DOCKET NO. L-90-2940\n", + "IN RE: IN THE MATTER OF\n", + "SUSAN MICHAUD\n", + "DEPOSITION OF:\n", + "Susan Michaud\n", + "Transcript of proceedings taken on July 13, 1990,\n", + "at 1 pm, at the office of Mason, Griffin & Pierson, 101 Poor\n", + "Farm Road, Princeton, NJ 08540.\n", + "682499390\n", + "http://legacy.library.ucsf.e6u/tid/fuq07a00/pdfv.industrydocuments.ucsf.edu/docs/khhl0001\n", + "\n", + "\n", + "\n", + "(B&W) PROTECTED BY MINNESOTA TOBACCO LITIGATION PROTECTIVE ORDER\n", + "2 | APPEARANCES\n", + "3 | On behalf of\n", + "Susan Michaud: MASON, GRIFFIN & PIERSON\n", + "4 | BY: Stephanie J. Briody, Esq.\n", + "101 Poor Farm Road\n", + "5 | Princton, NJ 08540\n", + "6 | On behalf of Dr. Alfred\n", + "Cook, Dr. Charles Howard &\n", + "7 | Princeton Radiology Assoc. 
JACKSON & VAURIO\n", + "BY: John Zen Jackson, Esq.\n", + "8 | 1000 Herrontown Road\n", + "Princeton, NJ 08540\n", + "682499391\n", + "http://legacy.library.ucsf.e6u/tid/fuq07a00/pdfv.industrydocumėnts.ucsf.edu/docs/khhl0001\n", + "\n", + "\n", + "\n", + "(B&W) PROTECTED BY MINNESOTA TOBACCO LITIGATION PROTECTIVE ORDER\n", + "1 | INDEX\n", + "2 | WITNESS: DIRECT CROSS REDIRECT RECROSS\n", + "3 | Susan Michaud\n", + "6 | EXHIBITS: EVIDENCE IDENTIFICATION\n", + "7 | Diagram (P-1)\n", + "682499392\n", + "http://legacy.library.ucsf.e6u/tid/fuq07a00/pdf.industrydocuments.ucsf.edu/docs/khhl0001\n", + "\n", + "\n", + "\n", + "(B&W) PROTECTED BY MINNESOTA TOBACCO LITIGATION PROTECTIVE ORDER\n", + "1 | Susan Michaud, M-I-C-H-A-U-D, sworn by the Notary Public,\n", + "2 | testified as follows.\n", + "3 | DIRECT EXAMINATION BY\n", + "4 | MS. BRIODY:\n", + "5 | Q. Susan, how old are you at the present time?\n", + "6 | A. Just turned thirty-eight.\n", + "7 | Q. And are you married?\n", + "8 | A. Yes, I am.\n", + "9 | Q. And for how many years have you been married?\n", + "10 | A. Nineteen.\n", + "11 | Q. What year were you married?\n", + "12 | A. '71.\n", + "13 | Q. And to whom are you married?\n", + "14 | A. Thomas Michaud.\n", + "15 | Q. Do you have any children?\n", + "16 | A. Yes, I have one.\n", + "17 | Q. Is it a boy or a girl?\n", + "18 | A. A fourteen year old boy, almost fifteen.\n", + "19 | Q. What's his name?\n", + "20 | A. Matthew.\n", + "21 | Q. Did you go to high school in Princeton?\n", + "22 | A. Yes.\n", + "23 | Q. And what is your educational background?\n", + "24 | A. I have about thirty college credits beyond high school\n", + "25 | and that's all.\n", + "682499393\n", + "http://legacy.library.ucsf.edu/tid/fuq07a00/pdfv.industrydocuments.ucsf.edu/docs/khhl0001\n", + "\n", + "\n", + "\n", + "(B&W) PROTECTED BY MINNESOTA TOBACCO LITIGATION PROTECTIVE ORDER\n", + "1 | Q. Where did you get those credits?\n", + "2 | A. 
At Mercer County Community College.\n", + "3 | Q. Did you grow up in the Princeton area?\n", + "4 | A. Yes.\n", + "5 | Q. Where did you go to middle school or junior high\n", + "6 | school?\n", + "7 | A. Princeton community--Community Park.\n", + "8 | Q. It's called Community Park?\n", + "9 | A. It's called Community Park.\n", + "10 | Q. In Princeton?\n", + "11 | A. Yes.\n", + "12 | Q. For whom do you work?\n", + "13 | A. Nassau Federal Savings and Loan.\n", + "14 | Q. And what kind of work do you do for them?\n", + "15 | A. I am the director of their Human Resource Department.\n", + "16 | Q. And for how long have you worked for the bank?\n", + "17 | A. In September it will be thr\n", + "============================================================\n" + ] + } + ], "source": [ "# Reflow the entire document\n", "print(\"📄 Reflowing entire document...\")\n", @@ -699,7 +1023,7 @@ "\n", "This notebook demonstrated how to:\n", "\n", - "1. **Extract content** from legal transcripts using Azure Content Understanding's `prebuilt-documentSearch` analyzer\n", + "1. **Extract content** from legal transcripts using Azure Content Understanding's `prebuilt-layout` analyzer\n", "2. **Understand the JSON structure** including the `source` field with bounding box coordinates\n", "3. **Reflow the output** to include line numbers inline with text by:\n", " - Parsing bounding box coordinates to determine element positions\n",