From 44990412c6ab8429fd5911ba136d2aedf6803349 Mon Sep 17 00:00:00 2001 From: Suman Sahoo Date: Sun, 11 May 2025 22:06:26 +0530 Subject: [PATCH 1/2] Enhanced OCR processing with new API endpoints for image upload and order retrieval. Implemented AI-powered comparison feature for expected vs. actual values, including detailed review statuses and confidence scoring. Updated database logging to include comparison results. Added new comparison prompt for semantic analysis of data. --- vision/config/api_keys.py | 6 +-- vision/routes.py | 16 ++++-- vision/tasks/process_ocr_task.py | 27 ++++++++-- vision/utils/db_operations.py | 20 +++++-- vision/utils/llm_invoke.py | 18 +++++++ vision/utils/prompt/comparison_prompt.txt | 63 +++++++++++++++++++++++ vision/utils/prompt/load_prompt.py | 13 ++++- 7 files changed, 146 insertions(+), 17 deletions(-) create mode 100644 vision/utils/prompt/comparison_prompt.txt diff --git a/vision/config/api_keys.py b/vision/config/api_keys.py index c592fd5..c6a13dd 100644 --- a/vision/config/api_keys.py +++ b/vision/config/api_keys.py @@ -7,12 +7,12 @@ langchain_api_key = os.getenv("LANGCHAIN_API_KEY") roboflow_api_key = os.getenv("ROBOFLOW_API_KEY") -if google_api_key is None or langchain_api_key is None: +if google_api_key is None or langchain_api_key is None or roboflow_api_key is None: raise ValueError( - "Environment variables GOOGLE_API_KEY and LANGCHAIN_API_KEY must be set" + "Environment variables GOOGLE_API_KEY and LANGCHAIN_API_KEY and ROBOFLOW_API_KEY must be set" ) os.environ["GOOGLE_API_KEY"] = google_api_key os.environ["LANGCHAIN_TRACING_V2"] = "true" os.environ["LANGCHAIN_API_KEY"] = langchain_api_key -os.environ["ROBOFLOW_API_KEY"] = roboflow_api_key \ No newline at end of file +os.environ["ROBOFLOW_API_KEY"] = roboflow_api_key diff --git a/vision/routes.py b/vision/routes.py index 6f0c8c2..293f0cd 100644 --- a/vision/routes.py +++ b/vision/routes.py @@ -1,6 +1,6 @@ import time import os -from fastapi import APIRouter, 
UploadFile, File, Form, HTTPException +from fastapi import APIRouter, UploadFile, File, Form, HTTPException, Path from fastapi.responses import JSONResponse from .config.logging_config import configure_logging from .config.mongo import db @@ -32,9 +32,17 @@ async def process_ocr(image: UploadFile = File(...), expected_values: str = Form expected_values = parse_json_content(expected_values) - process_ocr_task.delay(temp_image_path, image.content_type, expected_values) + # Start the Celery task + task = process_ocr_task.delay(temp_image_path, image.content_type, expected_values) - return JSONResponse(content={"status": "success"}) + return JSONResponse( + content={ + "status": "success", + "message": "OCR processing started successfully", + "task_id": task.id, + "processing_note": "The OCR data will be processed and compared with expected values if provided." + } + ) except Exception as e: logger.error("Error during OCR processing at process_ocr: %s", str(e)) raise HTTPException(status_code=500, detail="Internal Server Error") @@ -45,8 +53,10 @@ async def get_orders(page: int = 1, limit: int = 10): skip = (page - 1) * limit total_orders = db["logs"].count_documents({}) orders = list(db["logs"].find().skip(skip).limit(limit)) + for order in orders: order["_id"] = str(order["_id"]) + return JSONResponse( content={ "orders": orders, diff --git a/vision/tasks/process_ocr_task.py b/vision/tasks/process_ocr_task.py index fd868c1..8b840cf 100644 --- a/vision/tasks/process_ocr_task.py +++ b/vision/tasks/process_ocr_task.py @@ -1,9 +1,10 @@ from ..config.celery_worker import celery_app import os +import json from ..utils.image_processing import segment_image, encode_image_to_base64, draw_bounding_boxes from ..utils.llm_invoke import LLMInvoker -from ..utils.sanitize import strip_json_markers, parse_json_content -from ..utils.prompt.load_prompt import load_input_prompt +from ..utils.sanitize import parse_json_content +from ..utils.prompt.load_prompt import load_input_prompt, 
load_comparison_prompt from ..constants import MODEL_NAMES from ..utils.db_operations import store_order_log_in_db, get_next_order_id from ..config.roboflow import get_roboflow_client @@ -30,12 +31,28 @@ def process_ocr_task(temp_image_path, image_content_type, expected_values): encoded_image_base64 = encode_image_to_base64(output_image_path) logger.info("Image encoding completed") - # Call LLM + # Call LLM for OCR extraction input_prompt = load_input_prompt() llm_invoker = LLMInvoker(MODEL_NAMES["GEMINI_FLASH_LITE"]) ai_msg = llm_invoker.invoke(input_prompt, image_content_type, encoded_image_base64) actual_values = parse_json_content(ai_msg.content) - logger.info("AI message received from LLM") + logger.info("AI message received from LLM for OCR extraction") + + # Perform comparison with expected values if provided + review_result = None + if expected_values: + logger.info("Expected values provided, performing comparison") + comparison_prompt = load_comparison_prompt() + + # Create a text-only comparison prompt with expected and actual values + comparison_text = f"{comparison_prompt}\n\nEXPECTED VALUES:\n{json.dumps(expected_values, indent=2)}\n\nACTUAL VALUES:\n{json.dumps(actual_values, indent=2)}" + + comparison_invoker = LLMInvoker(MODEL_NAMES["GEMINI_FLASH_LITE"]) + comparison_msg = comparison_invoker.invoke_text_only(comparison_text) + logger.info(f"Comparison message: {comparison_msg}") + review_result = parse_json_content(comparison_msg.content) + logger.info(f"Comparison result: {review_result}") + logger.info(f"Comparison completed with review status: {review_result.get('overall_review', {}).get('status', 'unknown')}") # Clean up os.remove(output_image_path) @@ -43,4 +60,4 @@ def process_ocr_task(temp_image_path, image_content_type, expected_values): os.remove(temp_image_path) # Store results in DB - store_order_log_in_db(order_id, expected_values, actual_values) + store_order_log_in_db(order_id, expected_values, actual_values, review_result) diff --git 
a/vision/utils/db_operations.py b/vision/utils/db_operations.py index 712bf88..09ab787 100644 --- a/vision/utils/db_operations.py +++ b/vision/utils/db_operations.py @@ -4,10 +4,22 @@ def get_next_order_id() -> int: """Get the next order ID by counting existing documents""" return db["logs"].count_documents({}) + 1 -def store_order_log_in_db(order_id: int, expected_values: list, actual_values: dict) -> None: - """Insert a new log record into the database""" - db["logs"].insert_one({ +def store_order_log_in_db(order_id: int, expected_values: list, actual_values: dict, review_result: dict = None) -> None: + """Insert a new log record into the database + + Args: + order_id: Unique identifier for the order + expected_values: Expected values provided by the user + actual_values: Actual values extracted by OCR + review_result: Comparison results and review status from AI comparison + """ + log_entry = { "order_id": order_id, "expected_values": expected_values, "actual_values": actual_values - }) + } + + if review_result: + log_entry["review"] = review_result + + db["logs"].insert_one(log_entry) diff --git a/vision/utils/llm_invoke.py b/vision/utils/llm_invoke.py index 895192e..730de6c 100644 --- a/vision/utils/llm_invoke.py +++ b/vision/utils/llm_invoke.py @@ -20,3 +20,21 @@ def invoke(self, prompt: str, image_content_type: str, segmented_image_base64: s ) ai_msg = self.llm.invoke([message]) return ai_msg + + def invoke_text_only(self, prompt: str): + """ + Invoke the LLM with text-only prompt, no image. 
+ + Args: + prompt (str): The text prompt to send to the LLM + + Returns: + The AI message response + """ + message = HumanMessage( + content=[ + {"type": "text", "text": prompt} + ] + ) + ai_msg = self.llm.invoke([message]) + return ai_msg diff --git a/vision/utils/prompt/comparison_prompt.txt b/vision/utils/prompt/comparison_prompt.txt new file mode 100644 index 0000000..a4b3564 --- /dev/null +++ b/vision/utils/prompt/comparison_prompt.txt @@ -0,0 +1,63 @@ +You are a sophisticated AI system designed to compare expected data with actual extracted data from product packaging and fruit analysis. Your task is to determine if the actual values match the expected values semantically, even if they are not exact text matches. + +TASK: +1. Compare each field in the expected values with the corresponding field in the actual values. +2. Consider semantic equivalence rather than exact string matching. +3. Analyze each comparison pair and determine if they are: + - MATCH: The values are semantically equivalent even if written differently + - PARTIAL_MATCH: The values have some overlapping information but don't fully match + - MISMATCH: The values have significantly different meanings or critical information is missing + +REVIEW PROCESS: +1. Field-by-Field Comparison: + - For each field in the expected values, find its corresponding field in the actual values + - Compare the values, accounting for: + * Different formatting (dates, weights, measurements) + * Synonyms or alternative phrasing + * Abbreviated vs. full forms + * Case differences + * Minor spelling variations + +2. Intelligent Analysis: + - For ingredients lists, check if all important ingredients are represented (order may differ) + - For dates, normalize formats before comparison (YYYY-MM-DD) + - For weights/measures, normalize units before comparison (convert g to grams, etc.) + - For brand/manufacturer names, consider parent companies and subsidiaries as matches + +3. 
Overall Review Status: + - Based on the field-by-field comparison, determine an overall status: + * "approved" - All critical fields match or have acceptable minor variations + * "needs_review" - Some fields have partial matches or minor discrepancies requiring human review + * "unmatched" - Critical fields have significant discrepancies + +RESPONSE FORMAT: +Provide your analysis as a JSON object with the following structure: +``` +{ + "field_comparisons": [ + { + "field": "Manufacturer", + "expected": "Original expected value", + "actual": "Original actual value", + "status": "MATCH/PARTIAL_MATCH/MISMATCH", + "reasoning": "Brief explanation of why this status was assigned" + }, + ... + ], + "overall_review": { + "status": "approved/needs_review/unmatched", + "confidence_score": 0.XX, + "explanation": "Brief explanation of the overall status determination", + "critical_issues": ["List any critical issues that led to needs_review or unmatched status"] + } +} +``` + +IMPORTANT CONSIDERATIONS: +- Prioritize critical fields (product name, expiry date, ingredients) over less important ones +- For fruit freshness, consider classification within 1 stage as a partial match (e.g., "Ripe" vs "Overripe") +- Be lenient with formatting differences but strict with numerical values in critical fields +- Consider cultural and regional variations in product naming and description +- Factor in common OCR errors and misspellings in your comparison + +Your goal is to provide a reliable assessment that would align with a human expert's judgment on whether the actual extracted data sufficiently matches what was expected. 
diff --git a/vision/utils/prompt/load_prompt.py b/vision/utils/prompt/load_prompt.py index 3f9cbe9..030a8f0 100644 --- a/vision/utils/prompt/load_prompt.py +++ b/vision/utils/prompt/load_prompt.py @@ -1,5 +1,3 @@ - - import os def load_input_prompt(): @@ -12,3 +10,14 @@ def load_input_prompt(): with open(prompt_path, "r") as file: prompt = file.read().strip() return prompt + +def load_comparison_prompt(): + """ + Loads the comparison prompt from the comparison_prompt.txt file located in the same directory as this script. + Returns: + str: The contents of the comparison prompt file. + """ + prompt_path = os.path.join(os.path.dirname(__file__), "comparison_prompt.txt") + with open(prompt_path, "r") as file: + prompt = file.read().strip() + return prompt From 634a3a33a849176fca8f7eebeeed83d80b5339a2 Mon Sep 17 00:00:00 2001 From: Suman Sahoo Date: Wed, 21 May 2025 16:05:41 +0530 Subject: [PATCH 2/2] Added default values JSON, implemented Sidebar component for navigation, and created image processing script for preprocessing images. 
--- .../default_examples/default_values.json | 28 +++++++++++++++++ frontend/src/components/Sidebar.jsx | 18 +++++++++++ utils 19-33-32-223/image_process.py | 30 +++++++++++++++++++ 3 files changed, 76 insertions(+) create mode 100644 frontend/src/assets/default_examples/default_values.json create mode 100644 frontend/src/components/Sidebar.jsx create mode 100644 utils 19-33-32-223/image_process.py diff --git a/frontend/src/assets/default_examples/default_values.json b/frontend/src/assets/default_examples/default_values.json new file mode 100644 index 0000000..be20ca3 --- /dev/null +++ b/frontend/src/assets/default_examples/default_values.json @@ -0,0 +1,28 @@ +[ + { + "image": "example1.png", + "values": { + "manufacturer": "Example Manufacturer", + "productName": "Example Product", + "ingredients": "Example Ingredients", + "manufacturingDate": "2023-01-01", + "expiryDate": "2024-01-01", + "netWeight": "100g", + "barcode": "123456789", + "otherDetails": "Example Details" + } + }, + { + "image": "example2.png", + "values": { + "manufacturer": "Another Manufacturer", + "productName": "Another Product", + "ingredients": "Another Ingredients", + "manufacturingDate": "2023-02-01", + "expiryDate": "2024-02-01", + "netWeight": "200g", + "barcode": "987654321", + "otherDetails": "Another Details" + } + } +] diff --git a/frontend/src/components/Sidebar.jsx b/frontend/src/components/Sidebar.jsx new file mode 100644 index 0000000..55347f2 --- /dev/null +++ b/frontend/src/components/Sidebar.jsx @@ -0,0 +1,18 @@ +import { Link } from 'react-router-dom'; + +function Sidebar() { + return ( + + ); +} + +export default Sidebar; \ No newline at end of file diff --git a/utils 19-33-32-223/image_process.py b/utils 19-33-32-223/image_process.py new file mode 100644 index 0000000..d1b04a8 --- /dev/null +++ b/utils 19-33-32-223/image_process.py @@ -0,0 +1,30 @@ +import cv2 +import numpy as np + +def preprocess_image(img_path): + # Read the image + img = cv2.imread(img_path) + + # 
Normalize brightness and contrast using histogram equalization + img_yuv = cv2.cvtColor(img, cv2.COLOR_BGR2YUV) + img_yuv[:,:,0] = cv2.equalizeHist(img_yuv[:,:,0]) + img_normalized = cv2.cvtColor(img_yuv, cv2.COLOR_YUV2BGR) + + # Apply bilateral filtering to reduce noise but preserve edges + img_filtered = cv2.bilateralFilter(img_normalized, 9, 75, 75) + + # Segmentation using GrabCut + mask = np.zeros(img.shape[:2], np.uint8) + bgdModel = np.zeros((1, 65), np.float64) + fgdModel = np.zeros((1, 65), np.float64) + rect = (50, 50, img.shape[1]-50, img.shape[0]-50) # Rectangle for the object + cv2.grabCut(img_filtered, mask, rect, bgdModel, fgdModel, 5, cv2.GC_INIT_WITH_RECT) + mask2 = np.where((mask == 2) | (mask == 0), 0, 1).astype('uint8') + img_segmented = img_filtered * mask2[:, :, np.newaxis] + + return img_segmented + + +# Example usage +img_filtered = preprocess_image('dataset/object_1/image.png') +cv2.imwrite('preprocessed_image.jpg', img_filtered)