diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..4a27ee1 --- /dev/null +++ b/.gitignore @@ -0,0 +1,61 @@ +# Environment variables +.env +.env.local +.env.*.local + +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# Virtual environments +venv/ +.venv/ +env/ +ENV/ +env.bak/ +venv.bak/ + +# IDEs +.vscode/ +.idea/ +*.swp +*.swo +*~ +.DS_Store + +# Jupyter Notebook +.ipynb_checkpoints + +# Testing +.pytest_cache/ +.coverage +htmlcov/ +.tox/ + +# Logs +*.log +logs/ + +# Temporary files +*.tmp +*.temp +tmp/ +temp/ \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..8018075 --- /dev/null +++ b/README.md @@ -0,0 +1,116 @@ +# Image Description Generator + +A FastAPI web application that generates detailed descriptions of images using Claude's vision capabilities. + +## Features + +- ๐ผ๏ธ **Image Upload UI**: Drag-and-drop or browse to upload images +- ๐ **HEIC Support**: Automatically converts HEIC images to JPEG +- ๐ **Smart Resizing**: Automatically optimizes large images (max 5MB, 1568px) +- ๐จ **Multiple Formats**: Supports JPEG, PNG, WebP, GIF, and HEIC +- ๐ค **AI-Powered**: Uses Claude Sonnet 4.5 for detailed image descriptions +- โก **Real-time Processing**: Fast image analysis with loading indicators + +## Project Structure + +``` +Oct-4-Hackathon-2025-/ +โโโ src/ +โ โโโ __init__.py # Package initialization +โ โโโ img_generator.py # Main FastAPI application +โ โโโ vocab_generator.py # Vocabulary generation utilities +โ โโโ text_to_speek.py # Text-to-speech functionality +โ โโโ .env # Environment variables +โโโ requirements.txt # Python dependencies +โโโ .gitignore # Git ignore rules +โโโ README.md # This file +``` + +## Setup + +### 1. 
Create Virtual Environment + +```bash +python -m venv .venv +source .venv/bin/activate # On Windows: .venv\Scripts\activate +``` + +### 2. Install Dependencies + +```bash +pip install -r requirements.txt +``` + +### 3. Configure Environment Variables + +Create a `.env` file in the project root (one level above `src/` — the app loads `.env` from there): + +```env +ANTHROPIC_API_KEY=your_api_key_here +IMG_GENERATOR_ANTHRPIC_API_KEY=your_api_key_here +``` + +### 4. Run the Application + +```bash +python src/img_generator.py +``` + +The application will be available at `http://localhost:8000` + +## Usage + +1. Open your browser and navigate to `http://localhost:8000` +2. Upload an image by: + - Dragging and dropping it into the upload area + - Clicking "Browse Files" to select an image +3. Click "Analyze Image" to generate a description +4. View the AI-generated description below + +## API Endpoints + +### `GET /` +Returns the web UI for image upload + +### `POST /describe-image` +Upload an image and receive a detailed description + +**Request:** +- Method: `POST` +- Content-Type: `multipart/form-data` +- Body: `file` (image file) + +**Response:** +```json +{ + "success": true, + "filename": "example.jpg", + "description": "Detailed image description...", + "model": "claude-sonnet-4-5-20250929" +} +``` + +## Supported Image Formats + +- JPEG/JPG +- PNG +- WebP +- GIF +- HEIC (automatically converted to JPEG) + +## Image Processing Features + +- **Automatic HEIC Conversion**: HEIC images are converted to JPEG for API compatibility +- **Smart Resizing**: Images larger than 5MB or exceeding 1568px in any dimension are automatically resized +- **Quality Optimization**: Uses LANCZOS resampling and compression for optimal quality/size balance + +## Technologies Used + +- **FastAPI**: Web framework +- **Anthropic Claude API**: AI image description +- **Pillow**: Image processing +- **pillow-heif**: HEIC format support + +## License + +MIT License diff --git a/README_WEB_UI.md b/README_WEB_UI.md new file mode 100644 index 
0000000..31d05a8 --- /dev/null +++ b/README_WEB_UI.md @@ -0,0 +1,51 @@ +# Infinite Craft-Style Context Wordlist Generator + +A beautiful web UI inspired by Infinite Craft that generates contextual wordlists using AI. + +## Setup + +All dependencies are already installed in the `myenv` conda environment! + +**To run the app:** +```bash +./run_app.sh +``` + +Or manually activate the conda environment: +```bash +conda activate myenv +python app.py +``` + +**Open your browser:** +Navigate to http://localhost:5001 + +> Note: API keys are read from a `.env` file (`ANTHROPIC_API_KEY` and `OPENAI_API_KEY`) + +## Usage + +1. Enter any context in the input field (e.g., "software development team meeting", "cooking in a professional kitchen", "underwater marine biology") +2. Click "Generate Words" +3. Wait a moment while AI generates ~100 relevant vocabulary terms +4. Terms appear as draggable cards in the Infinite Craft style +5. Use the search box to filter terms +6. Click the download button to export the wordlist as a text file +7. 
Click the trash icon to clear all terms + +## Features + +- ๐จ **Infinite Craft-style UI** with animated particle background +- ๐ **Context-based generation** using Claude and OpenAI embeddings +- ๐ **Real-time search** to filter displayed terms +- ๐ฅ **Export functionality** to download wordlists +- ๐ฏ **Draggable cards** for visual organization +- โก **Fast and efficient** vocabulary generation + +## How It Works + +The app uses: +- **Claude (Sonnet 4.5)** to generate candidate terms +- **OpenAI embeddings** for semantic similarity +- **spaCy** for linguistic analysis +- **MMR algorithm** for diversity +- **Category-based quotas** for balanced results diff --git a/app.py b/app.py new file mode 100644 index 0000000..2431d6d --- /dev/null +++ b/app.py @@ -0,0 +1,249 @@ +from flask import Flask, render_template, request, jsonify +from flask_cors import CORS +import anthropic +import os +from openai import OpenAI +from rank_terms import generate_terms +import base64 +from PIL import Image +import io +from dotenv import load_dotenv + +# Load environment variables +load_dotenv() + +app = Flask(__name__) +CORS(app) + +# API keys from .env file +ANTHROPIC_KEY = os.getenv("ANTHROPIC_API_KEY") +OPENAI_KEY = os.getenv("OPENAI_API_KEY") + +# Initialize clients +anthropic_client = anthropic.Anthropic(api_key=ANTHROPIC_KEY) +openai_client = OpenAI(api_key=OPENAI_KEY) +print("โ API clients initialized") + +@app.route('/') +def index(): + return render_template('index.html') + +def add_emojis_to_terms(terms, anthropic_client): + """ + Add emojis to a list of terms using a single API call. + Returns a list of terms with emojis prepended. + """ + # Format terms as a comma-separated list + terms_str = ", ".join(terms) + + prompt = f"""For each of these words/phrases, add a single relevant emoji that best represents it. + +Words: {terms_str} + +Return ONLY a comma-separated list with each word prefixed by its emoji and a space. 
+Format: "emoji word, emoji word, emoji word" + +Example input: "run, think, water" +Example output: "๐ run, ๐ญ think, ๐ง water" + +Be concise. Use the most appropriate single emoji for each term. Output the list on one line.""" + + try: + message = anthropic_client.messages.create( + model="claude-sonnet-4-5-20250929", + max_tokens=2000, + messages=[{"role": "user", "content": prompt}] + ) + + response_text = message.content[0].text.strip() + + # Parse the response - split by comma and clean up + emoji_terms = [term.strip() for term in response_text.split(',')] + + # Fallback: if parsing fails, return terms with default emoji + if len(emoji_terms) != len(terms): + return [f"โจ {term}" for term in terms] + + return emoji_terms + + except Exception as e: + print(f"Error adding emojis: {e}") + # Fallback: return terms with default emoji + return [f"โจ {term}" for term in terms] + +@app.route('/generate', methods=['POST']) +def generate(): + try: + data = request.json + context = data.get('context', '') + + if not context: + return jsonify({'error': 'Context is required'}), 400 + + # Generate terms using the rank_terms module + result = generate_terms( + context, + n=100, + anthropic_client=anthropic_client, + openai_client=openai_client + ) + + # Extract just the terms + terms = [item['term'] for item in result['terms']] + + # Add emojis with a single API call + print("Adding emojis to terms...") + emoji_terms = add_emojis_to_terms(terms, anthropic_client) + + return jsonify({ + 'success': True, + 'terms': emoji_terms, + 'context': context + }) + + except Exception as e: + return jsonify({ + 'success': False, + 'error': str(e) + }), 500 + +@app.route('/generate-sentences', methods=['POST']) +def generate_sentences(): + try: + data = request.json + words = data.get('words', []) + + if not words: + return jsonify({'error': 'Words are required'}), 400 + + # Remove emojis from words for cleaner sentence generation + clean_words = [word.split(' ', 1)[-1] if ' ' in word 
else word for word in words] + words_str = ", ".join(clean_words) + + prompt = f"""Create 15-20 different short, simple sentences using ONLY these words IN THIS EXACT ORDER: {words_str} + +CRITICAL RULES: +- Use ONLY the words provided - DO NOT add any other content words +- You may ONLY add function words (the, a, an, is, are, was, were, to, at, in, on, etc.) +- You may conjugate verbs as necessary (add -s, -ed, -ing) +- You may add plural markers (-s, -es) +- Keep the exact order of the content words given +- Make the sentences grammatically correct +- Be natural and simple +- Vary the sentence structures and function words used +- Show different ways to express the same idea with the given words + +Return ONLY the sentences, one per line. No numbering, no extra text.""" + + message = anthropic_client.messages.create( + model="claude-sonnet-4-5-20250929", + max_tokens=2500, + messages=[{"role": "user", "content": prompt}] + ) + + response_text = message.content[0].text.strip() + sentences = [s.strip() for s in response_text.split('\n') if s.strip()] + + return jsonify({ + 'success': True, + 'sentences': sentences + }) + + except Exception as e: + return jsonify({ + 'success': False, + 'error': str(e) + }), 500 + +@app.route('/analyze-image', methods=['POST']) +def analyze_image(): + try: + if 'image' not in request.files: + return jsonify({'error': 'No image file provided'}), 400 + + file = request.files['image'] + if file.filename == '': + return jsonify({'error': 'No file selected'}), 400 + + # Read and process the image + image_bytes = file.read() + + # Resize if needed (max 5MB, max dimension 1568px) + image = Image.open(io.BytesIO(image_bytes)) + + # Convert RGBA to RGB if needed + if image.mode in ('RGBA', 'LA', 'P'): + background = Image.new('RGB', image.size, (255, 255, 255)) + if image.mode == 'P': + image = image.convert('RGBA') + background.paste(image, mask=image.split()[-1] if image.mode in ('RGBA', 'LA') else None) + image = background + elif 
image.mode != 'RGB': + image = image.convert('RGB') + + # Resize if too large + max_dimension = 1568 + if max(image.size) > max_dimension: + ratio = max_dimension / max(image.size) + new_size = tuple(int(dim * ratio) for dim in image.size) + image = image.resize(new_size, Image.Resampling.LANCZOS) + + # Convert back to bytes + img_byte_arr = io.BytesIO() + image.save(img_byte_arr, format='JPEG', quality=85) + img_byte_arr.seek(0) + image_bytes = img_byte_arr.read() + + # Encode to base64 + image_base64 = base64.standard_b64encode(image_bytes).decode("utf-8") + + # Generate description using Claude's vision + prompt = """Describe this image in a way that would help generate vocabulary words for someone learning to communicate. +Focus on: +- Main objects and subjects +- Actions taking place +- Setting and environment +- Important details +- Overall context + +Provide a clear, concise description (2-3 sentences).""" + + message = anthropic_client.messages.create( + model="claude-sonnet-4-5-20250929", + max_tokens=1024, + messages=[ + { + "role": "user", + "content": [ + { + "type": "image", + "source": { + "type": "base64", + "media_type": "image/jpeg", + "data": image_base64, + }, + }, + { + "type": "text", + "text": prompt + } + ], + } + ], + ) + + description = message.content[0].text.strip() + + return jsonify({ + 'success': True, + 'description': description + }) + + except Exception as e: + return jsonify({ + 'success': False, + 'error': str(e) + }), 500 + +if __name__ == '__main__': + app.run(debug=True, port=5001) diff --git a/rank_terms.py b/rank_terms.py new file mode 100644 index 0000000..e8fd2c9 --- /dev/null +++ b/rank_terms.py @@ -0,0 +1,632 @@ +""" +Term Ranking System for Context-Based Vocabulary Generation + +Given a context sentence, generates ~100 relevant, usable terms for that scenario. +Uses embeddings, semantic similarity, and diversity algorithms. 
+""" + +import anthropic +import os +import json +import numpy as np +from typing import List, Dict, Tuple, Optional +from collections import defaultdict, Counter +import spacy +from sklearn.metrics.pairwise import cosine_similarity +from sklearn.cluster import KMeans +import re +from openai import OpenAI +from dotenv import load_dotenv + +# Load environment variables +load_dotenv() + +# Try to load spaCy model +try: + nlp = spacy.load("en_core_web_sm") +except: + print("Downloading spaCy model...") + os.system("python -m spacy download en_core_web_sm") + nlp = spacy.load("en_core_web_sm") + + +# Configuration +CONFIG = { + "target_count": 100, + "neighbor_pool": 500, # Reduced for API efficiency + "cluster_k": 10, + "mmr_lambda": 0.7, + "spread_threshold": 0.38, + "category_quotas": { + "Action/Task": 25, + "Tech/Tool": 20, + "Problem/Error": 10, + "Data/Artifact": 20, + "Concept/Method": 15, + "Event/Logistics": 10 + }, + "seeds": { + "action": ["work", "create", "write", "read", "help", "learn", "teach", "talk", + "click", "type", "save", "open", "edit", "share", "test", "run", + "tokenize", "debug", "parse", "analyze"], + "decor": ["room", "chair", "vibe", "light", "atmosphere", "wall", "ceiling", + "furniture", "decoration", "ambiance", "setting"] + }, + "stoplist_extra": ["folks", "guys", "stuff", "thing", "really", "very", "quite"] +} + + +def embed_text(text: str, openai_client: OpenAI) -> np.ndarray: + """ + Embed text using OpenAI's text-embedding-3-small model. + Fast, cheap, and high quality (1536 dimensions). 
+ """ + # Handle empty or very short text + if not text or len(text.strip()) < 2: + return np.zeros(1536) + + try: + response = openai_client.embeddings.create( + model="text-embedding-3-small", + input=text, + encoding_format="float" + ) + + embedding = np.array(response.data[0].embedding) + return embedding + + except Exception as e: + print(f"Warning: Embedding failed for '{text[:50]}...': {e}") + return np.zeros(1536) + + +def embed_batch(texts: List[str], openai_client: OpenAI, batch_size: int = 100) -> List[np.ndarray]: + """ + Embed multiple texts in batches for efficiency. + OpenAI allows up to 2048 texts per request. + """ + embeddings = [] + + for i in range(0, len(texts), batch_size): + batch = texts[i:i+batch_size] + + try: + response = openai_client.embeddings.create( + model="text-embedding-3-small", + input=batch, + encoding_format="float" + ) + + batch_embeddings = [np.array(item.embedding) for item in response.data] + embeddings.extend(batch_embeddings) + + except Exception as e: + print(f"Warning: Batch embedding failed for batch {i//batch_size}: {e}") + # Fallback: add zero vectors + embeddings.extend([np.zeros(1536) for _ in batch]) + + return embeddings + + +def generate_candidate_terms(client: anthropic.Anthropic, context: str, n: int = 500) -> List[str]: + """ + Generate candidate terms using LLM. + """ + prompt = f"""Given this context: "{context}" + +Generate {n} SINGLE WORDS for a VOCABULARY LIST that would help someone discuss this type of situation. + +CRITICAL: Generate GENERAL, REUSABLE vocabulary - NOT specific image descriptions! 
+ +โ BAD (too specific to this exact scenario): +- "yacht", "five friends", "another boat", "ceiling-mounted", "huskyboard" + +โ GOOD (general vocabulary for this TYPE of situation): +- For BOATING: "boat", "water", "sail", "friend", "trip", "ocean", "wave", "captain" +- For CLASSROOM: "student", "teacher", "learn", "desk", "board", "question", "study" +- For HOME: "cook", "eat", "sleep", "relax", "family", "room", "comfortable" + +Match vocabulary to the DOMAIN: +- Boating/water โ boat, water, sail, ocean, wave, dock, captain, crew, anchor +- School/learning โ student, teacher, study, learn, desk, board, question, test +- Work/office โ work, meeting, task, project, deadline, colleague, email +- Home โ cook, eat, sleep, relax, family, room, comfortable, clean +- Tech/coding โ code, program, debug, test, build, deploy (ONLY if context is technical) + +Rules: +1. SINGLE words only (maximum 2 words for compound terms like "swimming pool") +2. GENERAL vocabulary for the situation type, not specific details +3. NO numbers or quantities ("five", "twenty", "several" is OK but "five friends" is NOT) +4. NO articles or demonstratives ("another boat", "the yacht", "this person") +5. Include: basic verbs, basic nouns, common adjectives, useful descriptive words +6. 
NO proper nouns or brand names + +Output ONLY single words, comma-separated.""" + + message = client.messages.create( + model="claude-sonnet-4-5-20250929", + max_tokens=3000, + messages=[ + {"role": "user", "content": prompt} + ] + ) + + response_text = message.content[0].text.strip() + + # Clean response + if response_text.startswith('```'): + lines = response_text.split('\n') + for line in lines: + if line.strip() and not line.startswith('```'): + response_text = line + break + + # Parse terms + terms = [term.strip() for term in response_text.split(',') if term.strip()] + terms = [term for term in terms if not term.startswith('```')] + + return terms + + +def extract_terms_from_text(text: str) -> List[str]: + """ + Extract noun chunks, entities, and key terms from text using spaCy. + """ + doc = nlp(text) + terms = [] + + # Noun chunks + terms.extend([chunk.lemma_.lower() for chunk in doc.noun_chunks]) + + # Named entities (ORG, PRODUCT, etc.) + terms.extend([ent.text for ent in doc.ents + if ent.label_ in ['ORG', 'PRODUCT', 'GPE', 'EVENT', 'LAW']]) + + # Important nouns and verbs + terms.extend([token.lemma_ for token in doc + if token.pos_ in ['NOUN', 'PROPN', 'VERB'] + and not token.is_stop and token.is_alpha]) + + return terms + + +def normalize_and_dedupe(terms: List[str]) -> List[str]: + """ + Normalize terms and remove duplicates. + Filter out overly specific phrases and proper nouns. 
+ """ + normalized = [] + seen = set() + + # Common proper nouns to filter (libraries, brands, specific products) + proper_noun_filter = { + 'spacy', 'nltk', 'sklearn', 'pytorch', 'tensorflow', 'keras', 'numpy', 'pandas', + 'matplotlib', 'jupyter', 'openai', 'anthropic', 'claude', 'chatgpt', 'gpt', + 'python', 'javascript', 'typescript', 'java', 'react', 'vue', 'angular', + 'docker', 'kubernetes', 'aws', 'azure', 'gcp', 'github', 'gitlab', + 'postgresql', 'mysql', 'mongodb', 'redis', 'elasticsearch', + 'fastapi', 'django', 'flask', 'express', 'nextjs', 'node', + 'vscode', 'pycharm', 'intellij', 'eclipse', 'vim', 'emacs' + } + + # Phrases that indicate overly specific/descriptive content + bad_phrase_patterns = [ + 'in short', 'in summary', 'in brief', 'in other words', + 'another', 'this', 'that', 'these', 'those', + 'five friend', 'three people', 'two student', 'ten person' + ] + + for term in terms: + # Basic normalization + term = term.strip().lower() + + # Skip if too short, too long, or in stoplist + if len(term) < 2 or len(term) > 30: # Reduced max length to 30 + continue + if term in CONFIG["stoplist_extra"]: + continue + + # Skip overly specific descriptive phrases + if any(pattern in term for pattern in bad_phrase_patterns): + continue + + # Skip multi-word phrases with "another", "the", articles + word_count = len(term.split()) + if word_count > 2: # Reject anything with more than 2 words + continue + if word_count == 2 and any(word in term.split() for word in ['another', 'the', 'a', 'an', 'this', 'that']): + continue + + # Filter out most proper nouns (keep only essential generic ones) + doc = nlp(term) + if len(doc) > 0: + # Skip if it's a proper noun AND in our filter list + if doc[0].pos_ == 'PROPN' and term in proper_noun_filter: + continue + lemma = doc[0].lemma_ + else: + lemma = term + + # Check for near-duplicates (simple approach) + if lemma not in seen: + normalized.append(term) + seen.add(lemma) + + return normalized + + +def 
categorize_term(term: str) -> str: + """ + Categorize a term into one of the predefined categories. + """ + doc = nlp(term) + + # Tech/Tool patterns + tech_patterns = [ + r'(?i)(spacy|nltk|sklearn|pytorch|tensorflow|pandas|numpy|matplotlib|jupyter|faiss)', + r'(?i)(python|java|javascript|sql|api|framework|library)', + ] + for pattern in tech_patterns: + if re.search(pattern, term): + return "Tech/Tool" + + # Problem/Error patterns + error_patterns = [ + r'(?i)(error|exception|fail|unexpected|issue|bug|warning|crash)', + r'(?i)(wrong|invalid|corrupt|missing|broken)', + ] + for pattern in error_patterns: + if re.search(pattern, term): + return "Problem/Error" + + # Data/Artifact patterns + data_patterns = [ + r'(?i)(data|dataset|model|output|input|file|document|corpus)', + r'(?i)(matrix|vector|tensor|array|table|schema|weights)', + ] + for pattern in data_patterns: + if re.search(pattern, term): + return "Data/Artifact" + + # Event/Logistics patterns + event_patterns = [ + r'(?i)(presentation|talk|workshop|session|meeting|check-in|raffle)', + r'(?i)(schedule|agenda|timer|break|lunch)', + ] + for pattern in event_patterns: + if re.search(pattern, term): + return "Event/Logistics" + + # Action/Task (verbs) + if len(doc) > 0 and doc[0].pos_ == 'VERB': + return "Action/Task" + + # Concept/Method (abstract nouns) + concept_patterns = [ + r'(?i)(tokenization|lemmatization|normalization|embedding|similarity)', + r'(?i)(algorithm|method|technique|approach|process|analysis)', + ] + for pattern in concept_patterns: + if re.search(pattern, term): + return "Concept/Method" + + # Default based on POS + if len(doc) > 0: + if doc[0].pos_ == 'VERB': + return "Action/Task" + elif doc[0].pos_ in ['NOUN', 'PROPN']: + return "Concept/Method" + + return "Concept/Method" + + +def compute_term_vectors(terms: List[str], openai_client: OpenAI) -> Dict[str, np.ndarray]: + """ + Compute embeddings for each term using OpenAI in batches. 
+ """ + print(f" Embedding {len(terms)} terms in batches...") + embeddings = embed_batch(terms, openai_client, batch_size=100) + + vectors = {} + for term, emb in zip(terms, embeddings): + vectors[term] = emb + + return vectors + + +def compute_signals(terms: List[str], term_vectors: Dict[str, np.ndarray], + ctx_vec: np.ndarray, openai_client: OpenAI) -> Dict[str, Dict]: + """ + Compute relevance signals for each term. + """ + # Compute prototype vectors + action_vecs = embed_batch(CONFIG["seeds"]["action"][:5], openai_client) + decor_vecs = embed_batch(CONFIG["seeds"]["decor"][:5], openai_client) + + proto_action = np.mean(action_vecs, axis=0) + proto_decor = np.mean(decor_vecs, axis=0) + + signals = {} + + for term in terms: + if term not in term_vectors: + continue + + v = term_vectors[term] + + # Similarity to context + sim_topic = cosine_similarity([v], [ctx_vec])[0][0] + + # Action margin + sim_action = cosine_similarity([v], [proto_action])[0][0] + sim_decor = cosine_similarity([v], [proto_decor])[0][0] + action_margin = sim_action - sim_decor + + signals[term] = { + "sim_topic": float(sim_topic), + "action_margin": float(action_margin), + } + + return signals + + +def score_terms(signals: Dict[str, Dict]) -> Dict[str, float]: + """ + Compute final scores from signals. 
+ """ + scores = {} + + # Normalize signals + all_sim_topic = [s["sim_topic"] for s in signals.values()] + all_action_margin = [s["action_margin"] for s in signals.values()] + + min_sim = min(all_sim_topic) if all_sim_topic else 0 + max_sim = max(all_sim_topic) if all_sim_topic else 1 + min_action = min(all_action_margin) if all_action_margin else 0 + max_action = max(all_action_margin) if all_action_margin else 1 + + for term, sig in signals.items(): + # Normalize + norm_sim = (sig["sim_topic"] - min_sim) / (max_sim - min_sim + 1e-6) + norm_action = (sig["action_margin"] - min_action) / (max_action - min_action + 1e-6) + + # Combined score + score = 0.7 * norm_sim + 0.3 * norm_action + scores[term] = float(score) + + return scores + + +def diversify_mmr(terms: List[str], vectors: Dict[str, np.ndarray], + scores: Dict[str, float], n: int, lambda_param: float = 0.7) -> List[str]: + """ + Maximal Marginal Relevance diversification. + """ + selected = [] + remaining = list(terms) + + # Start with highest-scoring term + remaining.sort(key=lambda t: scores.get(t, 0), reverse=True) + selected.append(remaining.pop(0)) + + while len(selected) < n and remaining: + best_term = None + best_mmr = -float('inf') + + for term in remaining: + if term not in vectors: + continue + + # Relevance score + relevance = scores.get(term, 0) + + # Max similarity to already selected + max_sim = 0 + for sel_term in selected: + if sel_term in vectors: + sim = cosine_similarity([vectors[term]], [vectors[sel_term]])[0][0] + max_sim = max(max_sim, sim) + + # MMR score + mmr = lambda_param * relevance - (1 - lambda_param) * max_sim + + if mmr > best_mmr: + best_mmr = mmr + best_term = term + + if best_term: + selected.append(best_term) + remaining.remove(best_term) + else: + break + + return selected + + +def diversify_with_quotas(terms: List[str], vectors: Dict[str, np.ndarray], + scores: Dict[str, float], categories: Dict[str, str], + target_n: int) -> List[str]: + """ + Diversify using category 
quotas and MMR. + """ + # Group by category + by_category = defaultdict(list) + for term in terms: + cat = categories.get(term, "Concept/Method") + by_category[cat].append(term) + + # Sort within each category + for cat in by_category: + by_category[cat].sort(key=lambda t: scores.get(t, 0), reverse=True) + + # Apply quotas + selected = [] + quotas = CONFIG["category_quotas"] + + for cat, quota in quotas.items(): + if cat in by_category: + # Take top items up to quota + cat_terms = by_category[cat][:quota * 2] # Get extras for MMR + # Apply MMR within category + if cat_terms: + mmr_terms = diversify_mmr(cat_terms, vectors, scores, + min(quota, len(cat_terms)), + CONFIG["mmr_lambda"]) + selected.extend(mmr_terms) + + # Fill remaining slots with highest-scoring terms not yet selected + if len(selected) < target_n: + remaining = [t for t in terms if t not in selected] + remaining.sort(key=lambda t: scores.get(t, 0), reverse=True) + selected.extend(remaining[:target_n - len(selected)]) + + return selected[:target_n] + + +def generate_terms(context: str, n: int = 100, + anthropic_client: anthropic.Anthropic = None, + openai_client: OpenAI = None) -> dict: + """ + Main pipeline: generate ranked terms for a given context. + """ + print(f"\n{'='*70}") + print(f"Generating {n} terms for context:") + print(f" \"{context}\"") + print(f"{'='*70}\n") + + if anthropic_client is None: + anthropic_client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY")) + + if openai_client is None: + openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) + + # 1. Embed context + print("1. Embedding context with OpenAI...") + ctx_vec = embed_text(context, openai_client) + + # 2. Generate candidates + print("2. 
Generating candidate terms with Claude...") + candidates = generate_candidate_terms(anthropic_client, context, CONFIG["neighbor_pool"]) + + # Add terms extracted from context itself + candidates.extend(extract_terms_from_text(context)) + + print(f" Generated {len(candidates)} raw candidates") + + # 3. Normalize and dedupe + print("3. Normalizing and deduplicating...") + candidates = normalize_and_dedupe(candidates) + print(f" {len(candidates)} unique candidates after normalization") + + # 4. Compute vectors + print("4. Computing term vectors with OpenAI embeddings...") + term_vectors = compute_term_vectors(candidates, openai_client) + + # 5. Compute signals + print("5. Computing relevance signals...") + signals = compute_signals(candidates, term_vectors, ctx_vec, openai_client) + + # 6. Score terms + print("6. Scoring terms...") + scores = score_terms(signals) + + # 7. Categorize + print("7. Categorizing terms...") + categories = {term: categorize_term(term) for term in candidates} + + # 8. Diversify + print("8. Applying diversity selection...") + selected = diversify_with_quotas(candidates, term_vectors, scores, categories, n) + + # 9. Build result + print(f"\nโ Selected {len(selected)} terms\n") + + result = { + "context": context, + "terms": [ + { + "term": term, + "score": round(scores.get(term, 0), 3), + "category": categories.get(term, "Concept/Method") + } + for term in selected + ] + } + + # Sort by score within result + result["terms"].sort(key=lambda x: x["score"], reverse=True) + + return result + + +def print_results(result: dict): + """ + Pretty-print results. 
+ """ + print(f"{'='*70}") + print(f"RESULTS FOR: {result['context']}") + print(f"{'='*70}\n") + + # Group by category + by_cat = defaultdict(list) + for item in result["terms"]: + by_cat[item["category"]].append(item) + + for cat in CONFIG["category_quotas"].keys(): + if cat in by_cat: + print(f"\n[{cat}] ({len(by_cat[cat])} terms)") + for item in by_cat[cat][:15]: # Show top 15 per category + print(f" โข {item['term']:30s} (score: {item['score']:.3f})") + + print(f"\n{'='*70}") + print(f"Total: {len(result['terms'])} terms") + print(f"{'='*70}\n") + + +if __name__ == "__main__": + import sys + + # API keys from .env file + anthropic_key = os.getenv("ANTHROPIC_API_KEY") + openai_key = os.getenv("OPENAI_API_KEY") + + if not anthropic_key or not openai_key: + print("ERROR: API keys not found in .env file") + print("Please create a .env file with ANTHROPIC_API_KEY and OPENAI_API_KEY") + sys.exit(1) + + # Get context from command line or prompt + if len(sys.argv) > 1: + context = " ".join(sys.argv[1:]) + else: + context = input("Enter context sentence: ") + + # Initialize clients + try: + anthropic_client = anthropic.Anthropic(api_key=anthropic_key) + openai_client = OpenAI(api_key=openai_key) + except Exception as e: + print(f"ERROR initializing API clients: {e}") + sys.exit(1) + + # Generate terms + try: + result = generate_terms(context, n=100, + anthropic_client=anthropic_client, + openai_client=openai_client) + + # Print results + print_results(result) + + # Save to JSON + output_file = "ranked_terms.json" + with open(output_file, 'w') as f: + json.dump(result, f, indent=2) + + print(f"\nResults saved to {output_file}") + + except Exception as e: + print(f"\nERROR: {e}") + import traceback + traceback.print_exc() + sys.exit(1) diff --git a/requirements-temp.txt b/requirements-temp.txt new file mode 100644 index 0000000..2550302 --- /dev/null +++ b/requirements-temp.txt @@ -0,0 +1,7 @@ +flask +flask-cors +anthropic +openai +spacy +scikit-learn +numpy diff --git 
def get_image_description(image_bytes: bytes, mime_type: str) -> Dict[str, str]:
    """
    Generate a detailed description of an image using Claude's vision capabilities.

    Args:
        image_bytes: The image file as bytes
        mime_type: The MIME type of the image (e.g., 'image/jpeg', 'image/png')

    Returns:
        A dictionary with keys 'description' (the generated text) and
        'model' (the model identifier that produced it)

    Raises:
        ValueError: If neither IMG_GENERATOR_ANTHRPIC_API_KEY nor
            ANTHROPIC_API_KEY is set in the environment.
    """
    # Get API key - try the IMG_GENERATOR-specific key first, then fall back
    # to the general key. (Bug fix: the original checked only the specific
    # key, contradicting its own comment, its error message, and the README.)
    api_key = os.getenv("IMG_GENERATOR_ANTHRPIC_API_KEY") or os.getenv("ANTHROPIC_API_KEY")

    if not api_key:
        raise ValueError("No API key found. Please set ANTHROPIC_API_KEY or IMG_GENERATOR_ANTHRPIC_API_KEY in .env file")

    # Initialize the Anthropic client
    client = anthropic.Anthropic(api_key=api_key)

    # Claude's vision API expects base64-encoded image data.
    image_base64 = base64.standard_b64encode(image_bytes).decode("utf-8")

    # Create the prompt for Claude
    prompt = """Please provide a comprehensive description of this image. Include:

1. Overall Description: What the image shows at a high level
2. Key Elements: Main objects, people, or subjects in the image
3. Details: Colors, composition, setting, and any notable features
4. Context: What the image appears to be about or its purpose
5. Mood/Atmosphere: The feeling or tone conveyed by the image

Be specific and descriptive."""

    # Call the Claude API with vision: one image block followed by the
    # text prompt, as required by the Messages API content format.
    message = client.messages.create(
        model="claude-sonnet-4-5-20250929",
        max_tokens=1024,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": mime_type,
                            "data": image_base64,
                        },
                    },
                    {
                        "type": "text",
                        "text": prompt
                    }
                ],
            }
        ],
    )

    # The response content is a list of blocks; the first carries the text.
    description = message.content[0].text

    return {
        "description": description,
        "model": "claude-sonnet-4-5-20250929"
    }
def resize_image_if_needed(image_bytes: bytes, mime_type: str, max_size_mb: float = 5.0, max_dimension: int = 1568) -> tuple[bytes, str]:
    """
    Resize image if it's too large in file size or dimensions.
    Claude API has limits: max 5MB per image, recommended max dimension 1568px.

    Args:
        image_bytes: The image as bytes
        mime_type: The MIME type of the image
        max_size_mb: Maximum file size in MB (default 5.0)
        max_dimension: Maximum width or height in pixels (default 1568)

    Returns:
        Tuple of (possibly resized bytes, possibly updated mime_type).
        Images already within limits are returned unchanged.
    """
    # Check file size
    size_mb = len(image_bytes) / (1024 * 1024)

    # Open the image
    img = Image.open(BytesIO(image_bytes))

    # Get current dimensions
    width, height = img.size
    needs_resize = False

    # Check if dimensions are too large
    if width > max_dimension or height > max_dimension:
        needs_resize = True
        # Calculate new dimensions maintaining aspect ratio
        if width > height:
            new_width = max_dimension
            new_height = int(height * (max_dimension / width))
        else:
            new_height = max_dimension
            new_width = int(width * (max_dimension / height))
    elif size_mb > max_size_mb:
        # If file is too large but dimensions are ok, reduce dimensions by 20%.
        # NOTE(review): a single 20% shrink does not guarantee the result is
        # under max_size_mb; confirm whether an iterative shrink is wanted.
        needs_resize = True
        new_width = int(width * 0.8)
        new_height = int(height * 0.8)

    if not needs_resize:
        # BUG FIX: the original function had no return on this path, so
        # images already within limits produced None and crashed the
        # caller's `image_bytes, mime_type = resize_image_if_needed(...)`.
        return image_bytes, mime_type

    # Resize the image
    img = img.resize((new_width, new_height), Image.Resampling.LANCZOS)

    # Convert to RGB if necessary (JPEG cannot encode palette/alpha modes;
    # note this also drops alpha for PNG/WEBP, matching original behavior)
    if img.mode not in ('RGB', 'L'):
        img = img.convert('RGB')

    # Determine output format and quality; unknown types fall back to JPEG
    output_format = 'JPEG'
    if mime_type == 'image/png':
        output_format = 'PNG'
    elif mime_type == 'image/webp':
        output_format = 'WEBP'
    elif mime_type == 'image/gif':
        output_format = 'GIF'
    else:
        # Default to JPEG for all other formats
        output_format = 'JPEG'
        mime_type = 'image/jpeg'

    # Save with compression
    output = BytesIO()
    if output_format == 'JPEG':
        img.save(output, format=output_format, quality=85, optimize=True)
    elif output_format == 'PNG':
        img.save(output, format=output_format, optimize=True)
    elif output_format == 'WEBP':
        img.save(output, format=output_format, quality=85)
    else:
        img.save(output, format=output_format)

    return output.getvalue(), mime_type
def convert_heic_to_jpeg(image_bytes: bytes) -> tuple[bytes, str]:
    """
    Convert a HEIC image to JPEG format.

    Args:
        image_bytes: Raw HEIC file contents.

    Returns:
        Tuple of (JPEG-encoded bytes, "image/jpeg").
    """
    source = Image.open(BytesIO(image_bytes))

    # JPEG cannot encode palette or alpha modes; normalise anything that
    # is not already RGB or greyscale.
    if source.mode not in ('RGB', 'L'):
        source = source.convert('RGB')

    # Re-encode as a compressed JPEG in memory.
    buffer = BytesIO()
    source.save(buffer, format='JPEG', quality=85, optimize=True)

    return buffer.getvalue(), "image/jpeg"
@app.post("/describe-image")
async def describe_image(file: UploadFile = File(...)):
    """
    Upload an image and receive a detailed AI-generated description.

    Args:
        file: The uploaded image file (multipart/form-data).

    Returns:
        JSON payload with a success flag, the original filename, the
        generated description, and the model used.

    Raises:
        HTTPException: 400 for unsupported content types; 500 for any
            failure while processing or describing the image.
    """
    # Every content type we accept, HEIC/HEIF included (transcoded below).
    accepted = {
        "image/jpeg", "image/jpg", "image/png", "image/webp",
        "image/gif", "image/heic", "image/heif",
    }

    if file.content_type not in accepted:
        raise HTTPException(
            status_code=400,
            detail="Invalid file type. Allowed types: JPEG, PNG, WebP, GIF, HEIC"
        )

    try:
        # Pull the upload into memory.
        payload = await file.read()
        media_type = file.content_type

        # Claude cannot ingest HEIC/HEIF directly, so transcode to JPEG first.
        if media_type in ("image/heic", "image/heif"):
            payload, media_type = convert_heic_to_jpeg(payload)

        # Keep within the API limits (max 5MB, max dimension 1568px).
        payload, media_type = resize_image_if_needed(payload, media_type)

        # Ask Claude for the description.
        analysis = get_image_description(payload, media_type)

        return JSONResponse(content={
            "success": True,
            "filename": file.filename,
            "description": analysis["description"],
            "model": analysis["model"]
        })

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error processing image: {str(e)}")
+ + +Upload an image to get a detailed AI-powered description
+ ++ Drop your image here +
+or
+ ++ Supported formats: JPEG, PNG, WebP, GIF, HEIC +
+Analyzing your image...
+