diff --git a/tools/media/README.md b/tools/media/README.md new file mode 100644 index 0000000..9d1a795 --- /dev/null +++ b/tools/media/README.md @@ -0,0 +1,123 @@ +# Media Indexer + +A tool for building and maintaining a media index from DA Live's medialog and auditlog APIs. + +## Architecture + +The indexer is now modular for better maintainability and extensibility: + +``` +tools/media/ +├── indexer.html # Entry point HTML +├── indexer.css # Styles +├── indexer.js # Main entry (initialization & auth) +├── indexer-old.js # Backup of monolithic version +└── lib/ + ├── config.js # Configuration & state management + ├── api.js # DA Admin API calls + ├── helpers.js # Utility functions (normalizePath, isPage, etc.) + ├── builder.js # Core index building logic + └── ui.js # UI rendering & event handling +``` + +## Modules + +### `lib/config.js` +- URL parameter parsing (org, repo, ref) +- Global state management +- Constants (DA_ADMIN, sitePath) + +### `lib/api.js` +- `fetchWithAuth()` - Authenticated fetch wrapper +- `daFetch()` - DA API fetch wrapper +- `loadMeta()` - Load metadata from DA +- `createSheet()` - Create sheet format for DA +- `saveMeta()` - Save metadata to DA +- `fetchFromAdminAPI()` - Fetch from auditlog/medialog with pagination + +### `lib/helpers.js` +- `normalizePath()` - Normalize paths (add .md, remove query params) +- `isPage()` - Detect if path is a page vs media file +- `extractName()` - Extract filename from medialog entry +- `detectMediaType()` - Detect media type from contentType + +### `lib/builder.js` +- `getIndexStatus()` - Get current index metadata +- `buildInitialIndex()` - Core indexing logic: + 1. Fetch auditlog entries + 2. Fetch medialog entries + 3. Match media to pages (5-second time window) + 4. Deduplicate by hash + 5. 
Save index to DA + +### `lib/ui.js` +- `render()` - Render UI with status, progress, logs, errors +- `attachEventListeners()` - Handle button clicks + +### `indexer.js` +- Main entry point +- DA SDK authentication +- Initialize UI + +## Index Schema + +Each entry in the media index: + +```javascript +{ + hash: "abc123", // Media hash (unique identifier) + pages: "/page1.md|/page2.md", // Pipe-separated list of pages using this media + url: "https://.../media_abc.jpg", // Full URL to media + name: "photo.jpg", // Filename (extracted from URL) + timestamp: 1771704070155, // Latest usage timestamp + user: "user@example.com", // User who uploaded/used it + operation: "reuse", // Latest operation (ingest/reuse) + type: "img > jpeg", // Media type (category > extension) + status: "referenced" // Status (referenced/unused) +} +``` + +## Indexing Rules + +- **Latest event only:** For each page, use only the latest auditlog event. Skip all others. Multiple events in a batch are sorted by timestamp; only the most recent determines the current page state. + +## Phase 1 (Current) + +✅ Media Bus items (images/videos) from medialog API +✅ Deduplicated by hash +✅ Pipe-separated pages for multi-page usage +✅ Latest usage tracking + +## Phase 2 (Current) + +- Linked content (PDFs, SVGs, fragments) from auditlog +- HTML parsing for usage detection (extractFragmentReferences, extractLinks) +- Source: "auditlog-parsed" +- Index entries: path, usedIn, timestamp, type, status + +## Phase 3 (Future) + +- Streaming architecture for large sites +- Chunked processing +- Memory optimization + +## Usage + +1. Open in browser: `https://main--repo--org.aem.page/tools/media/indexer.html?org=yourorg&repo=yourrepo` +2. Authenticate with DA Live +3. Click "Build Initial Index" +4. 
Index saved to `/.da/mediaindex/media-index.json` + +## Development + +Run linting: +```bash +npm run lint:js +npm run lint:css +``` + +Test locally: +```bash +npx @adobe/aem-cli up +# Open http://localhost:3000/tools/media/indexer.html?org=yourorg&repo=yourrepo +``` diff --git a/tools/media/indexer-old.js b/tools/media/indexer-old.js new file mode 100644 index 0000000..1ca1414 --- /dev/null +++ b/tools/media/indexer-old.js @@ -0,0 +1,507 @@ +/* eslint-disable import/no-absolute-path, import/no-unresolved */ +/* The DA SDK is loaded from the da.live CDN and is required for authentication */ +import DA_SDK from 'https://da.live/nx/utils/sdk.js'; + +// Parse URL parameters +const params = new URLSearchParams(window.location.search); +const org = params.get('org'); +const repo = params.get('repo') || params.get('site'); +const ref = 'main'; +const sitePath = `/${org}/${repo}`; + +const state = { + building: false, + progress: { stage: 'idle', message: '', percent: 0 }, + errors: [], + logs: [], + status: null, + daToken: null, +}; + +async function fetchWithAuth(url, opts = {}) { + opts.headers ||= {}; + if (state.daToken) { + opts.headers.Authorization = `Bearer ${state.daToken}`; + } + return fetch(url, opts); +} + +const DA_ADMIN = 'https://admin.da.live'; + +async function daFetch(url, opts = {}) { + opts.headers ||= {}; + if (state.daToken) { + opts.headers.Authorization = `Bearer ${state.daToken}`; + } + return fetch(url, opts); +} + +async function loadMeta(path) { + try { + const resp = await daFetch(`${DA_ADMIN}/source${path}`); + if (resp.ok) { + const data = await resp.json(); + return data.data?.[0] || data; + } + } catch { + return null; + } + return null; +} + +async function createSheet(data, type = 'sheet') { + const sheetMeta = { + total: data.length, + limit: data.length, + offset: 0, + data, + ':type': type, + }; + const blob = new Blob([JSON.stringify(sheetMeta, null, 2)], { type: 'application/json' }); + const formData = new FormData(); + 
formData.append('data', blob); + return formData; +} + +async function saveMeta(meta, path) { + const metaArray = Array.isArray(meta) ? meta : [meta]; + const formData = await createSheet(metaArray); + return daFetch(`${DA_ADMIN}/source${path}`, { + method: 'POST', + body: formData, + }); +} + +function timestampToDuration(timestamp) { + if (!timestamp) return '90d'; + const ageMs = Date.now() - timestamp; + const days = Math.ceil(ageMs / (24 * 60 * 60 * 1000)); + if (days < 1) { + const hours = Math.ceil(ageMs / (60 * 60 * 1000)); + return hours > 0 ? `${hours}h` : '1h'; + } + return `${Math.min(days, 90)}d`; +} + +async function fetchFromAdminAPI(endpoint, orgName, repoName, refName, since, limit, onPageLoaded) { + const fetchParams = new URLSearchParams(); + fetchParams.append('limit', limit.toString()); + + const sinceDuration = since ? timestampToDuration(since) : '90d'; + fetchParams.append('since', sinceDuration); + + const baseUrl = `https://admin.hlx.page/${endpoint}/${orgName}/${repoName}/${refName}`; + const separator = endpoint === 'medialog' ? 
'/' : ''; + const url = `${baseUrl}${separator}?${fetchParams.toString()}`; + + const resp = await fetchWithAuth(url); + + if (!resp.ok) { + throw new Error(`${endpoint} API error: ${resp.status} ${resp.statusText}`); + } + + const data = await resp.json(); + const entries = data.entries || data.data || []; + const { nextToken } = data; + + if (onPageLoaded && entries.length > 0) { + onPageLoaded(entries, !!nextToken); + } + + async function fetchNextPage(token) { + if (!token) return []; + + fetchParams.set('nextToken', token); + const nextUrl = `${baseUrl}${separator}?${fetchParams.toString()}`; + const nextResp = await fetchWithAuth(nextUrl); + + if (!nextResp.ok) return []; + + const nextData = await nextResp.json(); + const nextEntries = nextData.entries || nextData.data || []; + + if (!nextEntries || nextEntries.length === 0) return []; + + if (onPageLoaded) { + onPageLoaded([...entries, ...nextEntries], !!nextData.nextToken); + } + + const remainingEntries = await fetchNextPage(nextData.nextToken); + return [...nextEntries, ...remainingEntries]; + } + + const additionalEntries = await fetchNextPage(nextToken); + return [...entries, ...additionalEntries]; +} + +/** + * Normalize a path by removing query params/fragments and adding .md for pages + * @param {string} path - The path to normalize + * @returns {string} Normalized path + */ +function normalizePath(path) { + if (!path) return ''; + let cleanPath = path.split('?')[0].split('#')[0]; + // Add .md for pages: /drafts/page -> /drafts/page.md + if (!cleanPath.includes('.') && !cleanPath.startsWith('/media/')) { + cleanPath = `${cleanPath}.md`; + } + return cleanPath; +} + +/** + * Detect if a path represents a page (not a media file or fragment) + * @param {string} path - The path to check + * @returns {boolean} True if path is a page + */ +function isPage(path) { + if (!path || typeof path !== 'string') return false; + return (path.endsWith('.md') + || (!path.includes('.') && !path.startsWith('/media/'))) 
+ && !path.includes('/fragments/'); +} + +/** + * Extract the filename from a medialog entry + * @param {object} mediaEntry - The medialog entry + * @returns {string} The filename without query params or fragments + */ +function extractName(mediaEntry) { + if (!mediaEntry) return ''; + if (mediaEntry.operation === 'ingest' && mediaEntry.originalFilename) { + return mediaEntry.originalFilename.split('/').pop(); + } + if (!mediaEntry.path) return ''; + // Remove query params (?...) and URL fragments (#...) + return mediaEntry.path.split('?')[0].split('#')[0].split('/').pop(); +} + +/** + * Detect media type from contentType in structured format + * @param {object} mediaEntry - The medialog entry + * @returns {string} Type in format "category > extension" + */ +function detectMediaType(mediaEntry) { + const contentType = mediaEntry.contentType || ''; + if (contentType.startsWith('image/')) { + const ext = contentType.split('/')[1]; + return `img > ${ext}`; + } + if (contentType.startsWith('video/')) { + const ext = contentType.split('/')[1]; + return `video > ${ext}`; + } + return 'unknown'; +} + +async function getIndexStatus() { + const metaPath = `${sitePath}/.da/mediaindex/medialog-meta.json`; + const meta = await loadMeta(metaPath); + + return { + lastRefresh: meta?.lastFetchTime || null, + entriesCount: meta?.entriesCount || 0, + }; +} + +async function buildInitialIndex(onProgress) { + const index = []; + + // Phase 1: Fetch auditlog entries + onProgress({ stage: 'fetching', message: 'Fetching auditlog entries...', percent: 10 }); + + const auditlogEntries = await fetchFromAdminAPI('log', org, repo, ref, null, 1000, (entries, hasMore) => { + onProgress({ + stage: 'fetching', + message: `Fetched ${entries.length} auditlog entries${hasMore ? 
' (more available)' : ''}...`, + percent: 20, + }); + }); + + // Separate pages from files (filter out entries with invalid paths) + const validEntries = auditlogEntries.filter((e) => e && e.path); + const pages = validEntries.filter((e) => isPage(e.path)); + const files = validEntries.filter((e) => !isPage(e.path)); + + onProgress({ + stage: 'fetching', + message: `Identified ${pages.length} pages and ${files.length} files from auditlog`, + percent: 30, + }); + + // Phase 2: Fetch medialog entries + onProgress({ stage: 'fetching', message: 'Fetching medialog entries...', percent: 40 }); + + const medialogEntries = await fetchFromAdminAPI('medialog', org, repo, ref, null, 1000, (entries, hasMore) => { + onProgress({ + stage: 'fetching', + message: `Fetched ${entries.length} medialog entries${hasMore ? ' (more available)' : ''}...`, + percent: 50, + }); + }); + + onProgress({ + stage: 'processing', + message: `Processing ${pages.length} pages with ${medialogEntries.length} medialog entries...`, + percent: 60, + }); + + // Phase 3: Build hash map (deduplicate by hash, track all pages) + const hashMap = new Map(); + + // Process page-referenced media + pages.forEach((pageEvent) => { + const normalizedPath = normalizePath(pageEvent.path); + + // Find matching medialog entries within 5-second time window + const pageMedia = medialogEntries.filter((m) => { + if (!m.resourcePath) return false; + if (m.resourcePath !== normalizedPath) return false; + + const TIME_WINDOW_MS = 5000; + return m.timestamp >= pageEvent.timestamp + && m.timestamp < pageEvent.timestamp + TIME_WINDOW_MS; + }); + + // Add to hash map + pageMedia.forEach((media) => { + const hash = media.mediaHash; + if (!hashMap.has(hash)) { + // First time seeing this hash - initialize entry + hashMap.set(hash, { + hash, + pages: new Set([normalizedPath]), + url: media.path, + name: extractName(media), + timestamp: media.timestamp, + user: media.user, + operation: media.operation, + type: detectMediaType(media), + 
status: 'referenced', + }); + } else { + // Hash exists - update with latest info + const entry = hashMap.get(hash); + entry.pages.add(normalizedPath); + + // Keep latest timestamp (since logs are sorted newest first) + if (media.timestamp > entry.timestamp) { + entry.timestamp = media.timestamp; + entry.operation = media.operation; + } + } + }); + }); + + onProgress({ + stage: 'processing', + message: `Processed ${pages.length} pages, found ${hashMap.size} unique media items`, + percent: 70, + }); + + // Phase 4: Process standalone uploads (not on any page yet) + const standaloneUploads = medialogEntries.filter((m) => !m.resourcePath && m.originalFilename); + + standaloneUploads.forEach((media) => { + const hash = media.mediaHash; + if (!hashMap.has(hash)) { + // Only add if not already referenced on a page + hashMap.set(hash, { + hash, + pages: new Set(), + url: media.path, + name: media.originalFilename.split('/').pop(), + timestamp: media.timestamp, + user: media.user, + operation: media.operation, + type: detectMediaType(media), + status: 'unused', + }); + } + }); + + onProgress({ + stage: 'processing', + message: `Added ${standaloneUploads.length} standalone uploads, total unique: ${hashMap.size}`, + percent: 80, + }); + + // Convert Map to array with pipe-separated pages + hashMap.forEach((entry) => { + index.push({ + hash: entry.hash, + pages: Array.from(entry.pages).join('|'), + url: entry.url, + name: entry.name, + timestamp: entry.timestamp, + user: entry.user, + operation: entry.operation, + type: entry.type, + status: entry.status, + }); + }); + + // Phase 5: Save index + onProgress({ stage: 'saving', message: `Saving ${index.length} entries...`, percent: 90 }); + + const indexPath = `${sitePath}/.da/mediaindex/media-index.json`; + const formData = await createSheet(index); + await daFetch(`${DA_ADMIN}/source${indexPath}`, { + method: 'POST', + body: formData, + }); + + await saveMeta({ + lastFetchTime: Date.now(), + entriesCount: index.length, + 
lastRefreshBy: 'media-indexer', + }, `${sitePath}/.da/mediaindex/medialog-meta.json`); + + onProgress({ stage: 'complete', message: `Complete! ${index.length} entries indexed`, percent: 100 }); + + return { entriesCount: index.length }; +} + +function render() { + const app = document.getElementById('app'); + + const statusHtml = state.status ? ` +
+

Current Index Status

+
+
+ + ${state.status.lastRefresh ? new Date(state.status.lastRefresh).toLocaleString() : 'Never'} +
+
+ + ${state.status.entriesCount || 0} +
+
+
+ ` : '
Checking status...
'; + + const progressHtml = state.building || state.progress.stage !== 'idle' ? ` +
+

Progress

+
+
+
+
+ ${state.progress.stage} + ${state.progress.message} +
+
+ ` : ''; + + const logsHtml = state.logs.length > 0 ? ` +
+

Logs (${state.logs.length})

+ +
+ ` : ''; + + const errorsHtml = state.errors.length > 0 ? ` +
+

Errors (${state.errors.length})

+ +
+ ` : ''; + + app.innerHTML = ` +

Media Index Builder

+

Building index for: ${org}/${repo}

+ + ${statusHtml} + +
+ +
+ + ${progressHtml} + ${errorsHtml} + ${logsHtml} + `; +} + +function attachEventListeners() { + if (!state.building) { + const buildBtn = document.getElementById('buildBtn'); + if (buildBtn) { + buildBtn.addEventListener('click', () => { + state.building = true; + state.errors = []; + state.logs = []; + state.progress = { stage: 'starting', message: 'Starting build...', percent: 0 }; + render(); + + buildInitialIndex((progress) => { + state.progress = progress; + state.logs.push({ message: progress.message, type: 'info' }); + render(); + }) + .then((result) => { + state.logs.push({ message: `Index built successfully: ${result.entriesCount} entries`, type: 'success' }); + return getIndexStatus(); + }) + .then((status) => { + state.status = status; + }) + .catch((error) => { + state.errors.push({ message: error.message }); + state.logs.push({ message: `Error: ${error.message}`, type: 'error' }); + state.progress = { stage: 'error', message: error.message, percent: 0 }; + }) + .finally(() => { + state.building = false; + render(); + attachEventListeners(); + }); + }); + } + } +} + +async function init() { + if (!org || !repo) { + document.getElementById('app').innerHTML = ` +
+

Missing Parameters

+

Please provide org and repo parameters in the URL:

+
?org=yourorg&repo=yourrepo
+
+ `; + return; + } + + // Get DA token with timeout + try { + const tokenPromise = DA_SDK; + const timeoutPromise = new Promise((_, reject) => { + setTimeout(() => reject(new Error('Authentication timeout')), 5000); + }); + + const result = await Promise.race([tokenPromise, timeoutPromise]); + state.daToken = result?.token; + } catch (error) { + state.errors.push({ message: `Failed to get DA token: ${error.message}` }); + } + + if (!state.daToken) { + const returnUrl = encodeURIComponent(window.location.href); + window.location.href = `https://da.live/?returnUrl=${returnUrl}`; + return; + } + + state.status = await getIndexStatus(); + render(); + attachEventListeners(); +} + +init(); diff --git a/tools/media/indexer.css b/tools/media/indexer.css new file mode 100644 index 0000000..36a1647 --- /dev/null +++ b/tools/media/indexer.css @@ -0,0 +1,315 @@ +/* ========== Base & Variables ========== */ + +:root { + /* Gray Scale */ + --s2-gray-50: #f9fafb; + --s2-gray-100: #f3f4f6; + --s2-gray-200: #e5e7eb; + --s2-gray-300: #d1d5db; + --s2-gray-400: #9ca3af; + --s2-gray-500: #6b7280; + --s2-gray-600: #4b5563; + --s2-gray-700: #374151; + --s2-gray-900: #111827; + + /* Blue Scale */ + --s2-blue-50: #eff6ff; + --s2-blue-100: #dbeafe; + --s2-blue-200: #bfdbfe; + --s2-blue-300: #93c5fd; + --s2-blue-500: #3b82f6; + --s2-blue-600: #2563eb; + --s2-blue-700: #1d4ed8; + --s2-blue-900: #1e3a8a; + + /* Green Scale */ + --s2-green-100: rgb(215 247 225); + --s2-green-900: #065f46; + + /* Red Scale */ + --s2-red-100: rgb(255 214 209); + --s2-red-700: #991b1b; + + /* Spacing */ + --spacing-100: 4px; + --spacing-200: 8px; + --spacing-300: 12px; + --spacing-400: 16px; + --spacing-500: 24px; + --spacing-600: 32px; + --spacing-700: 40px; + + /* Border Radius */ + --s2-radius-100: 4px; + --s2-radius-200: 8px; + --s2-radius-300: 18px; + + /* Typography */ + --body-font-family: 'Adobe Clean', adobe-clean, 'Trebuchet MS', sans-serif; + --mono-font-family: 'Roboto Mono', menlo, consolas, 
'Liberation Mono', monospace; + --s2-font-size-200: 14px; + --s2-font-size-300: 16px; + --s2-font-size-400: 16px; + --s2-font-size-600: 24px; + --s2-font-size-700: 32px; +} + +* { + box-sizing: border-box; +} + +body { + font-family: var(--body-font-family); + color: var(--s2-gray-900); + line-height: 1.6; + margin: 0; + padding: 0; + background: var(--s2-gray-50); +} + +#app { + max-width: 1200px; + margin: var(--spacing-700) auto; + padding: 0 var(--spacing-400); +} + +h1 { + font-size: var(--s2-font-size-700); + font-weight: 700; + line-height: 1.2; + margin: 0 0 var(--spacing-200); + color: var(--s2-gray-900); +} + +h2 { + font-size: 20px; + font-weight: 600; + margin: 0 0 var(--spacing-400); + color: var(--s2-gray-900); +} + +h3 { + font-size: 18px; + font-weight: 600; + margin: 0 0 var(--spacing-300); + color: var(--s2-gray-900); +} + +p { + font-size: var(--s2-font-size-400); + color: var(--s2-gray-600); + margin: 0 0 var(--spacing-600); +} + +/* ========== Status Panel ========== */ + +.status-panel { + background: white; + border: 1px solid var(--s2-gray-200); + border-radius: var(--s2-radius-200); + padding: var(--spacing-500); + margin-bottom: var(--spacing-600); +} + +.status-loading { + padding: var(--spacing-600); + text-align: center; + color: var(--s2-gray-600); + background: white; + border: 1px solid var(--s2-gray-200); + border-radius: var(--s2-radius-200); + margin-bottom: var(--spacing-600); +} + +.status-grid { + display: grid; + grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); + gap: var(--spacing-500); +} + +.status-item { + display: flex; + flex-direction: column; + gap: var(--spacing-100); +} + +.status-item label { + font-size: 11px; + font-weight: 600; + text-transform: uppercase; + color: var(--s2-gray-400); + letter-spacing: 1px; +} + +.status-item span { + font-size: var(--s2-font-size-300); + color: var(--s2-gray-900); + font-weight: 500; +} + +/* ========== Actions ========== */ + +.actions { + margin-bottom: 
var(--spacing-600); +} + +button, +.button { + font-family: var(--body-font-family); + font-size: 15px; + font-weight: 700; + padding: 8px 24px; + line-height: 18px; + border: 2px solid #000; + color: #000; + border-radius: var(--s2-radius-300); + background: none; + cursor: pointer; + transition: all 0.2s; + text-align: center; +} + +button:disabled { + background-color: #efefef; + border: 2px solid #efefef; + color: var(--s2-gray-700); + cursor: not-allowed; +} + +button:hover:not(:disabled) { + background: var(--s2-gray-100); +} + +button.accent, +.btn-primary { + background: #3b63fb; + border: 2px solid #3b63fb; + color: #fff; +} + +button.accent:hover:not(:disabled), +.btn-primary:hover:not(:disabled) { + background: #2952e8; + border: 2px solid #2952e8; +} + +/* ========== Progress Section ========== */ + +.progress-section { + background: white; + border: 1px solid var(--s2-gray-200); + border-radius: var(--s2-radius-200); + padding: var(--spacing-500); + margin-bottom: var(--spacing-600); +} + +.progress-bar { + width: 100%; + height: 8px; + background: var(--s2-gray-200); + border-radius: var(--s2-radius-100); + overflow: hidden; + margin-bottom: var(--spacing-300); +} + +.progress-fill { + height: 100%; + background: #3b63fb; + transition: width 0.3s ease; +} + +.progress-info { + display: flex; + gap: var(--spacing-300); + align-items: center; + font-size: var(--s2-font-size-200); +} + +.progress-stage { + font-weight: 600; + color: var(--s2-gray-900); + text-transform: capitalize; +} + +.progress-message { + color: var(--s2-gray-600); + flex: 1; +} + +.progress-timing { + display: flex; + gap: var(--spacing-500); + margin-top: var(--spacing-200); + font-size: var(--s2-font-size-200); + color: var(--s2-gray-500); +} + +/* ========== Logs Section ========== */ + +.logs-section, +.errors-section { + background: white; + border: 1px solid var(--s2-gray-200); + border-radius: var(--s2-radius-200); + padding: var(--spacing-500); + margin-bottom: 
var(--spacing-600); +} + +.logs-list, +.errors-list { + list-style: none; + margin: 0; + padding: 0; + max-height: 400px; + overflow-y: auto; +} + +.logs-list li, +.errors-list li { + padding: var(--spacing-200) var(--spacing-300); + margin-bottom: var(--spacing-100); + border-radius: var(--s2-radius-100); + font-size: 13px; + font-family: var(--mono-font-family); +} + +.log-info { + background: var(--s2-gray-100); + color: var(--s2-gray-900); +} + +.log-success { + background: var(--s2-green-100); + color: var(--s2-green-900); +} + +.log-error, +.errors-list li { + background: var(--s2-red-100); + color: var(--s2-red-700); +} + +/* ========== Error Page ========== */ + +.error { + background: white; + border: 1px solid var(--s2-gray-200); + border-radius: var(--s2-radius-200); + padding: var(--spacing-700); + text-align: center; +} + +.error h1 { + color: var(--s2-red-700); +} + +.error pre { + background: var(--s2-gray-100); + padding: var(--spacing-400); + border-radius: var(--s2-radius-100); + text-align: left; + display: inline-block; + margin-top: var(--spacing-400); + font-family: var(--mono-font-family); + font-size: 13px; +} diff --git a/tools/media/indexer.html b/tools/media/indexer.html new file mode 100644 index 0000000..1c75061 --- /dev/null +++ b/tools/media/indexer.html @@ -0,0 +1,14 @@ + + + + + + Media Index Builder + + + + +
+ + + diff --git a/tools/media/indexer.js b/tools/media/indexer.js new file mode 100644 index 0000000..ede955e --- /dev/null +++ b/tools/media/indexer.js @@ -0,0 +1,60 @@ +/* eslint-disable import/no-absolute-path, import/no-unresolved */ +/* The DA SDK is loaded from the da.live CDN and is required for authentication */ +import DA_SDK from 'https://da.live/nx/utils/sdk.js'; + +import { state, org, repo } from './lib/config.js'; +import { getIndexStatus } from './lib/builder.js'; +import { render, attachEventListeners } from './lib/ui.js'; + +/** Constants */ +const AUTH_TIMEOUT_MS = 5000; // Timeout for DA authentication + +async function init() { + if (!org || !repo) { + const params = new URLSearchParams(window.location.search); + const rawOrg = params.get('org'); + const rawRepo = params.get('repo') || params.get('site'); + + let errorMsg = '

Please provide valid org and repo parameters in the URL:

'; + if (!rawOrg || !rawRepo) { + errorMsg += '

Missing required parameters.

'; + } else { + errorMsg += '

Invalid parameter format. Names must be alphanumeric with optional hyphens, underscores, or dots.

'; + } + + document.getElementById('app').innerHTML = ` +
+

Configuration Error

+ ${errorMsg} +
?org=yourorg&repo=yourrepo
+

Example: ?org=mycompany&repo=myproject

+
+ `; + return; + } + + // Get DA token with timeout + try { + const tokenPromise = DA_SDK; + const timeoutPromise = new Promise((_, reject) => { + setTimeout(() => reject(new Error('Authentication timeout')), AUTH_TIMEOUT_MS); + }); + + const result = await Promise.race([tokenPromise, timeoutPromise]); + state.daToken = result?.token; + } catch (error) { + state.errors.push({ message: `Failed to get DA token: ${error.message}` }); + } + + if (!state.daToken) { + const returnUrl = encodeURIComponent(window.location.href); + window.location.href = `https://da.live/?returnUrl=${returnUrl}`; + return; + } + + state.status = await getIndexStatus(); + render(); + attachEventListeners(); +} + +init(); diff --git a/tools/media/lib/api.js b/tools/media/lib/api.js new file mode 100644 index 0000000..f916f8c --- /dev/null +++ b/tools/media/lib/api.js @@ -0,0 +1,314 @@ +/** + * DA Admin API functions for fetching logs and saving data + */ + +import { + state, DA_ADMIN, org, repo, ref, +} from './config.js'; +import * as logger from './logger.js'; + +/** Constants */ +const RATE_LIMIT_DELAY_MS = 100; // Delay between paginated API requests + +/** + * Fetch with DA authentication token + * @param {string} url - URL to fetch + * @param {object} opts - Fetch options + * @returns {Promise} Fetch response + */ +export async function daFetch(url, opts = {}) { + opts.headers ||= {}; + if (state.daToken) { + opts.headers.Authorization = `Bearer ${state.daToken}`; + } + return fetch(url, opts); +} + +/** CORS proxy for cross-origin fetches (same as media-library block) */ +const CORS_PROXY_URL = 'https://media-library-cors-proxy.aem-poc-lab.workers.dev/'; + +/** + * Fetch with CORS proxy. Uses proxy first when cross-origin (e.g. localhost → aem.page) + * to avoid CORS errors; direct fetch when same-origin. 
+ * @param {string} url - URL to fetch + * @param {object} options - Fetch options + * @returns {Promise} Fetch response + */ +async function fetchWithCorsProxy(url, options = {}) { + const targetOrigin = url.startsWith('http') ? new URL(url).origin : null; + const isCrossOrigin = targetOrigin && window.location.origin !== targetOrigin; + + if (isCrossOrigin) { + const proxyUrl = `${CORS_PROXY_URL}?url=${encodeURIComponent(url)}`; + return fetch(proxyUrl, options); + } + + try { + const response = await fetch(url, options); + if (!response.ok) { + const proxyUrl = `${CORS_PROXY_URL}?url=${encodeURIComponent(url)}`; + return fetch(proxyUrl, options); + } + return response; + } catch (directError) { + if (directError.name === 'TypeError' + && (directError.message.includes('CORS') + || directError.message.includes('blocked') + || directError.message.includes('Access-Control-Allow-Origin') + || directError.message.includes('Failed to fetch'))) { + const proxyUrl = `${CORS_PROXY_URL}?url=${encodeURIComponent(url)}`; + return fetch(proxyUrl, options); + } + throw directError; + } +} + +export async function loadMeta(path) { + try { + const resp = await daFetch(`${DA_ADMIN}/source${path}`); + if (resp.ok) { + const data = await resp.json(); + return data.data?.[0] || data; + } + } catch (error) { + logger.error(`Failed to load meta from ${path}:`, error.message); + return null; + } + return null; +} + +/** + * Fetch page markdown from preview URL (org, repo from query params). + * Uses CORS proxy fallback when direct fetch fails (e.g. cross-origin). + * @param {string} pagePath - Path e.g. /drafts/page.md + * @returns {Promise} - Raw markdown or null + */ +export async function fetchPageMarkdown(pagePath) { + try { + if (!org || !repo) return null; + const path = pagePath.startsWith('/') ? 
pagePath : `/${pagePath}`; + const url = `https://${ref}--${repo}--${org}.aem.page${path}`; + const resp = await fetchWithCorsProxy(url); + if (!resp.ok) return null; + return resp.text(); + } catch (error) { + logger.error(`Failed to fetch page markdown ${pagePath}:`, error.message); + return null; + } +} + +/** + * Load media-index.json from DA (sheet format). + * @param {string} path - Path to media-index.json + * @returns {Promise} - Array of index entries, or [] if not found + */ +export async function loadIndex(path) { + try { + const resp = await daFetch(`${DA_ADMIN}/source${path}`); + if (!resp.ok) return []; + const data = await resp.json(); + const entries = data.data || []; + return Array.isArray(entries) ? entries : []; + } catch (error) { + logger.error(`Failed to load index from ${path}:`, error.message); + return []; + } +} + +/** + * List children of a DA path using the DA Admin List API. + * Returns array of items; each item may have path, name, ext, props (with lastModified). + * @param {string} path - Path within org/repo (e.g. /.da/mediaindex) + * @returns {Promise>} + */ +export async function daList(path) { + const normalizedPath = path.replace(/^\//, '') || ''; + const url = `${DA_ADMIN}/list/${org}/${repo}/${normalizedPath}`; + const resp = await daFetch(url); + if (!resp.ok) return []; + const data = await resp.json(); + const items = Array.isArray(data) ? data : (data.sources || []); + return items; +} + +/** + * Get media-index.json info from DA Admin List API (not Franklin Admin API). + * Uses DA List API since the index is stored in DA. + * @param {string} folderPath - Path to mediaindex folder within repo (e.g. 
.da/mediaindex) + * @returns {Promise<{exists: boolean, lastModified: number|null}>} + */ +export async function getMediaIndexInfo(folderPath = '.da/mediaindex') { + const items = await daList(folderPath); + const indexFile = items.find( + (item) => (item.name === 'media-index' && item.ext === 'json') + || (item.path && item.path.endsWith('/media-index.json')), + ); + if (!indexFile) return { exists: false, lastModified: null }; + // DA List API: lastModified is Unix timestamp (ms) on item (docs.da.live/developers/api/list) + const lastMod = indexFile.lastModified ?? indexFile.props?.lastModified; + const ts = lastMod != null && typeof lastMod === 'number' ? lastMod : null; + return { exists: true, lastModified: ts }; +} + +export async function createSheet(data, type = 'sheet') { + const sheetMeta = { + total: data.length, + limit: data.length, + offset: 0, + data, + ':type': type, + }; + const blob = new Blob([JSON.stringify(sheetMeta, null, 2)], { type: 'application/json' }); + const formData = new FormData(); + formData.append('data', blob); + return formData; +} + +export async function saveMeta(meta, path) { + const metaArray = Array.isArray(meta) ? meta : [meta]; + const formData = await createSheet(metaArray); + return daFetch(`${DA_ADMIN}/source${path}`, { + method: 'POST', + body: formData, + }); +} + +function timestampToDuration(timestamp) { + if (!timestamp) return '90d'; + const ageMs = Date.now() - timestamp; + const days = Math.ceil(ageMs / (24 * 60 * 60 * 1000)); + if (days < 1) { + const hours = Math.ceil(ageMs / (60 * 60 * 1000)); + return hours > 0 ? `${hours}h` : '1h'; + } + return `${Math.min(days, 90)}d`; +} + +export async function fetchFromAdminAPI( + endpoint, + orgName, + repoName, + refName, + since, + limit, + onPageLoaded, +) { + const fetchParams = new URLSearchParams(); + fetchParams.append('limit', limit.toString()); + + // API default (no since) = from=now-15min, to=now. For initial index use max span. 
+ const sinceDuration = since != null ? timestampToDuration(since) : '36500d'; + fetchParams.append('since', sinceDuration); + + const baseUrl = `https://admin.hlx.page/${endpoint}/${orgName}/${repoName}/${refName}`; + const separator = endpoint === 'medialog' ? '/' : ''; + const url = `${baseUrl}${separator}?${fetchParams.toString()}`; + + const resp = await daFetch(url); + + if (!resp.ok) { + throw new Error(`${endpoint} API error: ${resp.status} ${resp.statusText}`); + } + + const data = await resp.json(); + const entries = data.entries || data.data || []; + const { nextToken } = data; + + if (onPageLoaded && entries.length > 0) { + onPageLoaded(entries, !!nextToken); + } + + async function fetchNextPage(token) { + if (!token) return []; + + fetchParams.set('nextToken', token); + const nextUrl = `${baseUrl}${separator}?${fetchParams.toString()}`; + const nextResp = await daFetch(nextUrl); + + if (!nextResp.ok) return []; + + const nextData = await nextResp.json(); + const nextEntries = nextData.entries || nextData.data || []; + + if (onPageLoaded && nextEntries?.length > 0) { + onPageLoaded(nextEntries, !!nextData.nextToken); + } + + const remainingEntries = nextData.nextToken + ? await fetchNextPage(nextData.nextToken) + : []; + return [...(nextEntries || []), ...remainingEntries]; + } + + const additionalEntries = await fetchNextPage(nextToken); + return [...entries, ...additionalEntries]; +} + +/** Delay helper for rate limiting */ +function sleep(ms) { + return new Promise((resolve) => { + setTimeout(resolve, ms); + }); +} + +/** + * Stream fetch from Admin API - yields chunks to onChunk, does not accumulate in memory. + * @param {string} endpoint - 'log' or 'medialog' + * @param {string} orgName - Org + * @param {string} repoName - Repo + * @param {string} refName - Ref (e.g. 
'main')
 * @param {number|null} since - Timestamp for incremental, or null for full
 * @param {number} limit - Page size
 * @param {Function} onChunk - (entries: Array<object>) => void|Promise<void> - called per chunk
 */
export async function fetchFromAdminAPIStreaming(
  endpoint,
  orgName,
  repoName,
  refName,
  since,
  limit,
  onChunk,
) {
  const fetchParams = new URLSearchParams();
  fetchParams.append('limit', limit.toString());
  const sinceDuration = since != null ? timestampToDuration(since) : '36500d';
  fetchParams.append('since', sinceDuration);

  const baseUrl = `https://admin.hlx.page/${endpoint}/${orgName}/${repoName}/${refName}`;
  // medialog requires a trailing slash before the query string; log does not.
  const separator = endpoint === 'medialog' ? '/' : '';
  let nextUrl = `${baseUrl}${separator}?${fetchParams.toString()}`;

  /* eslint-disable no-await-in-loop -- sequential fetch required for pagination */
  while (nextUrl) {
    const resp = await daFetch(nextUrl);

    if (!resp.ok) {
      throw new Error(`${endpoint} API error: ${resp.status} ${resp.statusText}`);
    }

    const data = await resp.json();
    const entries = data.entries || data.data || [];

    // Hand each page to the caller immediately; awaiting lets the consumer
    // apply backpressure (e.g. flush to storage) before the next request.
    if (entries.length > 0 && onChunk) {
      await onChunk(entries);
    }

    // Pagination: prefer an absolute/relative links.next URL when present,
    // otherwise fall back to nextToken appended to the original params.
    const nextLink = data.links?.next;
    const token = data.nextToken;
    logger.debug(`[${endpoint}] page: ${entries.length} entries | response keys: ${Object.keys(data).join(', ')} | nextToken=${token ?? 'null'} | links.next=${nextLink ?? 'null'}`);

    if (nextLink && typeof nextLink === 'string' && nextLink.trim()) {
      const base = `${baseUrl}${separator}`;
      nextUrl = nextLink.startsWith('http') ? nextLink : new URL(nextLink, base).href;
    } else if (token) {
      fetchParams.set('nextToken', token);
      nextUrl = `${baseUrl}${separator}?${fetchParams.toString()}`;
    } else {
      nextUrl = null;
    }

    // Throttle between pages only when another page is pending.
    if (nextUrl) await sleep(RATE_LIMIT_DELAY_MS);
  }
  /* eslint-enable no-await-in-loop */
}
diff --git a/tools/media/lib/builder.js b/tools/media/lib/builder.js
new file mode 100644
index 0000000..7457bbb
--- /dev/null
+++ b/tools/media/lib/builder.js
@@ -0,0 +1,880 @@
/**
 * Core index building logic
 */

import {
  org, repo, ref, sitePath, DA_ADMIN,
} from './config.js';
import {
  fetchFromAdminAPI, fetchFromAdminAPIStreaming, createSheet, daFetch, saveMeta, loadMeta,
  loadIndex, getMediaIndexInfo, fetchPageMarkdown,
} from './api.js';
import {
  normalizePath, isPage, extractName, detectMediaType,
  isPdf, isSvg, isFragment, isPdfOrSvg, getFileType,
  isLinkedContentPath, normalizeFilePath,
  extractFragmentReferences, extractLinks, extractIconReferences,
} from './helpers.js';
import * as logger from './logger.js';

/** Constants */
// 2 minutes tolerance for index/meta alignment
const INDEX_ALIGNMENT_TOLERANCE_MS = 120_000;
// 5s window for matching media to page events (full build)
const MEDIA_ASSOCIATION_WINDOW_MS = 5000;
// 10s window for incremental media updates
const INCREMENTAL_WINDOW_MS = 10000;
// Default page size for Admin API requests
const API_PAGE_SIZE = 1000;
// Max concurrent page markdown fetches to avoid overwhelming browser/server
const MAX_CONCURRENT_FETCHES = 10;

/**
 * Report the current state of the media index for the UI.
 * Combines the builder's own meta sheet with the DA List API's view of
 * media-index.json so staleness can be detected.
 * @returns {Promise<{lastRefresh: number|null, entriesCount: number,
 *   lastBuildMode: string|null, indexExists: boolean,
 *   indexLastModified: number|null}>}
 */
export async function getIndexStatus() {
  const metaPath = `${sitePath}/.da/mediaindex/medialog-meta.json`;
  const meta = await loadMeta(metaPath);
  const { exists: indexExists, lastModified: indexLastModified } = await getMediaIndexInfo('.da/mediaindex');

  return {
    lastRefresh: meta?.lastFetchTime || null,
    entriesCount: meta?.entriesCount || 0,
    lastBuildMode: meta?.lastBuildMode || null,
    indexExists,
    indexLastModified,
  };
}

/**
 * Determine if we can do incremental re-index instead of full build.
 * Re-index when: meta has lastFetchTime, index exists, and index lastModified aligns with meta.
 * @returns {Promise<{shouldReindex: boolean, reason?: string}>}
 */
export async function shouldReindex() {
  const metaPath = `${sitePath}/.da/mediaindex/medialog-meta.json`;
  const meta = await loadMeta(metaPath);
  const { exists: indexExists, lastModified: indexLastModified } = await getMediaIndexInfo('.da/mediaindex');

  if (!meta?.lastFetchTime) {
    return { shouldReindex: false, reason: 'No previous fetch (meta missing lastFetchTime)' };
  }
  if (!indexExists) {
    return { shouldReindex: false, reason: 'Index file does not exist in DA' };
  }
  if (indexLastModified == null) {
    return { shouldReindex: false, reason: 'DA List API did not return lastModified for media-index.json' };
  }

  // Index and meta must have been written together (within tolerance);
  // otherwise the index may be from a different build and a full rebuild is safer.
  const lastFetch = meta.lastFetchTime;
  const diff = Math.abs(lastFetch - indexLastModified);
  if (diff > INDEX_ALIGNMENT_TOLERANCE_MS) {
    return {
      shouldReindex: false,
      reason: `Index lastModified (${indexLastModified}) does not align with meta lastFetchTime (${lastFetch})`,
    };
  }

  return { shouldReindex: true };
}

/**
 * Execute async tasks with concurrency limit
 * Standard promise-pool: start tasks eagerly, track in-flight promises,
 * and await Promise.race once the pool is full. Results keep input order.
 * @param {Array} items - Items to process
 * @param {Function} fn - Async function to execute per item; receives (item, index)
 * @param {number} concurrency - Max concurrent operations
 * @returns {Promise<Array>} Results in order
 */
async function processConcurrently(items, fn, concurrency) {
  const results = [];
  const executing = [];

  for (let i = 0; i < items.length; i += 1) {
    const item = items[i];
    // Promise.resolve().then(...) defers fn so a synchronous throw rejects
    // the promise instead of escaping the loop.
    const promise = Promise.resolve().then(() => fn(item, i));
    results.push(promise);

    // NOTE(review): when concurrency > items.length no throttling is applied
    // at all — presumably intentional (pool can never fill); confirm.
    if (concurrency <= items.length) {
      const executingPromise = promise.then(() => {
        // Remove self from the in-flight set on settle.
        executing.splice(executing.indexOf(executingPromise), 1);
      });
      executing.push(executingPromise);

      if (executing.length >= concurrency) {
        // eslint-disable-next-line no-await-in-loop
        await Promise.race(executing);
      }
    }
  }

  return Promise.all(results);
}

/**
 * Build usage map for linked content (PDFs, SVGs, fragments).
 * Fetches .md from preview URL and parses markdown link syntax.
 * @param {Array<{path: string}>} pageEntries - Auditlog entries for pages
 * @param {Function} onProgress - Progress callback
 * @returns {Promise<{pdfs: Map, svgs: Map, fragments: Map}>}
 */
async function buildContentUsageMap(pageEntries, onProgress) {
  const usageMap = {
    fragments: new Map(),
    pdfs: new Map(),
    svgs: new Map(),
  };

  // Group auditlog events by normalized page path, newest first per page.
  const pagesByPath = new Map();
  pageEntries.forEach((e) => {
    const p = normalizePath(e.path);
    if (!pagesByPath.has(p)) pagesByPath.set(p, []);
    pagesByPath.get(p).push(e);
  });
  pagesByPath.forEach((events) => {
    events.sort((a, b) => b.timestamp - a.timestamp);
  });

  const uniquePages = [...pagesByPath.keys()];
  logger.debug(`[buildContentUsageMap] parsing ${uniquePages.length} unique pages: [${uniquePages.slice(0, 10).join(', ')}${uniquePages.length > 10 ? '...' : ''}]`);

  // Fetch each page's markdown with bounded concurrency; failures yield md=null.
  const results = await processConcurrently(
    uniquePages,
    async (normalizedPath, i) => {
      onProgress?.({ message: `Parsing page ${i + 1}/${uniquePages.length}: ${normalizedPath}` });
      const md = await fetchPageMarkdown(normalizedPath);
      return { normalizedPath, md };
    },
    MAX_CONCURRENT_FETCHES,
  );

  const failed = results.filter((r) => !r.md);
  if (failed.length > 0) {
    logger.warn(`[buildContentUsageMap] failed to fetch markdown for ${failed.length} pages: [${failed.map((r) => r.normalizedPath).join(', ')}]`);
  }

  // Invert: for each referenced asset path, record the pages that use it.
  results.forEach(({ normalizedPath, md }) => {
    if (!md) return;

    const fragments = extractFragmentReferences(md);
    const pdfs = extractLinks(md, /\.pdf$/);
    const svgs = extractLinks(md, /\.svg$/);
    const icons = extractIconReferences(md);

    const addToMap = (map, path) => {
      if (!map.has(path)) map.set(path, []);
      if (!map.get(path).includes(normalizedPath)) {
        map.get(path).push(normalizedPath);
      }
    };

    fragments.forEach((f) => addToMap(usageMap.fragments, f));
    pdfs.forEach((p) => addToMap(usageMap.pdfs, p));
    svgs.forEach((s) => addToMap(usageMap.svgs, s));
    // Icon references are SVGs too; they share the svgs map.
    icons.forEach((s) => addToMap(usageMap.svgs, s));
  });

  const iconPathsFromUsage = [...usageMap.svgs.keys()].filter((p) => p.includes('/icons/'));
  logger.debug(`[buildContentUsageMap] usageMap: pdfs=${usageMap.pdfs.size}, svgs=${usageMap.svgs.size}, fragments=${usageMap.fragments.size} | icon paths from parsing: [${iconPathsFromUsage.join(', ') || 'none'}]`);

  return usageMap;
}

/** No-op placeholder callback. */
function noop() {}

/**
 * Find page events matching media within time window
 * Matches media to page events that occurred BEFORE media timestamp within window
 * Time window: MEDIA_ASSOCIATION_WINDOW_MS (5s for full build)
 * Rationale: Media operations typically follow page preview within seconds
 * Example: Page preview at T, media upload at T+2s → matched (within 5s window)
 * Edge case: Media uploaded, then page previewed → not matched (preview must come first)
 * @param {Map} pagesByPath - Map of normalized path to page events
 * @param {string} resourcePath - Media resource path
 * @param {number} mediaTimestamp - Media operation timestamp
 * @returns {Array} Matching page events
 */
function findMatchingPageEvents(pagesByPath, resourcePath, mediaTimestamp) {
  const events = pagesByPath.get(resourcePath);
  if (!events || events.length === 0) return [];
  const minTs = mediaTimestamp - MEDIA_ASSOCIATION_WINDOW_MS;
  // Half-open window (minTs, mediaTimestamp]: page event precedes the media op.
  return events.filter(
    (e) => e.timestamp <= mediaTimestamp && e.timestamp > minTs,
  );
}

/** Check memory (Chrome/Edge); returns { warning, usedMB, limitMB } or { warning: false } */
function checkMemory() {
  // performance.memory is a non-standard Chromium API; absent elsewhere.
  if (typeof performance !== 'undefined' && performance.memory) {
    const used = performance.memory.usedJSHeapSize / (1024 * 1024);
    const limit = performance.memory.jsHeapSizeLimit / (1024 * 1024);
    // Warn when the heap crosses 80% of the browser-imposed limit.
    return { warning: used > limit * 0.8, usedMB: used, limitMB: limit };
  }
  return { warning: false };
}

/**
 * Remove media entry from index; handle orphaned media
 * Strategy: If removing last reference to a hash, mark as "unused" vs deleting
 * Exception: Don't add "unused" if medialog has explicit "delete" for this hash
 * Rationale: Media files persist in storage when unreferenced; track for cleanup
 * Example: Media on 2 pages, remove from 1 → still referenced
 *          Remove from both → becomes "unused"
 * @param {Array} idx - Index array
 * @param {object} entry - Entry to remove
 * @param {string} path - Page path
 * @param {Array} medialog - Medialog entries for delete detection
 * @returns {number} removed count (0 or 1)
 */
function removeMediaMaybeAddOrphan(idx, entry, path, medialog) {
  const i = idx.findIndex((e) => e.hash === entry.hash && e.page === path);
  if (i === -1) return 0;
  const { hash } = entry;
  const hasDelete = medialog.some((m) => m.mediaHash === hash && m.operation === 'delete');
  idx.splice(i, 1);
  const stillHasEntry =
idx.some((e) => e.hash === hash);
  const alreadyUnused = idx.some((e) => e.hash === hash && !e.page);
  if (!stillHasEntry && !hasDelete && !alreadyUnused) {
    // Last reference gone and no explicit delete: keep the media visible
    // as an "unused" row (empty page) so cleanup tooling can find it.
    idx.push({
      hash,
      page: '',
      url: entry.url,
      name: entry.name,
      timestamp: entry.timestamp,
      user: entry.user,
      operation: entry.operation,
      type: entry.type,
      status: 'unused',
    });
  }
  return 1;
}

/**
 * Create a linked-content index entry using the same schema as media entries
 * so the DA sheet stores all rows correctly (no column misalignment).
 * The file path doubles as the unique `hash` key for linked content.
 * @param {string} filePath - Path e.g. /icons/headset.svg
 * @param {string[]} linkedPages - Pages that reference this file
 * @param {{timestamp: number, user?: string}} fileEvent - Auditlog event
 * @param {string} status - 'referenced' or 'file-unused'
 * @returns {object} Entry matching media schema (hash, page, url, name, etc.)
 */
function toLinkedContentEntry(filePath, linkedPages, fileEvent, status) {
  // NOTE(review): pages are comma-joined here, while the README documents
  // pipe-separated page lists for media entries — confirm consumers accept both.
  const pageVal = linkedPages.length > 0 ? linkedPages.join(',') : '';
  return {
    hash: filePath,
    page: pageVal,
    url: '',
    name: filePath.split('/').pop() || filePath,
    timestamp: fileEvent.timestamp,
    user: fileEvent.user || '',
    operation: 'auditlog-parsed',
    type: getFileType(filePath),
    status,
    source: 'auditlog-parsed',
  };
}

/**
 * Process page-level media updates for incremental indexing
 * Compares old index entries with new medialog to detect additions/removals
 * @param {Array} updatedIndex - Index being built (mutated)
 * @param {Map} pagesByPath - Map of page path to events
 * @param {Array} medialogEntries - New medialog entries
 * @param {Function} onLog - Logging callback
 * @returns {{added: number, removed: number}} Counts
 */
function processPageMediaUpdates(updatedIndex, pagesByPath, medialogEntries, onLog) {
  let added = 0;
  let removed = 0;

  pagesByPath.forEach((pageEvents, normalizedPath) => {
    // pageEvents are sorted newest-first; [0] is the latest preview event.
    const latestEvent = pageEvents[0];
    const latestTs = latestEvent.timestamp;
const windowStart = latestTs; + const windowEnd = latestTs + INCREMENTAL_WINDOW_MS; + + onLog(`--- Page: ${normalizedPath} ---`); + onLog(` Latest preview: ${latestTs} (${new Date(latestTs).toISOString()})`); + onLog(` Window: [${windowStart}-${windowEnd}] (${INCREMENTAL_WINDOW_MS / 1000}s)`); + + const matchesPage = (m) => m.resourcePath && m.resourcePath === normalizedPath; + const pageMedialogAll = medialogEntries.filter(matchesPage); + const inWindow = (m) => m.timestamp >= windowStart && m.timestamp < windowEnd; + const newPageMedia = pageMedialogAll.filter(inWindow); + const outsideWindow = pageMedialogAll.filter((m) => !newPageMedia.includes(m)); + + if (pageMedialogAll.length > 0) { + onLog(` Medialog for page: ${pageMedialogAll.length} total, ${newPageMedia.length} in window, ${outsideWindow.length} outside`); + if (outsideWindow.length > 0) { + outsideWindow.slice(0, 3).forEach((m) => { + onLog(` Outside: hash=${m.mediaHash} ts=${m.timestamp} (${new Date(m.timestamp).toISOString()})`); + }); + } + } + + const oldPageEntries = updatedIndex.filter((e) => e.page === normalizedPath); + const oldHashes = new Set(oldPageEntries.map((e) => e.hash)); + const newHashes = new Set(newPageMedia.map((m) => m.mediaHash)); + + onLog(` Old (index): ${oldHashes.size} hashes ${[...oldHashes].slice(0, 5).join(', ')}${oldHashes.size > 5 ? '...' : ''}`); + onLog(` New (medialog in window): ${newHashes.size} hashes ${[...newHashes].slice(0, 5).join(', ')}${newHashes.size > 5 ? '...' 
: ''}`); + + /** + * Edge case: Page was previewed but no media in the time window + * Scenario: User previewed page, removed all media, then previewed again + * Decision: Remove all old media entries for this page (assume removal intended) + * Alternative considered: Keep old entries (assume no change) + * Rationale: Preview action signals intent to update; empty medialog = intentional removal + * Assumption: Events are processed in timestamp order + */ + if (newPageMedia.length === 0 && oldPageEntries.length > 0) { + onLog(' Edge case: Page previewed with no media in window - removing old entries'); + const rm = removeMediaMaybeAddOrphan; + oldPageEntries.forEach((oldEntry) => { + removed += rm(updatedIndex, oldEntry, normalizedPath, medialogEntries); + }); + return; + } + + const toRemove = [...oldHashes].filter((h) => !newHashes.has(h)); + const toAdd = [...newHashes].filter((h) => !oldHashes.has(h)); + const unchanged = [...newHashes].filter((h) => oldHashes.has(h)); + + if (toRemove.length || toAdd.length) { + onLog(` Diff: remove ${toRemove.length} (${toRemove.slice(0, 3).join(', ')}${toRemove.length > 3 ? '...' 
: ''}), add ${toAdd.length}`); + } + + const rm = removeMediaMaybeAddOrphan; + toRemove.forEach((hash) => { + const oldEntry = oldPageEntries.find((e) => e.hash === hash); + if (oldEntry) { + removed += rm(updatedIndex, oldEntry, normalizedPath, medialogEntries); + } + }); + + toAdd.forEach((hash) => { + const media = newPageMedia.find((m) => m.mediaHash === hash); + if (media) { + updatedIndex.push({ + hash: media.mediaHash, + page: normalizedPath, + url: media.path, + name: extractName(media), + timestamp: media.timestamp, + user: media.user, + operation: media.operation, + type: detectMediaType(media), + status: 'referenced', + }); + added += 1; + } + }); + + unchanged.forEach((hash) => { + const idx = updatedIndex.findIndex((e) => e.hash === hash && e.page === normalizedPath); + const media = newPageMedia.find((m) => m.mediaHash === hash); + if (idx !== -1 && media) { + updatedIndex[idx].timestamp = media.timestamp; + } + }); + }); + + return { added, removed }; +} + +/** + * Process standalone media uploads (no page association) + * @param {Array} updatedIndex - Index being built (mutated) + * @param {Array} medialogEntries - New medialog entries + * @param {Set} referencedHashes - Already referenced media hashes + * @returns {number} Added count + */ +function processStandaloneUploads(updatedIndex, medialogEntries, referencedHashes) { + let added = 0; + const standaloneUploads = medialogEntries.filter((m) => !m.resourcePath && m.originalFilename); + + standaloneUploads.forEach((media) => { + if (!referencedHashes.has(media.mediaHash)) { + const exists = updatedIndex.some((e) => e.hash === media.mediaHash && !e.page); + if (!exists) { + updatedIndex.push({ + hash: media.mediaHash, + page: '', + url: media.path, + name: media.originalFilename.split('/').pop(), + timestamp: media.timestamp, + user: media.user, + operation: media.operation, + type: detectMediaType(media), + status: 'unused', + }); + added += 1; + } + } + }); + + return added; +} + +/** + * 
Process linked content (PDFs, SVGs, fragments) for incremental index + * @param {Array} updatedIndex - Index being built (mutated) + * @param {Array} files - File events from auditlog + * @param {Array} pages - Page events + * @param {Function} onProgress - Progress callback + * @param {Function} onLog - Log callback + * @returns {Promise<{added: number, removed: number}>} Counts + */ +async function processLinkedContentIncremental(updatedIndex, files, pages, onProgress, onLog) { + let added = 0; + let removed = 0; + + const filesByPath = new Map(); + files.forEach((e) => { + if (!isPdfOrSvg(e.path) && !isFragment(e.path)) return; + const p = e.path; + const existing = filesByPath.get(p); + if (!existing || e.timestamp > existing.timestamp) filesByPath.set(p, e); + }); + + const deletedPaths = new Set(); + filesByPath.forEach((event, path) => { + if (event.method === 'DELETE') deletedPaths.add(path); + }); + + // Remove deleted linked content + deletedPaths.forEach((path) => { + const idx = updatedIndex.findIndex( + (e) => (e.operation === 'auditlog-parsed' || e.source === 'auditlog-parsed') && e.hash === path, + ); + if (idx !== -1) { + updatedIndex.splice(idx, 1); + removed += 1; + onLog(`Removed linked content (DELETE): ${path}`); + } + }); + + // Build usage map + onProgress({ stage: 'processing', message: 'Building usage map for linked content...', percent: 83 }); + const usageMap = await buildContentUsageMap(pages, (p) => onProgress(p)); + + const allLinkedPaths = new Set(filesByPath.keys()); + ['pdfs', 'svgs', 'fragments'].forEach((key) => { + usageMap[key]?.forEach((_, path) => allLinkedPaths.add(path)); + }); + + // Add existing linked content paths whose pages were parsed + const parsedPages = new Set(pages.map((p) => normalizePath(p.path))); + updatedIndex.forEach((e) => { + const isLinkedContent = e.operation === 'auditlog-parsed' || e.source === 'auditlog-parsed'; + if (!isLinkedContent) return; + const entryPages = (e.page || '').split(',').map((p) => 
p.trim()).filter(Boolean); + if (entryPages.some((p) => parsedPages.has(p))) { + allLinkedPaths.add(e.hash); + } + }); + + allLinkedPaths.forEach((filePath) => { + if (deletedPaths.has(filePath)) return; + + let key = 'fragments'; + if (isPdf(filePath)) key = 'pdfs'; + else if (isSvg(filePath)) key = 'svgs'; + const linkedPages = usageMap[key]?.get(filePath) || []; + const status = linkedPages.length > 0 ? 'referenced' : 'file-unused'; + const fileEvent = filesByPath.get(filePath) || { timestamp: 0, user: '' }; + + const isLinked = (e) => (e.operation === 'auditlog-parsed' || e.source === 'auditlog-parsed') + && e.hash === filePath; + const existingIdx = updatedIndex.findIndex(isLinked); + + if (existingIdx !== -1) { + updatedIndex[existingIdx].page = linkedPages.length > 0 ? linkedPages.join(',') : ''; + updatedIndex[existingIdx].timestamp = fileEvent.timestamp; + updatedIndex[existingIdx].status = status; + } else { + updatedIndex.push(toLinkedContentEntry(filePath, linkedPages, fileEvent, status)); + added += 1; + } + }); + + return { added, removed }; +} + +/** + * Incremental re-index: fetch logs since lastFetchTime, merge with existing index. + * Detects additions, removals, and updates per page. 
+ * @param {Function} onProgress - Progress callback + * @param {Function} [onLog] - Optional debug log callback for per-page details + */ +export async function buildIncrementalIndex(onProgress, onLog = noop) { + const metaPath = `${sitePath}/.da/mediaindex/medialog-meta.json`; + const indexPath = `${sitePath}/.da/mediaindex/media-index.json`; + const meta = await loadMeta(metaPath); + const lastFetchTime = meta?.lastFetchTime; + + if (!lastFetchTime) { + throw new Error('Cannot run incremental: meta missing lastFetchTime'); + } + + onLog(`lastFetchTime: ${lastFetchTime} (${new Date(lastFetchTime).toISOString()})`); + onProgress({ + stage: 'starting', + message: 'Mode: Incremental re-index (since last build)', + percent: 5, + }); + + onProgress({ stage: 'loading', message: 'Loading existing index...', percent: 8 }); + const existingIndex = await loadIndex(indexPath); + + onLog(`Fetching auditlog since ${new Date(lastFetchTime).toISOString()}`); + onProgress({ stage: 'fetching', message: 'Fetching new auditlog entries...', percent: 15 }); + const auditlogEntries = await fetchFromAdminAPI('log', org, repo, ref, lastFetchTime, API_PAGE_SIZE, (entries, hasMore) => { + onProgress({ + stage: 'fetching', + message: `Fetched ${entries.length} auditlog entries${hasMore ? ' (more available)' : ''}...`, + percent: 25, + }); + }); + + const validEntries = auditlogEntries.filter((e) => e && e.path && e.route === 'preview'); + const pages = validEntries.filter((e) => isPage(e.path)); + + onProgress({ stage: 'fetching', message: 'Fetching new medialog entries...', percent: 35 }); + const medialogEntries = await fetchFromAdminAPI('medialog', org, repo, ref, lastFetchTime, API_PAGE_SIZE, (entries, hasMore) => { + onProgress({ + stage: 'fetching', + message: `Fetched ${entries.length} medialog entries${hasMore ? 
' (more available)' : ''}...`, + percent: 45, + }); + }); + + if (pages.length === 0 && medialogEntries.length === 0) { + onProgress({ + stage: 'complete', + message: 'No new activity since last build - index unchanged', + percent: 100, + }); + return existingIndex; + } + + onLog(`Auditlog: ${auditlogEntries.length} entries, ${pages.length} pages`); + onLog(`Medialog: ${medialogEntries.length} entries (all since lastFetchTime)`); + onProgress({ + stage: 'processing', + message: `Processing ${pages.length} pages with ${medialogEntries.length} medialog entries...`, + percent: 55, + }); + + const updatedIndex = [...existingIndex]; + + const pagesByPath = new Map(); + pages.forEach((e) => { + const p = normalizePath(e.path); + if (!pagesByPath.has(p)) pagesByPath.set(p, []); + pagesByPath.get(p).push(e); + }); + + /** + * Indexing strategy for multiple preview events per page + * Rule: Process only the LATEST preview event per page, skip others + * Rationale: Latest preview represents current state; earlier previews are superseded + * Example: Page previewed at T1, T2, T3 → only process T3's media associations + * Trade-off: Simpler logic, potential to miss media if window misaligned (acceptable) + */ + pagesByPath.forEach((events) => { + events.sort((a, b) => b.timestamp - a.timestamp); + }); + onLog(`Time window: ${INCREMENTAL_WINDOW_MS / 1000}s (medialog within window of latest preview)`); + onLog(`Pages to process: ${pagesByPath.size} (${[...pagesByPath.keys()].join(', ')})`); + onLog(`Medialog entries since lastFetch: ${medialogEntries.length}`); + + // Process page-level media updates + const pageResults = processPageMediaUpdates(updatedIndex, pagesByPath, medialogEntries, onLog); + let { added, removed } = pageResults; + + // Calculate referenced hashes for standalone upload processing + const referencedHashes = new Set( + updatedIndex.filter((e) => e.page).flatMap((e) => e.hash), + ); + + // Process standalone uploads + const standaloneAdded = 
processStandaloneUploads(updatedIndex, medialogEntries, referencedHashes); + added += standaloneAdded; + + // Process linked content + const files = validEntries.filter((e) => !isPage(e.path)); + const linkedResults = await processLinkedContentIncremental( + updatedIndex, + files, + pages, + onProgress, + onLog, + ); + added += linkedResults.added; + removed += linkedResults.removed; + + onProgress({ + stage: 'processing', + message: `Incremental: +${added} added, -${removed} removed, total: ${updatedIndex.length}`, + percent: 85, + }); + + onProgress({ stage: 'saving', message: `Saving ${updatedIndex.length} entries...`, percent: 90 }); + + const formData = await createSheet(updatedIndex); + await daFetch(`${DA_ADMIN}/source${indexPath}`, { + method: 'POST', + body: formData, + }); + + await saveMeta({ + lastFetchTime: Date.now(), + entriesCount: updatedIndex.length, + lastRefreshBy: 'media-indexer', + lastBuildMode: 'incremental', + }, metaPath); + + onProgress({ + stage: 'complete', + message: `Incremental complete! 
${updatedIndex.length} entries (${added} added, ${removed} removed)`, + percent: 100, + }); + + return updatedIndex; +} + +export async function buildInitialIndex(onProgress) { + const index = []; + const buildMode = 'full'; // incremental not yet implemented + + onProgress({ + stage: 'starting', + message: 'Mode: Full build (rebuilding from auditlog + medialog)', + percent: 5, + }); + + // Phase 1: Stream auditlog, build maps (no full accumulation) + onProgress({ stage: 'fetching', message: 'Fetching auditlog (streaming)...', percent: 10 }); + + const pagesByPath = new Map(); // normalizedPath -> [events] sorted desc + const filesByPath = new Map(); // path -> latest event + const deletedPaths = new Set(); + let auditlogCount = 0; + + await fetchFromAdminAPIStreaming('log', org, repo, ref, null, API_PAGE_SIZE, (chunk) => { + const rawCount = chunk.length; + const droppedNoPath = chunk.filter((e) => !e?.path).length; + const droppedRoute = chunk.filter((e) => e?.path && e.route !== 'preview').length; + if (droppedNoPath > 0 || droppedRoute > 0) { + logger.debug(`[auditlog chunk] raw=${rawCount}, dropped(no path)=${droppedNoPath}, dropped(route!==preview)=${droppedRoute}`); + } + chunk.forEach((e) => { + if (!e?.path || e.route !== 'preview') return; + auditlogCount += 1; + if (isPage(e.path)) { + const p = normalizePath(e.path); + if (!pagesByPath.has(p)) pagesByPath.set(p, []); + pagesByPath.get(p).push(e); + } else { + const fp = normalizeFilePath(e.path); + const existing = filesByPath.get(fp); + if (!existing || e.timestamp > existing.timestamp) { + filesByPath.set(fp, e); + } + } + }); + onProgress({ + stage: 'fetching', + message: `Auditlog: ${auditlogCount} entries, ${pagesByPath.size} pages...`, + percent: 15, + }); + }); + + pagesByPath.forEach((events) => events.sort((a, b) => b.timestamp - a.timestamp)); + + const pages = []; + pagesByPath.forEach((events) => pages.push(...events)); + + /** + * Deletion detection strategy: Only mark as deleted if LATEST 
event is DELETE + * Rationale: If a file was deleted then re-added, the latest event reflects current state + * Assumption: filesByPath contains only the latest event per path (maintained above) + * Example timeline: DELETE at T1, POST at T2 → latest=POST → not deleted (correct) + */ + filesByPath.forEach((event, path) => { + if (isLinkedContentPath(path) && event.method === 'DELETE') { + deletedPaths.add(path); + } + }); + + const iconPathsFromAuditlog = [...filesByPath.keys()].filter((p) => p.includes('/icons/')); + const iconPathsInDeleted = [...deletedPaths].filter((p) => p.includes('/icons/')); + logger.debug(`[auditlog done] total=${auditlogCount}, pages=${pagesByPath.size}, files=${filesByPath.size}, deleted=${deletedPaths.size}`); + logger.debug(` icon paths from auditlog: [${iconPathsFromAuditlog.join(', ') || 'none'}]`); + logger.debug(` icon paths in deletedPaths: [${iconPathsInDeleted.join(', ') || 'none'}]`); + + onProgress({ + stage: 'fetching', + message: `Identified ${pages.length} page events, ${filesByPath.size} files`, + percent: 25, + }); + + // Phase 2: Stream medialog, process each chunk (no full accumulation) + onProgress({ stage: 'fetching', message: 'Fetching medialog (streaming)...', percent: 30 }); + + const entryMap = new Map(); + const referencedHashes = new Set(); + const standaloneBuffer = []; + let medialogCount = 0; + + await fetchFromAdminAPIStreaming('medialog', org, repo, ref, null, API_PAGE_SIZE, (chunk) => { + logger.debug(`[medialog chunk] ${chunk.length} entries`); + chunk.forEach((media) => { + medialogCount += 1; + if (media.resourcePath) { + const matches = findMatchingPageEvents(pagesByPath, media.resourcePath, media.timestamp); + matches.forEach((pageEvent) => { + const normalizedPath = normalizePath(pageEvent.path); + const hash = media.mediaHash; + const key = `${hash}|${normalizedPath}`; + const existing = entryMap.get(key); + if (!existing || media.timestamp > existing.timestamp) { + entryMap.set(key, { + hash, + 
page: normalizedPath, + url: media.path, + name: extractName(media), + timestamp: media.timestamp, + user: media.user, + operation: media.operation, + type: detectMediaType(media), + status: 'referenced', + }); + } + referencedHashes.add(hash); + }); + } else if (media.originalFilename) { + standaloneBuffer.push(media); + } + }); + const mem = checkMemory(); + if (mem.warning) { + onProgress({ + stage: 'processing', + message: `Memory: ${mem.usedMB.toFixed(0)}MB / ${mem.limitMB.toFixed(0)}MB`, + percent: 35, + }); + } else { + onProgress({ + stage: 'fetching', + message: `Medialog: ${medialogCount} entries processed...`, + percent: 35, + }); + } + }); + + onProgress({ + stage: 'processing', + message: `Processed ${medialogCount} medialog, ${entryMap.size} page refs`, + percent: 60, + }); + + // Phase 3: Process standalone uploads + standaloneBuffer.forEach((media) => { + const hash = media.mediaHash; + if (!referencedHashes.has(hash)) { + const key = `${hash}|`; + const existing = entryMap.get(key); + if (!existing || media.timestamp > existing.timestamp) { + entryMap.set(key, { + hash, + page: '', + url: media.path, + name: media.originalFilename.split('/').pop(), + timestamp: media.timestamp, + user: media.user, + operation: media.operation, + type: detectMediaType(media), + status: 'unused', + }); + } + } + }); + + onProgress({ + stage: 'processing', + message: `Standalone: ${standaloneBuffer.length}, total: ${entryMap.size}`, + percent: 70, + }); + + // Convert Map to array + entryMap.forEach((entry) => { + index.push(entry); + }); + + // Phase 5: Linked content (PDFs, SVGs, fragments) - parse pages for usage + onProgress({ stage: 'processing', message: 'Building content usage map (parsing pages)...', percent: 78 }); + const usageMap = await buildContentUsageMap(pages, (p) => onProgress(p)); + + const linkedFilesByPath = new Map(); + filesByPath.forEach((e, p) => { + if (!isPdfOrSvg(p) && !isFragment(p)) return; + linkedFilesByPath.set(p, e); + }); + + const 
usageKey = (path) => { + if (isPdf(path)) return 'pdfs'; + if (isSvg(path)) return 'svgs'; + return 'fragments'; + }; + + const allLinkedPaths = new Set(linkedFilesByPath.keys()); + ['pdfs', 'svgs', 'fragments'].forEach((key) => { + usageMap[key]?.forEach((_, path) => allLinkedPaths.add(path)); + }); + + const iconPathsInAllLinked = [...allLinkedPaths].filter((p) => p.includes('/icons/')); + logger.debug(`[linked content] linkedFilesByPath=${linkedFilesByPath.size}, allLinkedPaths=${allLinkedPaths.size} (after merge with usageMap) | icon paths: [${iconPathsInAllLinked.join(', ') || 'none'}]`); + + allLinkedPaths.forEach((filePath) => { + if (deletedPaths.has(filePath)) { + if (filePath.includes('/icons/')) { + logger.debug(`[linked content] SKIP (in deletedPaths): ${filePath}`); + } + return; + } + const key = usageKey(filePath); + const linkedPages = usageMap[key]?.get(filePath) || []; + const status = linkedPages.length > 0 ? 'referenced' : 'file-unused'; + const fileEvent = linkedFilesByPath.get(filePath) || { timestamp: 0, user: '' }; + index.push(toLinkedContentEntry(filePath, linkedPages, fileEvent, status)); + }); + + const linkedContentCount = index.length - entryMap.size; + const iconEntriesInIndex = index.filter((e) => e.hash?.includes?.('/icons/')); + logger.debug(`[full build done] media=${entryMap.size}, linked content=${linkedContentCount}, total=${index.length} | icon entries in index: [${iconEntriesInIndex.map((e) => e.hash).join(', ') || 'none'}]`); + + onProgress({ + stage: 'processing', + message: `Added ${allLinkedPaths.size} linked content entries (PDFs, SVGs, fragments)`, + percent: 82, + }); + + onProgress({ stage: 'saving', message: `Saving ${index.length} entries...`, percent: 90 }); + + const indexPath = `${sitePath}/.da/mediaindex/media-index.json`; + const formData = await createSheet(index); + await daFetch(`${DA_ADMIN}/source${indexPath}`, { + method: 'POST', + body: formData, + }); + + await saveMeta({ + lastFetchTime: Date.now(), + 
entriesCount: index.length, + lastRefreshBy: 'media-indexer', + lastBuildMode: buildMode, + }, `${sitePath}/.da/mediaindex/medialog-meta.json`); + + onProgress({ stage: 'complete', message: `Complete! ${index.length} entries indexed`, percent: 100 }); + + return index; +} diff --git a/tools/media/lib/config.js b/tools/media/lib/config.js new file mode 100644 index 0000000..621094a --- /dev/null +++ b/tools/media/lib/config.js @@ -0,0 +1,39 @@ +/** + * Configuration and state management for media indexer + */ + +/** + * Validate GitHub org/repo name to prevent injection attacks + * Allows: alphanumeric, hyphens, underscores, dots (standard GitHub naming) + * @param {string} name - Org or repo name + * @returns {string|null} Validated name or null if invalid + */ +function validateGitHubName(name) { + if (!name || typeof name !== 'string') return null; + // GitHub allows alphanumeric, hyphens, underscores, dots + // Must not start/end with special chars, max 100 chars + const validPattern = /^[a-zA-Z0-9]([a-zA-Z0-9._-]{0,98}[a-zA-Z0-9])?$/; + return validPattern.test(name) ? name : null; +} + +// Parse URL parameters +const params = new URLSearchParams(window.location.search); +const rawOrg = params.get('org'); +const rawRepo = params.get('repo') || params.get('site'); + +export const org = validateGitHubName(rawOrg); +export const repo = validateGitHubName(rawRepo); +export const ref = 'main'; +export const sitePath = org && repo ? 
`/${org}/${repo}` : null; + +export const DA_ADMIN = 'https://admin.da.live'; + +export const state = { + building: false, + progress: { stage: 'idle', message: '', percent: 0 }, + buildStartTime: null, + errors: [], + logs: [], + status: null, + daToken: null, +}; diff --git a/tools/media/lib/helpers.js b/tools/media/lib/helpers.js new file mode 100644 index 0000000..23a0c5f --- /dev/null +++ b/tools/media/lib/helpers.js @@ -0,0 +1,177 @@ +/** + * Helper functions for path normalization, type detection, and name extraction + */ + +import * as logger from './logger.js'; + +/** + * Normalize path by removing query params and adding .md for pages + * @param {string} path - The path to normalize + * @returns {string} Normalized path + */ +export function normalizePath(path) { + if (!path) return ''; + let cleanPath = path.split('?')[0].split('#')[0]; + if (!cleanPath.includes('.') && !cleanPath.startsWith('/media/')) { + cleanPath = cleanPath === '/' || cleanPath === '' ? '/index.md' : `${cleanPath}.md`; + } + return cleanPath; +} + +/** + * Check if a path represents a page (not a media file or fragment) + * @param {string} path - The path to check + * @returns {boolean} True if path is a page + */ +export function isPage(path) { + if (!path || typeof path !== 'string') return false; + return (path.endsWith('.md') + || (!path.includes('.') && !path.startsWith('/media/'))) + && !path.includes('/fragments/'); +} + +/** + * Extract filename from medialog entry or path + * @param {object} mediaEntry - The medialog entry + * @returns {string} Extracted filename + */ +export function extractName(mediaEntry) { + if (!mediaEntry) return ''; + if (mediaEntry.originalFilename) { + return mediaEntry.originalFilename.split('/').pop(); + } + if (!mediaEntry.path) return ''; + return mediaEntry.path.split('?')[0].split('#')[0].split('/').pop(); +} + +/** Phase 2: Linked content type detection */ +export function isPdf(path) { + return path && path.toLowerCase().endsWith('.pdf'); 
+} + +export function isSvg(path) { + return path && path.toLowerCase().endsWith('.svg'); +} + +export function isFragment(path) { + return path && path.includes('/fragments/'); +} + +/** True if path is PDF, SVG, or fragment (linked content from auditlog) */ +export function isLinkedContentPath(path) { + return path && (isPdf(path) || isSvg(path) || isFragment(path)); +} + +/** Normalize file path for matching (ensure leading slash) */ +export function normalizeFilePath(path) { + if (!path) return ''; + const p = path.split('?')[0].split('#')[0].trim(); + return p.startsWith('/') ? p : `/${p}`; +} + +export function isPdfOrSvg(path) { + return isPdf(path) || isSvg(path); +} + +/** + * Get file type in same format as media: "category > extension" + * @param {string} path - File path + * @returns {string} e.g. "document > pdf", "image > svg", "content > fragment" + */ +export function getFileType(path) { + if (isPdf(path)) return 'document > pdf'; + if (isSvg(path)) return 'image > svg'; + if (isFragment(path)) return 'content > fragment'; + return 'unknown'; +} + +function toPath(href) { + if (!href) return ''; + try { + if (href.startsWith('http')) { + return new URL(href).pathname; + } + return href.startsWith('/') ? 
href : `/${href}`; + } catch (error) { + logger.error(`Failed to parse URL ${href}:`, error.message); + return href; + } +} + +/** Markdown link regex: [text](url) or ![alt](url) - captures URL in group 1 */ +const MD_LINK_RE = /\[[^\]]*\]\(([^)]+)\)/gi; + +/** Markdown autolink: - captures URL in group 1 */ +const MD_AUTOLINK_RE = /<(https?:\/\/[^>]+|\/[^>\s]*)>/g; + +/** Icon shorthand: :iconname: → /icons/iconname.svg */ +const ICON_RE = /:([a-zA-Z0-9-]+):/g; +/** Exclude doc terms like "with :svg: syntax" to avoid false positives */ +const ICON_DOC_EXCLUDE = new Set(['svg', 'pdf', 'image', 'link', 'syntax']); + +/** + * Extract all URLs from markdown: [text](url), ![alt](url), and autolinks + * @param {string} md - Raw markdown + * @returns {string[]} - URLs from link syntax + */ +function extractUrlsFromMarkdown(md) { + if (!md || typeof md !== 'string') return []; + const fromLinks = [...md.matchAll(MD_LINK_RE)].map((m) => m[1].trim()); + const fromAutolinks = [...md.matchAll(MD_AUTOLINK_RE)].map((m) => m[1].trim()); + return [...fromLinks, ...fromAutolinks]; +} + +/** + * Extract icon references from :iconname: shorthand (resolves to /icons/iconname.svg) + * @param {string} md - Raw markdown + * @returns {string[]} - Normalized paths like /icons/headset.svg + */ +export function extractIconReferences(md) { + if (!md || typeof md !== 'string') return []; + const matches = [...md.matchAll(ICON_RE)]; + return [...new Set( + matches + .filter((m) => !ICON_DOC_EXCLUDE.has(m[1].toLowerCase())) + .map((m) => `/icons/${m[1]}.svg`), + )]; +} + +/** + * Extract fragment references from markdown (links to /fragments/...) + * @param {string} md - Raw markdown + * @returns {string[]} - Normalized paths + */ +export function extractFragmentReferences(md) { + const urls = extractUrlsFromMarkdown(md); + return [...new Set(urls.filter((u) => u.includes('/fragments/')).map((u) => toPath(u)))]; +} + +/** + * Extract links matching pattern (e.g. 
.pdf, .svg) from markdown + * @param {string} md - Raw markdown + * @param {RegExp} pattern - Pattern to match (e.g. /\.pdf$/) + * @returns {string[]} - Normalized paths + */ +export function extractLinks(md, pattern) { + const urls = extractUrlsFromMarkdown(md); + const pathPart = (u) => u.split('?')[0].split('#')[0]; + return [...new Set(urls.filter((u) => pattern.test(pathPart(u))).map((u) => toPath(u)))]; +} + +/** + * Detect media type from contentType in structured format + * @param {object} mediaEntry - The medialog entry + * @returns {string} Type in format "category > extension" + */ +export function detectMediaType(mediaEntry) { + const contentType = mediaEntry.contentType || ''; + if (contentType.startsWith('image/')) { + const ext = contentType.split('/')[1]; + return `img > ${ext}`; + } + if (contentType.startsWith('video/')) { + const ext = contentType.split('/')[1]; + return `video > ${ext}`; + } + return 'unknown'; +} diff --git a/tools/media/lib/logger.js b/tools/media/lib/logger.js new file mode 100644 index 0000000..5d7f752 --- /dev/null +++ b/tools/media/lib/logger.js @@ -0,0 +1,79 @@ +/** + * Logging utility with configurable log levels + */ + +const LOG_LEVELS = { + DEBUG: 0, + INFO: 1, + WARN: 2, + ERROR: 3, + NONE: 4, +}; + +/** + * Logger configuration + * Set LOG_LEVEL to control verbosity in production + */ +const config = { + // Change to LOG_LEVELS.INFO or LOG_LEVELS.WARN for production + level: LOG_LEVELS.DEBUG, + prefix: '[MediaIndexer]', +}; + +/** + * Set log level + * @param {number} level - Log level from LOG_LEVELS + */ +export function setLogLevel(level) { + config.level = level; +} + +/** + * Debug logging - verbose details for development + * @param {string} message - Log message + * @param {...any} args - Additional arguments + */ +export function debug(message, ...args) { + if (config.level <= LOG_LEVELS.DEBUG) { + // eslint-disable-next-line no-console + console.log(`${config.prefix}[DEBUG]`, message, ...args); + } +} + 
+/** + * Info logging - general information + * @param {string} message - Log message + * @param {...any} args - Additional arguments + */ +export function info(message, ...args) { + if (config.level <= LOG_LEVELS.INFO) { + // eslint-disable-next-line no-console + console.log(`${config.prefix}[INFO]`, message, ...args); + } +} + +/** + * Warning logging - potential issues + * @param {string} message - Log message + * @param {...any} args - Additional arguments + */ +export function warn(message, ...args) { + if (config.level <= LOG_LEVELS.WARN) { + // eslint-disable-next-line no-console + console.warn(`${config.prefix}[WARN]`, message, ...args); + } +} + +/** + * Error logging - failures and exceptions + * @param {string} message - Log message + * @param {...any} args - Additional arguments + */ +export function error(message, ...args) { + if (config.level <= LOG_LEVELS.ERROR) { + // eslint-disable-next-line no-console + console.error(`${config.prefix}[ERROR]`, message, ...args); + } +} + +export { LOG_LEVELS }; diff --git a/tools/media/lib/ui.js b/tools/media/lib/ui.js new file mode 100644 index 0000000..2e4e0cc --- /dev/null +++ b/tools/media/lib/ui.js @@ -0,0 +1,175 @@ +/** + * UI rendering and event handling + */ + +import { state, org, repo } from './config.js'; +import { + buildInitialIndex, buildIncrementalIndex, shouldReindex, getIndexStatus, +} from './builder.js'; + +export function render() { + const app = document.getElementById('app'); + + const statusHtml = state.status ? ` +
+

Current Index Status

+
+
+ + ${state.status.lastRefresh ? new Date(state.status.lastRefresh).toLocaleString() : 'Never'} +
+
+ + ${state.status.entriesCount || 0} +
+ ${state.status.lastBuildMode ? ` +
+ + ${state.status.lastBuildMode === 'full' ? 'Full rebuild' : 'Incremental'} +
+ ` : ''} + ${state.status.indexLastModified != null ? ` +
+ + ${new Date(state.status.indexLastModified).toLocaleString()} +
+ ` : ''} +
+
+ ` : '
Checking status...
'; + + const elapsedMs = state.buildStartTime ? Date.now() - state.buildStartTime : 0; + const elapsedStr = elapsedMs >= 1000 ? `${(elapsedMs / 1000).toFixed(1)}s` : `${elapsedMs}ms`; + const pct = state.progress.percent; + const etaMs = pct > 0 && pct < 100 ? (elapsedMs / pct) * (100 - pct) : 0; + const etaStr = etaMs > 0 ? `~${(etaMs / 1000).toFixed(1)}s` : ''; + const { totalMs } = state.progress; + let totalStr = ''; + if (totalMs != null) { + totalStr = totalMs >= 1000 ? `${(totalMs / 1000).toFixed(1)}s` : `${totalMs}ms`; + } + + const timingHtml = state.progress.stage === 'complete' && totalStr + ? `Total: ${totalStr}` + : `Elapsed: ${elapsedStr}${etaStr ? `ETA: ${etaStr}` : ''}`; + + const progressHtml = state.building || state.progress.stage !== 'idle' ? ` +
+

Progress

+
+
+
+
+ ${state.progress.stage} + ${state.progress.message} +
+
+ ${timingHtml} +
+
+ ` : ''; + + const logsHtml = state.logs.length > 0 ? ` +
+

Logs (${state.logs.length})

+
    + ${state.logs.map((log) => `
  • ${log.message}
  • `).join('')} +
+
+ ` : ''; + + const errorsHtml = state.errors.length > 0 ? ` +
+

Errors (${state.errors.length})

+
    + ${state.errors.map((err) => `
  • ${err.message}
  • `).join('')} +
+
+ ` : ''; + + app.innerHTML = ` +

Media Index Builder

+

Building index for: ${org}/${repo}

+ + ${statusHtml} + +
+ +
+ + ${progressHtml} + ${errorsHtml} + ${logsHtml} + `; +} + +export function attachEventListeners() { + if (!state.building) { + const buildBtn = document.getElementById('buildBtn'); + if (buildBtn) { + buildBtn.addEventListener('click', () => { + state.building = true; + state.buildStartTime = Date.now(); + state.errors = []; + state.logs = []; + state.progress = { stage: 'starting', message: 'Checking build mode...', percent: 0 }; + render(); + + const runBuild = (useIncremental) => { + const buildFn = useIncremental ? buildIncrementalIndex : buildInitialIndex; + const onLog = (msg) => { + state.logs.push({ message: msg, type: 'info' }); + render(); + }; + return buildFn( + (progress) => { + let finalProgress = progress; + if (progress.stage === 'complete' && state.buildStartTime) { + const totalMs = Date.now() - state.buildStartTime; + const totalStr = totalMs >= 1000 ? `${(totalMs / 1000).toFixed(1)}s` : `${totalMs}ms`; + finalProgress = { ...progress, message: `${progress.message} (${totalStr})`, totalMs }; + } + state.progress = finalProgress; + state.logs.push({ message: finalProgress.message, type: 'info' }); + render(); + }, + useIncremental ? onLog : undefined, + ); + }; + + shouldReindex() + .then(({ shouldReindex: useIncremental, reason }) => { + if (reason && !useIncremental) { + state.logs.push({ message: `Full build: ${reason}`, type: 'info' }); + } + return runBuild(useIncremental); + }) + .then((result) => { + const totalMs = state.buildStartTime ? Date.now() - state.buildStartTime : 0; + const totalStr = totalMs >= 1000 ? 
`${(totalMs / 1000).toFixed(1)}s` : `${totalMs}ms`; + state.logs.push({ + message: `Index built successfully: ${result.length} entries (${totalStr})`, + type: 'success', + }); + return getIndexStatus(); + }) + .then((status) => { + state.status = status; + }) + .catch((error) => { + state.errors.push({ message: error.message }); + state.logs.push({ message: `Error: ${error.message}`, type: 'error' }); + state.progress = { stage: 'error', message: error.message, percent: 0 }; + }) + .finally(() => { + state.building = false; + state.buildStartTime = null; + render(); + attachEventListeners(); + }); + }); + } + } +} diff --git a/tools/media/media-indexing-strategy.md b/tools/media/media-indexing-strategy.md new file mode 100644 index 0000000..752d5da --- /dev/null +++ b/tools/media/media-indexing-strategy.md @@ -0,0 +1,2160 @@ +# Media Indexing Strategy for AEM Sites +**Date:** February 24, 2026 +**Author:** Testing & Analysis with Claude Code +**Project:** Media Library Integration with Medialog & Auditlog + +--- + +## Summary + +Strategy for building a media index by combining **AEM Auditlog** and **Medialog** APIs, including operational architecture for backfilling historical data, initial index population, and ongoing refresh mechanisms. 
+ +### Key Findings + +- **Two Log Sources:** Auditlog tracks page/file previews; Medialog tracks Media Bus items (images/videos) +- **Temporal Relationship:** Auditlog entries precede medialog by 1.5-2 seconds +- **Path Matching:** Critical to normalize paths (`/drafts/page` vs `/drafts/page.md`) +- **Media Bus vs Content Delivery:** Images/videos use Media Bus (tracked in medialog), PDFs/SVGs/Fragments use content delivery (auditlog only) +- **Parsing Required:** Fragments, PDFs, and SVGs require parsing page content to determine usage relationships +- **Duplicate Events:** Every preview creates new log entries, even without content changes +- **Time Window:** Use 5-second window to match medialog entries to auditlog events + +### Operational Architecture + +- **Two-Tier Backfill:** Separate CLI tool for medialog backfill, separate DA app for index population +- **Historical Coverage:** Supports sites created before medialog existed (2023+) via Status API parsing +- **Initial Setup:** One-time engineer-run process per repository (30-60 minutes total) +- **Ongoing Refresh:** Browser-based auto-refresh every 10 minutes with distributed locking +- **Multi-User Support:** Distributed lock prevents race conditions across concurrent users +- **Scalability:** Handles sites with 10,000+ pages and 50,000+ media items + +--- + +## Log Relationships + +### Auditlog vs Medialog + +| Aspect | Auditlog | Medialog | +|--------|----------|----------| +| **Purpose** | Tracks all preview/publish actions | Tracks Media Bus activity only | +| **Scope** | Pages, PDFs, SVGs, Fragments, images, videos | Images and videos only | +| **Timing** | Logged first (T) | Logged ~1.5-2s later (T+1500ms) | +| **Path Format** | `/drafts/page` | `/drafts/page.md` | +| **Contains** | Page-level events | Media-level events with `resourcePath` | + +### Linking Strategy + +``` +Auditlog Entry Medialog Entries +┌─────────────────────┐ ┌──────────────────────────┐ +│ path: /drafts/page │ ───────>│ 
resourcePath: /drafts/ │ +│ timestamp: 1000 │ match │ page.md │ +│ │ by: │ timestamp: 1001-1005 │ +└─────────────────────┘ └──────────────────────────┘ + │ + │ Multiple media + ▼ + All have same timestamp +``` + +**Matching Rules:** +1. Normalize paths: `auditlog.path` + `.md` = `medialog.resourcePath` +2. Time window: `medialog.timestamp` within 5 seconds after `auditlog.timestamp` +3. Group medialog entries by `(resourcePath, timestamp)` to find page's media + +--- + +## Content Types & Tracking Methods + +### Complete Matrix + +| Content Type | Delivery Method | Auditlog | Medialog | Tracking Method | Usage Relationship | +|-------------|-----------------|----------|----------|-----------------|-------------------| +| **Images (embedded)** | Media Bus | Page event | Entry with resourcePath | Medialog linking | From resourcePath field | +| **Images (standalone)** | Media Bus | File event | Entry with originalFilename | Medialog linking | No usage (standalone) | +| **Videos (embedded)** | Media Bus | Page event | Entry with resourcePath | Medialog linking | From resourcePath field | +| **Videos (standalone)** | Media Bus | File event | Entry with originalFilename | Medialog linking | No usage (standalone) | +| **PDFs** | Content delivery | File event | None | Auditlog + Parse | Parse page HTML for links | +| **SVGs** | Content delivery | File event | None | Auditlog + Parse | Parse page HTML for links | +| **Fragments** | Content delivery | File event | None | Auditlog + Parse | Parse page HTML for references | +| **Icons** | Icons folder | None | None | Not tracked | N/A | + +### Why Different Tracking Methods? 
+ +**Media Bus Items (Images, Videos):** +- Deduplicated, content-addressed storage +- Hash-based URLs: `media_/` +- Tracked in medialog with `resourcePath` linking to pages +- **No parsing required** - logs provide complete relationships + +**Content Delivery Items (PDFs, SVGs, Fragments):** +- Regular file delivery +- Standard preview/publish lifecycle +- Not content-addressed +- **Parsing required** - logs don't link to containing pages + +**Source:** [AEM Media Documentation](https://www.aem.live/docs/media) + +--- + +## Test Scenarios & Results + +### Scenario A: Page with 3 Images + +**Action:** Created `/drafts/scenario-a.md` with 3 embedded images, previewed once + +**Auditlog Response:** +```json +{ + "entries": [ + { + "path": "/drafts/scenario-a", + "timestamp": 1771936397105, + "route": "preview", + "user": "user@example.com", + "status": 200 + } + ] +} +``` + +**Medialog Response:** +```json +{ + "entries": [ + { + "path": "https://main--repo--org.aem.live/media_a1b2c3/image1.jpg", + "operation": "ingest", + "timestamp": 1771936400523, + "resourcePath": "/drafts/scenario-a.md", + "contentType": "image/jpeg", + "user": "user@example.com", + "mediaHash": "a1b2c3", + "width": "2000", + "height": "1333" + }, + { + "path": "https://main--repo--org.aem.live/media_d4e5f6/image2.jpg", + "operation": "ingest", + "timestamp": 1771936400523, + "resourcePath": "/drafts/scenario-a.md", + "contentType": "image/jpeg", + "user": "user@example.com", + "mediaHash": "d4e5f6", + "width": "2000", + "height": "1500" + }, + { + "path": "https://main--repo--org.aem.live/media_g7h8i9/image3.jpg", + "operation": "ingest", + "timestamp": 1771936400523, + "resourcePath": "/drafts/scenario-a.md", + "contentType": "image/jpeg", + "user": "user@example.com", + "mediaHash": "g7h8i9", + "width": "2000", + "height": "1600" + } + ] +} +``` + +**Learning:** +- All media on same page share identical timestamp +- Media added through markup: `operation: "ingest"` WITHOUT `originalFilename` 
+- 3.4 second processing delay between logs +- `resourcePath` links media to page + +--- + +### Scenario B: Text-Only Page + +**Action:** Created `/drafts/scenario-b.md` with only text, previewed + +**Auditlog Response:** +```json +{ + "entries": [ + { + "path": "/drafts/scenario-b", + "timestamp": 1771936758410, + "route": "preview", + "user": "user@example.com", + "duration": 1112, + "status": 200 + } + ] +} +``` + +**Medialog Response:** +```json +{ + "entries": [] +} +``` + +**Learning:** +- Auditlog logs text-only pages +- Empty medialog = no Media Bus items on page +- Can detect "all media removed" pattern (with caveats) + +--- + +### Scenario H: Standalone Media Preview + +**Action:** Uploaded and previewed 3 standalone files: +- `/media/standalone-image.jpg` (image) +- `/media/standalone-doc.pdf` (PDF) +- `/media/standalone-graphic.svg` (SVG) + +**Auditlog Response:** +```json +{ + "entries": [ + { + "path": "/media/standalone-image.jpg", + "timestamp": 1771937123456, + "route": "preview", + "user": "user@example.com", + "status": 200 + }, + { + "path": "/media/standalone-doc.pdf", + "timestamp": 1771937125789, + "route": "preview", + "user": "user@example.com", + "status": 200 + }, + { + "path": "/media/standalone-graphic.svg", + "timestamp": 1771937128012, + "route": "preview", + "user": "user@example.com", + "status": 200 + } + ] +} +``` + +**Medialog Response:** +```json +{ + "entries": [ + { + "path": "https://main--repo--org.aem.live/media_j1k2l3/standalone-image.jpg", + "operation": "ingest", + "timestamp": 1771937124567, + "originalFilename": "/media/standalone-image.jpg", + "contentType": "image/jpeg", + "user": "user@example.com", + "mediaHash": "j1k2l3", + "owner": "2d0fcd52abc", + "repo": "2d0fcd52abc", + "width": "1920", + "height": "1080" + } + ] +} +``` + +**Learning:** +- Images: Appear in BOTH logs +- PDFs/SVGs: Auditlog ONLY (not on Media Bus) +- Standalone uploads have `originalFilename` + `owner` + `repo` +- NO `resourcePath` (not linked 
to page) + +--- + +### Scenario G: Page with Mixed Media + +**Action:** Created `/drafts/scenario-g.md` with: +- 2 embedded images +- 1 PDF preview link +- 1 SVG preview link +- 1 icon (`:headset:`) + +**Auditlog Response (page preview):** +```json +{ + "entries": [ + { + "path": "/drafts/scenario-g", + "timestamp": 1771937500000, + "route": "preview", + "user": "user@example.com", + "status": 200 + } + ] +} +``` + +**Auditlog Response (when user clicks PDF/SVG links):** +```json +{ + "entries": [ + { + "path": "/media/standalone-doc.pdf", + "timestamp": 1771937510000, + "route": "preview", + "user": "user@example.com", + "status": 200 + }, + { + "path": "/media/standalone-graphic.svg", + "timestamp": 1771937515000, + "route": "preview", + "user": "user@example.com", + "status": 200 + } + ] +} +``` + +**Medialog Response:** +```json +{ + "entries": [ + { + "path": "https://main--repo--org.aem.live/media_m4n5o6/image1.jpg", + "operation": "ingest", + "timestamp": 1771937501500, + "resourcePath": "/drafts/scenario-g.md", + "contentType": "image/jpeg", + "user": "user@example.com", + "mediaHash": "m4n5o6", + "width": "1800", + "height": "1200" + }, + { + "path": "https://main--repo--org.aem.live/media_p7q8r9/image2.jpg", + "operation": "ingest", + "timestamp": 1771937501500, + "resourcePath": "/drafts/scenario-g.md", + "contentType": "image/jpeg", + "user": "user@example.com", + "mediaHash": "p7q8r9", + "width": "2000", + "height": "1500" + } + ] +} +``` + +**Learning:** +- Only embedded images tracked in medialog +- Icons: Not tracked +- PDF/SVG links: Create separate auditlog entries when clicked (unrelated timestamps) +- **Cannot determine page→PDF/SVG relationships from logs** +- **Parsing required** to find which pages reference PDFs/SVGs/Fragments + +--- + +### Re-Preview Test: Duplicate Events + +**Action:** Re-previewed scenario-a, scenario-b, scenario-g without changes + +**Auditlog Response:** +```json +{ + "entries": [ + { + "path": "/drafts/scenario-b", + 
"timestamp": 1771938338331, + "route": "preview", + "user": "user@example.com", + "status": 200 + }, + { + "path": "/drafts/scenario-a", + "timestamp": 1771938338335, + "route": "preview", + "user": "user@example.com", + "status": 200 + }, + { + "path": "/drafts/scenario-g", + "timestamp": 1771938338340, + "route": "preview", + "user": "user@example.com", + "status": 200 + } + ] +} +``` + +**Medialog Response:** +```json +{ + "entries": [ + { + "path": "https://main--repo--org.aem.live/media_m4n5o6/image1.jpg", + "operation": "reuse", + "timestamp": 1771938339903, + "resourcePath": "/drafts/scenario-g.md", + "contentType": "image/jpeg", + "user": "user@example.com", + "mediaHash": "m4n5o6" + }, + { + "path": "https://main--repo--org.aem.live/media_p7q8r9/image2.jpg", + "operation": "reuse", + "timestamp": 1771938339903, + "resourcePath": "/drafts/scenario-g.md", + "contentType": "image/jpeg", + "user": "user@example.com", + "mediaHash": "p7q8r9" + }, + { + "path": "https://main--repo--org.aem.live/media_a1b2c3/image1.jpg", + "operation": "reuse", + "timestamp": 1771938340350, + "resourcePath": "/drafts/scenario-a.md", + "contentType": "image/jpeg", + "user": "user@example.com", + "mediaHash": "a1b2c3" + }, + { + "path": "https://main--repo--org.aem.live/media_d4e5f6/image2.jpg", + "operation": "reuse", + "timestamp": 1771938340350, + "resourcePath": "/drafts/scenario-a.md", + "contentType": "image/jpeg", + "user": "user@example.com", + "mediaHash": "d4e5f6" + }, + { + "path": "https://main--repo--org.aem.live/media_g7h8i9/image3.jpg", + "operation": "reuse", + "timestamp": 1771938340350, + "resourcePath": "/drafts/scenario-a.md", + "contentType": "image/jpeg", + "user": "user@example.com", + "mediaHash": "g7h8i9" + } + ] +} +``` + +**Learning:** +- Every preview creates new events (even without changes) +- `operation: "reuse"` indicates media already exists +- Cannot rely on "new events = new content" +- Must compare current state vs previous state + +--- + +## 
Parsing Strategy for Linked Content + +### Why Parsing is Needed + +**Media Bus items** (images/videos) have `resourcePath` in medialog that directly links them to pages. **Content delivery items** (PDFs/SVGs/Fragments) do NOT have this linking - you must parse page HTML to find references. + +### Current Implementation + +The codebase already parses for fragments: + +```javascript +// From media-library.js (existing code) +const [fragmentLogEntries, pageLogEntries] = await Promise.all([ + fetchFragments(this.org, this.repo, 'main', since), + fetchPages(this.org, this.repo, 'main', since), +]); + +// Parse pages to build fragment usage map +const usageMap = await buildFragmentUsageMap(pageLogEntries, this.sitePath); + +const mergedData = mergeFragmentEntries( + fragmentsData, + fragmentLogEntries, + this.org, + this.repo, + usageMap // Usage map from parsing +); +``` + +### Unified Parsing Approach + +Extend the existing fragment parsing to include PDFs and SVGs: + +```javascript +async function buildContentUsageMap(pageLogEntries, org, repo) { + const usageMap = { + fragments: new Map(), // fragment path -> [page paths] + pdfs: new Map(), // pdf path -> [page paths] + svgs: new Map(), // svg path -> [page paths] + }; + + for (const pageEvent of pageLogEntries) { + // Fetch page HTML (single fetch per page) + const html = await fetchPageHtml(pageEvent.path, org, repo); + + // Extract all content types in one pass + const fragments = extractFragmentReferences(html); + const pdfs = extractLinks(html, /\.pdf$/); + const svgs = extractLinks(html, /\.svg$/); + + // Build usage maps + fragments.forEach(f => { + if (!usageMap.fragments.has(f)) { + usageMap.fragments.set(f, []); + } + usageMap.fragments.get(f).push(pageEvent.path); + }); + + pdfs.forEach(p => { + if (!usageMap.pdfs.has(p)) { + usageMap.pdfs.set(p, []); + } + usageMap.pdfs.get(p).push(pageEvent.path); + }); + + svgs.forEach(s => { + if (!usageMap.svgs.has(s)) { + usageMap.svgs.set(s, []); + } + 
usageMap.svgs.get(s).push(pageEvent.path);
+    });
+  }
+
+  return usageMap;
+}
+```
+
+### Extraction Helper Functions
+
+```javascript
+function extractFragmentReferences(html) {
+  // Fragments typically appear in href attributes
+  const fragmentPattern = /href="([^"]*\/fragments\/[^"]+)"/g;
+  const matches = [...html.matchAll(fragmentPattern)];
+  return matches.map(m => m[1]).map(normalizePath);
+}
+
+function extractLinks(html, pattern) {
+  // Extract href or src attributes matching pattern.
+  // Strip a trailing "$" anchor from the pattern first: the built regex
+  // still requires a closing quote after it, so an end-of-string anchor
+  // there would prevent any match (callers pass patterns like /\.pdf$/).
+  const source = pattern.source.replace(/\$$/, '');
+  const linkPattern = new RegExp(`(?:href|src)="([^"]*${source})"`, 'gi');
+  const matches = [...html.matchAll(linkPattern)];
+  return matches.map(m => m[1]).map(normalizePath);
+}
+
+function normalizePath(path) {
+  // Remove query params, hashes
+  return path.split('?')[0].split('#')[0];
+}
+```
+
+### Performance Optimization
+
+**Single-pass parsing:**
+```javascript
+// GOOD: Fetch once, extract all
+const html = await fetchPageHtml(page);
+const allContent = {
+  fragments: extractFragments(html),
+  pdfs: extractPdfs(html),
+  svgs: extractSvgs(html),
+};
+
+// BAD: Multiple fetches
+const fragments = extractFragments(await fetchPageHtml(page));
+const pdfs = extractPdfs(await fetchPageHtml(page)); // Duplicate fetch!
+```
+
+### When to Parse
+
+**Parse frequency:**
+- **Initial build:** Parse all pages to establish complete usage map
+- **Incremental update:** Parse only pages with new auditlog events
+- **Validation:** Periodic full re-parse (weekly/monthly) to catch any drift
+
+---
+
+## Operational Architecture
+
+### Overview
+
+Two-tier approach: historical data backfill (Tier 1) + ongoing incremental updates (Tier 2). Separates concerns, handles large sites efficiently, no server infrastructure for ongoing operations.
+ +### Architecture Diagram + +``` +Historical Data (2023-2026) Live Data (2026+) +┌──────────────────────┐ ┌────────────────────┐ +│ Status API │ │ Auditlog API │ +│ (all pages) │ │ Medialog API │ +└──────────┬───────────┘ └─────────┬──────────┘ + │ │ + v v + ┌──────────────┐ ┌──────────────┐ + │ Tier 1: │ │ Incremental │ + │ Medialog │ │ Refresh │ + │ Backfill CLI │ │ (10-min) │ + └──────┬───────┘ └──────┬───────┘ + │ │ + v │ + ┌──────────────┐ │ + │ Medialog API │ │ + │ (populated) │ │ + └──────┬───────┘ │ + │ │ + └─────────────┬────────────────────┘ + v + ┌──────────────┐ + │ Tier 2: │ + │ Index │ + │ Population │ + └──────┬───────┘ + v + ┌──────────────┐ + │ .da/ │ + │ mediaindex/ │ + │ index.json │ + └──────────────┘ +``` + +--- + +### Tier 1: Medialog Backfill (One-Time) + +**Purpose:** Populate medialog API with historical data for sites created before medialog existed + +**Implementation:** CLI tool at `/media-log-ingestor` + +**Process:** +1. Engineer runs CLI tool with org/repo credentials +2. Tool fetches all pages via Status API +3. Parses markdown content to extract media references +4. Sends entries to Medialog API in batches +5. Deduplicates based on media hash (first = ingest, subsequent = reuse) +6. 
Enriches with user information from preview logs + +**Command:** +```bash +logmedia ingest --org myorg --repo myrepo --token +``` + +**Characteristics:** +- One-time operation per repository +- Takes 5-30 minutes depending on site size +- Handles rate limits (10 req/sec) +- Resumable on failure +- Creates historical medialog entries with "ingest" operations + +**Output:** +- Medialog API populated with historical media references +- All pages analyzed, media tracked back to 2023 +- Ready for Tier 2 index building + +--- + +### Tier 2: Index Population & Refresh + +#### Initial Population (One-Time per Site) + +**Purpose:** Build complete media index from medialog + auditlog data + +**Implementation:** Separate DA app at `/tools/media-indexer` (to be created) + +**Process:** +1. Engineer navigates to `/tools/media-indexer` +2. Clicks "Build Initial Index" button +3. Server-side process: + - Fetches all medialog entries (from Tier 1 backfill) + - Fetches all auditlog entries (last 90 days) + - Processes and combines data + - Parses pages for PDFs/SVGs/Fragments + - Deduplicates and sorts + - Writes to `.da/mediaindex/index.json` +4. Displays progress (X of Y pages processed) +5. Completes in 30-60 seconds + +**Characteristics:** +- One-time per repository +- Server-side execution (handles large datasets) +- Shows progress indicator +- Atomic operation (succeeds or fails completely) +- Creates complete index with all historical + recent data + +**Output:** +``` +.da/mediaindex/ +├── index.json # Complete media index +├── medialog-meta.json # { lastFetchTime, entriesCount, lastRefreshBy } +└── lock.json # Distributed lock (initially unlocked) +``` + +--- + +#### Incremental Refresh (Ongoing) + +**Purpose:** Keep index up-to-date with new preview activity + +**Two Modes:** + +**1. 
User-Triggered Refresh** +- User clicks "Refresh" button in media library +- Acquires distributed lock +- Fetches logs since last update (incremental) +- Merges with existing index +- Updates UI immediately +- Takes 2-5 seconds + +**2. Background Auto-Refresh** +- Runs every 10 minutes from any open browser +- Checks if lock is available +- Checks if index is stale (> 5 minutes old) +- If both true, performs incremental refresh +- Silent operation, no UI disruption +- Dispatches event for UI refresh when complete + +--- + +### Distributed Locking Strategy + +**Problem:** Multiple users may have media library open simultaneously, each browser trying to refresh every 10 minutes. Without coordination, this creates race conditions and corrupts the index. + +**Solution:** Distributed lock using `.da/mediaindex/lock.json` + +#### Lock Structure + +```json +{ + "locked": true, + "lockedBy": "user@example.com|session-abc123", + "lockedAt": 1709567890000, + "operation": "auto-refresh", + "timeout": 300000 +} +``` + +#### Lock Behavior + +**Acquiring Lock:** +```javascript +1. Read current lock.json +2. If locked=false OR (now - lockedAt) > timeout: + - Write new lock with your identity + - Wait 500ms + - Re-read to verify (race condition check) + - If lockedBy matches yours, lock acquired + - Else, retry (max 3 attempts with 2s delay) +3. If locked by someone else: + - Return "lock held by X" +``` + +**Releasing Lock:** +```javascript +1. Read current lock.json +2. If lockedBy matches your identity: + - Write { locked: false, releasedAt: now } +3. 
Else, skip (don't own lock) +``` + +**Lock Timeout:** +- Default: 5 minutes (300000ms) +- After 5 minutes, lock considered expired +- Next process treats expired lock as unlocked +- Handles browser crashes gracefully + +**Priority:** +- User-triggered refresh > Background refresh +- User clicks button: Attempts lock immediately +- Background timer: Checks lock first, skips if held + +--- + +### File Structure & Metadata + +#### .da/mediaindex/index.json + +Main index file containing processed media entries: + +```json +[ + { + "hash": "abc123", + "url": "https://main--repo--org.aem.live/media_abc123/image.jpg", + "name": "image.jpg", + "page": "/drafts/my-page.md", + "timestamp": 1709567890000, + "user": "user@example.com", + "operation": "ingest", + "type": "img > jpg", + "source": "medialog" + }, + { + "path": "/media/doc.pdf", + "usedIn": ["/drafts/page1.md", "/drafts/page2.md"], + "timestamp": 1709567890000, + "user": "user@example.com", + "type": "document > pdf", + "status": "referenced", + "source": "auditlog-parsed" + } +] +``` + +#### .da/mediaindex/medialog-meta.json + +Metadata tracking last refresh: + +```json +{ + "lastFetchTime": 1709567890000, + "entriesCount": 1523, + "lastRefreshBy": "user@example.com" +} +``` + +#### .da/mediaindex/lock.json + +Distributed lock state: + +```json +{ + "locked": false, + "lockedBy": null, + "lockedAt": null, + "operation": null, + "releasedAt": 1709567890000 +} +``` + +--- + +### Operational Procedures + +#### Initial Setup (One-Time per Repository) + +**Step 1: Backfill Medialog (if site existed before 2026)** + +```bash +# Clone backfill tool +git clone +cd media-log-ingestor + +# Install dependencies +npm install + +# Get authentication token +npm run token # Shows instructions + +# Run backfill +npm run ingest -- --org myorg --repo myrepo --token + +# Wait for completion (5-30 minutes) +``` + +**Step 2: Build Initial Index** + +``` +1. Navigate to: https://main--repo--org.aem.live/tools/media-indexer +2. 
Click "Build Initial Index" +3. Wait for progress bar to complete (30-60 seconds) +4. Verify: "Index built successfully: 1523 entries" +``` + +**Step 3: Enable Auto-Refresh** + +``` +1. Open media library: https://main--repo--org.aem.live/tools/media +2. Auto-refresh starts automatically (10-minute interval) +3. Verify in console: "Background auto-refresh started" +``` + +--- + +#### Ongoing Operations + +**User-Triggered Refresh:** +- User clicks "Refresh" button +- Shows progress: "Fetching logs..." → "Processing..." → "Complete" +- Updates UI with latest media +- Frequency: As needed (typically when adding new content) + +**Background Auto-Refresh:** +- Runs silently every 10 minutes +- Logs to console: "Auto-refresh starting..." or "Index fresh, skipping" +- No user interaction required +- Handles multiple users gracefully via locking + +**Manual Unlock (Admin Only):** +- If lock stuck (rare), admin can force unlock +- Click "Force Unlock" button in index manager +- Confirms before releasing lock +- Use only when certain no other process is running + +--- + +#### Monitoring & Troubleshooting + +**Check Index Status:** + +```javascript +// In browser console +const status = await getIndexStatus(sitePath); +console.log(status); +// { +// lastRefresh: 1709567890000, +// entriesCount: 1523, +// isStale: false, +// locked: false +// } +``` + +**Common Issues:** + +| Issue | Symptom | Solution | +|-------|---------|----------| +| Index not updating | Old timestamps | Check lock status, force unlock if stuck | +| Missing recent media | Added but not shown | Trigger manual refresh | +| Duplicate entries | Same media appears multiple times | Full rebuild (weekly maintenance) | +| Lock timeout | "Cannot acquire lock" errors | Wait 5 minutes or force unlock | +| Parse failures | PDFs/SVGs not tracked | Check page accessibility, retry | + +**Maintenance Tasks:** + +- **Daily:** None (auto-refresh handles updates) +- **Weekly:** Review error logs, check for parse failures 
+- **Monthly:** Full index rebuild to eliminate drift +- **Quarterly:** Verify backfill integrity, re-run if needed + +--- + +### Performance & Scalability + +#### Incremental Refresh Performance + +| Site Size | Pages | Media | Refresh Time | +|-----------|-------|-------|--------------| +| Small | < 100 | < 500 | 1-2 seconds | +| Medium | 100-1000 | 500-5000 | 2-5 seconds | +| Large | 1000-10000 | 5000-50000 | 5-10 seconds | +| Very Large | > 10000 | > 50000 | 10-30 seconds | + +**Optimization Techniques:** +- Hash-based lookups (O(1) instead of O(N)) +- Group medialog by resourcePath (reduce iterations) +- Parse only changed pages (not entire site) +- Batch writes to DA (reduce API calls) +- Cache parsed HTML (avoid re-fetching) + +#### Lock Contention + +**Scenario:** 10 users have media library open + +- Each browser runs auto-refresh every 10 minutes +- On average, 1 lock attempt per minute across all users +- Lock held for 2-5 seconds during refresh +- Contention rate: < 10% (most attempts succeed) + +**Mitigation:** +- Lock timeout ensures stale locks don't block +- Retry logic with exponential backoff +- Background refresh skips if locked (no retry spam) +- User-triggered refresh has higher priority + +--- + +### Future Enhancements + +**Tier 1 Improvements:** +- Incremental medialog backfill (only new pages) +- Scheduled re-backfill for updated pages +- Webhook integration for real-time updates + +**Tier 2 Improvements:** +- Worker-based refresh (dedicated service vs browser-based) +- Streaming updates (websocket for live refresh) +- Index compression for very large sites +- Pagination for index loading + +**Locking Improvements:** +- Leader election (one browser becomes "leader" for all refreshes) +- Heartbeat mechanism (detect crashed processes faster) +- Lock priority queue (order competing processes) + +--- + +## Indexing Flowcharts + +### 1. 
Initial Index Build (First Pull) + +```mermaid +flowchart TD + Start([Start: First Pull / Backfill]) --> FetchLogs[Fetch ALL available logs:
- Auditlog
- Medialog
Note: Medialog is new, get all available] + FetchLogs --> FilterAudit{Filter auditlog
by file type} + + FilterAudit -->|.md files| ProcessPages[Group: Pages to process] + FilterAudit -->|.pdf, .svg, /fragments/| ProcessFiles[Group: Content delivery files] + + ProcessPages --> ParseContent[Parse page HTML:
- Extract fragment references
- Extract PDF links
- Extract SVG links] + + ParseContent --> BuildUsageMap[Build usage maps:
- fragments -> pages
- pdfs -> pages
- svgs -> pages] + + ProcessPages --> LoopPages{For each page} + LoopPages --> NormalizePath[Normalize: page -> page.md] + NormalizePath --> FindMedia[Find medialog entries WHERE:
- resourcePath = normalized path
- timestamp within 5s of audit timestamp] + + FindMedia --> HasMedia{Media found?} + + HasMedia -->|Yes| CreateRefs[Create index entries:
hash, page, timestamp, status='referenced'] + HasMedia -->|No| SkipPage[Page has no Media Bus items
May have PDFs/SVGs/fragments] + + CreateRefs --> MorePages{More pages?} + SkipPage --> MorePages + MorePages -->|Yes| LoopPages + MorePages -->|No| ProcessFiles + + ProcessFiles --> LoopFiles{For each file} + LoopFiles --> FileType{File type?} + + FileType -->|PDF/SVG/Fragment| CheckUsage[Check in usage map:
Referenced by any page?] + FileType -->|Image standalone| FindStandalone[Find in medialog:
originalFilename present?] + + CheckUsage -->|Referenced| CreateLinkedEntry[Create index entry:
path, usedIn pages, type, status='referenced'] + CheckUsage -->|Not referenced| CreateStandaloneFile[Create index entry:
path, type, status='file-unused'] + + FindStandalone -->|Found| CreateStandaloneEntry[Create index entry:
hash, originalFilename, status='uploaded-unused'] + FindStandalone -->|Not found| SkipFile[Skip: Not Media Bus item] + + CreateLinkedEntry --> MoreFiles{More files?} + CreateStandaloneFile --> MoreFiles + CreateStandaloneEntry --> MoreFiles + SkipFile --> MoreFiles + + MoreFiles -->|Yes| LoopFiles + MoreFiles -->|No| MergeUsage[Merge usage map into index] + + BuildUsageMap --> MergeUsage + MergeUsage --> SaveIndex[Save index to DA:
/.da/mediaindex/media.json] + + SaveIndex --> SaveMeta[Save metadata:
lastFetchTime, itemCount] + SaveMeta --> End([End: Index Built]) +``` + +--- + +### 2. Incremental Update Flow + +```mermaid +flowchart TD + Start([Start: Incremental Update]) --> LoadMeta[Load index metadata:
Get lastFetchTime] + LoadMeta --> FetchNew[Fetch NEW logs since lastFetchTime:
- Auditlog
- Medialog] + + FetchNew --> HasNewEvents{New events exist?} + HasNewEvents -->|No| EndEarly([End: No updates needed]) + HasNewEvents -->|Yes| LoadIndex[Load existing index from DA] + + LoadIndex --> ParseNewPages[Parse newly previewed pages:
Update usage maps for:
- Fragments
- PDFs
- SVGs] + + ParseNewPages --> LoopNewPages{For each page
in new auditlog} + + LoopNewPages --> NormalizePath[Normalize: page -> page.md] + NormalizePath --> FindNewMedia[Find NEW medialog entries:
- resourcePath = normalized path
- timestamp within 5s of audit timestamp] + + FindNewMedia --> LoadOldState[Load OLD index entries
for this page] + + LoadOldState --> CompareState{Compare:
Old vs New} + + CompareState --> ExtractHashes[Extract:
- oldHashes from index
- newHashes from medialog] + + ExtractHashes --> CheckChanges{Content
changed?} + + CheckChanges -->|newHashes empty| CheckAmbiguous{Old state
had media?} + CheckChanges -->|newHashes exist| CompareHashes[Compare hash sets] + + CheckAmbiguous -->|Yes| AmbiguousCase[Ambiguous: May have PDFs/SVGs
or all removed or processing delay] + CheckAmbiguous -->|No| StillText[Still text-only
No action needed] + + AmbiguousCase --> VerifyParse{Parse to verify?} + VerifyParse -->|Yes| QuickParse[Quick parse: Check for img tags] + VerifyParse -->|No| AssumeRemoved[Assume removed
Mark as medium confidence] + + QuickParse --> HasImages{Images found?} + HasImages -->|No| ConfirmedRemoved[Confirmed: All removed] + HasImages -->|Yes| DataInconsistency[Data inconsistency
Flag for investigation] + + ConfirmedRemoved --> MarkUnreferenced[DELETE or UPDATE all old entries:
status = 'unreferenced'] + AssumeRemoved --> MarkUnreferenced + + CompareHashes --> FindAdded[Added = newHashes NOT IN oldHashes] + CompareHashes --> FindRemoved[Removed = oldHashes NOT IN newHashes] + CompareHashes --> FindUnchanged[Unchanged = intersection] + + FindAdded --> HasAdded{Additions?} + HasAdded -->|Yes| InsertNew[INSERT new index entries:
hash, page, timestamp, status='referenced'] + HasAdded -->|No| CheckRemoved + + FindRemoved --> CheckRemoved{Removals?} + CheckRemoved -->|Yes| DeleteOld[DELETE or UPDATE removed entries:
status = 'unreferenced'] + CheckRemoved -->|No| CheckUnchanged + + FindUnchanged --> CheckUnchanged{Unchanged?} + CheckUnchanged -->|Yes| UpdateTimestamp[UPDATE timestamp only
for unchanged entries] + CheckUnchanged -->|No| NextPage + + InsertNew --> NextPage + DeleteOld --> NextPage + UpdateTimestamp --> NextPage + MarkUnreferenced --> NextPage + StillText --> NextPage + DataInconsistency --> NextPage + + NextPage{More pages?} + NextPage -->|Yes| LoopNewPages + NextPage -->|No| UpdateLinkedContent[Update linked content from usage maps:
- Add new fragment references
- Remove old references
- Update PDF/SVG usage] + + UpdateLinkedContent --> SaveUpdated[Save updated index to DA] + + SaveUpdated --> UpdateMeta[Update metadata:
lastFetchTime = now
itemCount = index.length] + + UpdateMeta --> End([End: Index Updated]) +``` + +--- + +### 3. Page State Detection Logic + +```mermaid +flowchart TD + Start([Page Preview Event]) --> GetAudit[Auditlog entry:
page X at time T] + + GetAudit --> SearchMedia[Search medialog for entries:
- resourcePath = X.md
- timestamp in T, T+5000ms] + + SearchMedia --> MediaCount{Count of
media entries} + + MediaCount -->|0 entries| CheckHistory1{Check index:
Page existed before?} + MediaCount -->|N entries| HasMedia[Scenario: Page has N Media Bus items
May also have PDFs/SVGs/fragments] + + CheckHistory1 -->|No| NewTextPage[New text-only page
OR page with only PDFs/SVGs/fragments
Action: Check parsed content] + CheckHistory1 -->|Yes, had media| Ambiguous[Ambiguous scenario:
- All Media Bus items removed?
- Or page has PDFs/SVGs only?
- Or processing delay?] + CheckHistory1 -->|Yes, no media| StillText[Still text-only or non-Media Bus
Action: Update timestamp only] + + Ambiguous --> DecideParse{Parse to verify?} + DecideParse -->|Yes| ParseCheck[Parse HTML for img tags] + DecideParse -->|No| AssumeRemoved[Assume removed
Medium confidence] + + ParseCheck --> ImagesFound{Images in HTML?} + ImagesFound -->|No| ConfirmRemoved[Confirmed: All removed
Action: Mark old entries unreferenced] + ImagesFound -->|Yes| Inconsistent[Inconsistency detected
Action: Flag for investigation] + + HasMedia --> GroupByTimestamp[Group media by timestamp:
All should have same timestamp] + + GroupByTimestamp --> ExtractHashes[Extract: List of mediaHashes] + + ExtractHashes --> CheckPrevious{Check index:
Page existed before?} + + CheckPrevious -->|No| NewPage[New page with media
Action: Create all entries] + CheckPrevious -->|Yes| CompareHashes[Compare old vs new hashes] + + CompareHashes --> DiffResult{Difference?} + + DiffResult -->|Same hashes| NoChange[No content change
Action: Update timestamps] + DiffResult -->|Added hashes| MediaAdded[Media added
Action: Insert new entries] + DiffResult -->|Removed hashes| SomeRemoved[Media removed
Action: Delete/flag entries] + DiffResult -->|Both added & removed| MediaChanged[Media changed
Action: Insert + Delete] + + NewTextPage --> End([End]) + ConfirmRemoved --> End + AssumeRemoved --> End + Inconsistent --> End + StillText --> End + NewPage --> End + NoChange --> End + MediaAdded --> End + SomeRemoved --> End + MediaChanged --> End +``` + +--- + +### 4. Medialog Entry Classification + +```mermaid +flowchart TD + Start([Medialog Entry]) --> CheckOperation{Check:
operation field} + + CheckOperation -->|"ingest"| CheckPath{Has
resourcePath?} + CheckOperation -->|"reuse"| ReuseCase[Reuse Operation] + CheckOperation -->|"delete"| DeleteCase[Delete Operation
Future: TBD by API team] + + CheckPath -->|Yes| IngestInPage[Ingest via Markup
New media added to page] + CheckPath -->|No| CheckOriginal{Has
originalFilename?} + + CheckOriginal -->|Yes| StandaloneUpload[Standalone Upload
Media previewed alone] + CheckOriginal -->|No| AnomalyCase[Anomaly: No path, no filename
Should not occur] + + IngestInPage --> ExtractName1[Extract name from URL path:
media_hash.jpg] + IngestInPage --> UseResource1[Use resourcePath for doc field] + IngestInPage --> NoOwner1[NO owner/repo fields] + + StandaloneUpload --> ExtractName2[Extract name from originalFilename:
clay-banks-cabin.jpg] + StandaloneUpload --> NoDoc2[Doc field = empty/null] + StandaloneUpload --> HasOwner2[HAS owner/repo fields] + + ReuseCase --> ExtractName3[Extract name from URL path:
media_hash.jpg] + ReuseCase --> UseResource3[Use resourcePath for doc field] + ReuseCase --> NoOwner3[NO owner/repo fields] + + DeleteCase --> WaitSpec[Wait for API spec confirmation] + + ExtractName1 --> CreateIndex1[Create index entry:
source='medialog-ingest-page'] + UseResource1 --> CreateIndex1 + NoOwner1 --> CreateIndex1 + + ExtractName2 --> CreateIndex2[Create index entry:
source='medialog-ingest-standalone'] + NoDoc2 --> CreateIndex2 + HasOwner2 --> CreateIndex2 + + ExtractName3 --> CreateIndex3[Create index entry:
source='medialog-reuse'] + UseResource3 --> CreateIndex3 + NoOwner3 --> CreateIndex3 + + CreateIndex1 --> End([Process Complete]) + CreateIndex2 --> End + CreateIndex3 --> End + AnomalyCase --> End + WaitSpec --> End +``` + +--- + +## Decision Tables + +### Table 1: Matching Auditlog to Medialog + +| Auditlog Entry | Expected Medialog | Action | +|----------------|-------------------|--------| +| Page preview @ T | N entries with resourcePath=page.md, timestamp in [T, T+5000] | Link entries to page, extract media list | +| Page preview @ T | 0 entries matching | Page is text-only OR all media removed OR has only PDFs/SVGs/fragments | +| PDF/SVG/Fragment preview @ T | 0 entries | Expected - Not on Media Bus | +| Image preview @ T | 1 entry with originalFilename=image.jpg | Standalone image upload | + +### Table 2: Index Update Actions + +| Old Index State | New Medialog State | Action | Index Update | +|-----------------|-------------------|--------|--------------| +| Page not in index | Medialog has N hashes | New page | INSERT N entries with status='referenced' | +| Page has [A,B] | Medialog has [A,B] | No change | UPDATE timestamps only | +| Page has [A,B] | Medialog has [A,B,C] | Media added | INSERT entry for C | +| Page has [A,B,C] | Medialog has [A,B] | Media removed | DELETE or FLAG entry for C as 'unreferenced' | +| Page has [A,B] | Medialog empty | Ambiguous | Parse to verify or assume all removed | +| Page has [A,B] | Medialog has [C,D] | Complete change | DELETE [A,B], INSERT [C,D] | + +### Table 3: Processing Optimization + +| Condition | Optimization | Benefit | +|-----------|--------------|---------| +| Event timestamp < lastFetchTime | Skip event | Avoid reprocessing old data | +| Same page, multiple events in batch | Process only latest | Reduce redundant work | +| No changes detected in comparison | Skip write operation | Reduce DA API calls | +| medialog entries have same timestamp | Batch process as single page state | Improve efficiency | +| 
Parsing multiple content types | Single fetch, extract all | Minimize network calls | + +### Table 4: Content Type Decision Matrix + +| Content Type | Found In | Requires Parsing | Usage Tracking Method | +|-------------|----------|------------------|----------------------| +| Images | Medialog | No | resourcePath field provides direct link | +| Videos | Medialog | No | resourcePath field provides direct link | +| PDFs | Auditlog only | Yes | Parse page HTML for PDF links | +| SVGs | Auditlog only | Yes | Parse page HTML for SVG links | +| Fragments | Auditlog only | Yes | Parse page HTML for fragment references | +| Icons | Not tracked | N/A | Not included in index | + +--- + +## Implementation Pseudocode + +### Initial Build Algorithm + +```javascript +async function buildInitialIndex(org, repo, ref = 'main') { + const index = []; + + // 1. Fetch all available logs (medialog is new, backfill what exists) + const auditLogEntries = await fetchAuditLog(org, repo, ref, since=null); + const mediaLogEntries = await fetchMediaLog(org, repo, ref, since=null); + + // 2. Separate pages from files + const pages = auditLogEntries.filter(e => isPage(e.path)); + const files = auditLogEntries.filter(e => !pages.includes(e)); + + // 3. Parse pages to build usage maps for linked content + const usageMap = await buildContentUsageMap(pages, org, repo); + + // 4. 
Process each page + for (const pageEvent of pages) { + const normalizedPath = normalizePath(pageEvent.path); // Add .md if needed + + // Find matching medialog entries within 5-second window + const pageMedia = mediaLogEntries.filter(m => + m.resourcePath === normalizedPath && + m.timestamp >= pageEvent.timestamp && + m.timestamp < pageEvent.timestamp + 5000 + ); + + // Create index entries for Media Bus items + for (const media of pageMedia) { + index.push({ + hash: media.mediaHash, + page: normalizedPath, + url: media.path, + name: extractName(media), + timestamp: media.timestamp, + user: media.user, + operation: media.operation, + status: 'referenced', + source: 'medialog', + type: 'image' // or 'video' + }); + } + } + + // 5. Process standalone files and linked content + for (const fileEvent of files) { + const filePath = fileEvent.path; + + if (isPdfOrSvg(filePath)) { + // Check if referenced by any page + const linkedPages = usageMap.pdfs.get(filePath) || usageMap.svgs.get(filePath) || []; + + index.push({ + path: filePath, + usedIn: linkedPages, + timestamp: fileEvent.timestamp, + user: fileEvent.user, + type: getFileType(filePath), + status: linkedPages.length > 0 ? 'referenced' : 'file-unused', + source: 'auditlog-parsed' + }); + + } else if (isFragment(filePath)) { + // Check if referenced by any page + const linkedPages = usageMap.fragments.get(filePath) || []; + + index.push({ + path: filePath, + usedIn: linkedPages, + timestamp: fileEvent.timestamp, + user: fileEvent.user, + type: 'fragment', + status: linkedPages.length > 0 ? 
'referenced' : 'file-unused', + source: 'auditlog-parsed' + }); + + } else if (isImage(filePath)) { + // Check if in medialog (standalone upload) + const mediaEntry = mediaLogEntries.find(m => + m.originalFilename === filePath && + Math.abs(m.timestamp - fileEvent.timestamp) < 5000 + ); + + if (mediaEntry) { + index.push({ + hash: mediaEntry.mediaHash, + url: mediaEntry.path, + name: extractFromOriginalFilename(mediaEntry.originalFilename), + originalFilename: mediaEntry.originalFilename, + timestamp: mediaEntry.timestamp, + user: mediaEntry.user, + status: 'uploaded-unused', + source: 'medialog', + type: 'image' + }); + } + } + } + + // 6. Save index + await saveMediaSheet(index, `/${org}/${repo}`); + await saveLogMeta(`/${org}/${repo}`, { + lastFetchTime: Date.now(), + processedItems: index.length + }); + + return index; +} +``` + +### Incremental Update Algorithm + +```javascript +async function updateIndex(org, repo, ref = 'main') { + // 1. Load existing state + const meta = await loadLogMeta(`/${org}/${repo}`); + const existingIndex = await loadMediaSheet(`/${org}/${repo}`); + const lastFetchTime = meta?.lastFetchTime || null; + + // 2. Fetch new events since last update + const newAuditLog = await fetchAuditLog(org, repo, ref, since=lastFetchTime); + const newMediaLog = await fetchMediaLog(org, repo, ref, since=lastFetchTime); + + if (newAuditLog.length === 0 && newMediaLog.length === 0) { + return existingIndex; // No updates needed + } + + // 3. Parse newly previewed pages + const newPages = newAuditLog.filter(e => isPage(e.path)); + const newUsageMap = await buildContentUsageMap(newPages, org, repo); + + // 4. 
Process each new page event + const updatedIndex = [...existingIndex]; + + for (const pageEvent of newPages) { + const normalizedPath = normalizePath(pageEvent.path); + + // Find new medialog entries for this page + const newPageMedia = newMediaLog.filter(m => + m.resourcePath === normalizedPath && + m.timestamp >= pageEvent.timestamp && + m.timestamp < pageEvent.timestamp + 5000 + ); + + // Get old state from index + const oldPageEntries = existingIndex.filter(e => + e.page === normalizedPath && e.source === 'medialog' + ); + const oldHashes = new Set(oldPageEntries.map(e => e.hash)); + const newHashes = new Set(newPageMedia.map(m => m.mediaHash)); + + // Handle ambiguous case: no medialog entries + if (newPageMedia.length === 0 && oldPageEntries.length > 0) { + // Option A: Parse to verify (recommended for high-value pages) + const shouldVerify = oldPageEntries.length >= 5 || isImportantPage(normalizedPath); + + if (shouldVerify) { + const hasImages = await quickParseForImages(pageEvent.path, org, repo); + + if (hasImages === false) { + // Confirmed: All removed + for (const oldEntry of oldPageEntries) { + const idx = updatedIndex.indexOf(oldEntry); + if (idx !== -1) updatedIndex.splice(idx, 1); + } + } else if (hasImages === true) { + // Data inconsistency - log for investigation + console.warn('Data inconsistency detected:', normalizedPath); + } + } else { + // Option B: Trust logs, assume removed + for (const oldEntry of oldPageEntries) { + const idx = updatedIndex.indexOf(oldEntry); + if (idx !== -1) updatedIndex.splice(idx, 1); + } + } + + continue; + } + + // Detect changes + const added = [...newHashes].filter(h => !oldHashes.has(h)); + const removed = [...oldHashes].filter(h => !newHashes.has(h)); + const unchanged = [...newHashes].filter(h => oldHashes.has(h)); + + // Apply changes + + // 1. 
Remove deleted media + for (const hash of removed) { + const idx = updatedIndex.findIndex(e => + e.hash === hash && e.page === normalizedPath + ); + if (idx !== -1) { + updatedIndex.splice(idx, 1); + } + } + + // 2. Add new media + for (const hash of added) { + const mediaEntry = newPageMedia.find(m => m.mediaHash === hash); + updatedIndex.push({ + hash: mediaEntry.mediaHash, + page: normalizedPath, + url: mediaEntry.path, + name: extractName(mediaEntry), + timestamp: mediaEntry.timestamp, + user: mediaEntry.user, + operation: mediaEntry.operation, + status: 'referenced', + source: 'medialog', + type: 'image' + }); + } + + // 3. Update timestamps for unchanged media + for (const hash of unchanged) { + const idx = updatedIndex.findIndex(e => + e.hash === hash && e.page === normalizedPath + ); + if (idx !== -1) { + const mediaEntry = newPageMedia.find(m => m.mediaHash === hash); + updatedIndex[idx].timestamp = mediaEntry.timestamp; + } + } + } + + // 5. Update linked content (PDFs, SVGs, Fragments) from usage map + for (const fileEvent of newAuditLog.filter(e => !isPage(e.path))) { + const filePath = fileEvent.path; + + if (isPdfOrSvg(filePath) || isFragment(filePath)) { + const usageKey = isPdf(filePath) ? 'pdfs' : + isSvg(filePath) ? 'svgs' : 'fragments'; + const linkedPages = newUsageMap[usageKey].get(filePath) || []; + + // Update or create entry + const existingIdx = updatedIndex.findIndex(e => e.path === filePath); + + if (existingIdx !== -1) { + // Update existing entry + updatedIndex[existingIdx].usedIn = linkedPages; + updatedIndex[existingIdx].timestamp = fileEvent.timestamp; + updatedIndex[existingIdx].status = linkedPages.length > 0 ? 'referenced' : 'file-unused'; + } else { + // Create new entry + updatedIndex.push({ + path: filePath, + usedIn: linkedPages, + timestamp: fileEvent.timestamp, + user: fileEvent.user, + type: getFileType(filePath), + status: linkedPages.length > 0 ? 
'referenced' : 'file-unused', + source: 'auditlog-parsed' + }); + } + } + } + + // 6. Save updated index + await saveMediaSheet(updatedIndex, `/${org}/${repo}`); + await saveLogMeta(`/${org}/${repo}`, { + lastFetchTime: Date.now(), + processedItems: updatedIndex.length + }); + + return updatedIndex; +} +``` + +### Content Usage Map Builder + +```javascript +async function buildContentUsageMap(pageLogEntries, org, repo) { + const usageMap = { + fragments: new Map(), + pdfs: new Map(), + svgs: new Map(), + }; + + for (const pageEvent of pageLogEntries) { + try { + // Fetch page HTML + const html = await fetchPageHtml(pageEvent.path, org, repo); + + // Extract all content types in single pass + const fragments = extractFragmentReferences(html); + const pdfs = extractLinks(html, /\.pdf$/); + const svgs = extractLinks(html, /\.svg$/); + + const normalizedPage = normalizePath(pageEvent.path); + + // Build usage maps + fragments.forEach(f => { + if (!usageMap.fragments.has(f)) { + usageMap.fragments.set(f, []); + } + if (!usageMap.fragments.get(f).includes(normalizedPage)) { + usageMap.fragments.get(f).push(normalizedPage); + } + }); + + pdfs.forEach(p => { + if (!usageMap.pdfs.has(p)) { + usageMap.pdfs.set(p, []); + } + if (!usageMap.pdfs.get(p).includes(normalizedPage)) { + usageMap.pdfs.get(p).push(normalizedPage); + } + }); + + svgs.forEach(s => { + if (!usageMap.svgs.has(s)) { + usageMap.svgs.set(s, []); + } + if (!usageMap.svgs.get(s).includes(normalizedPage)) { + usageMap.svgs.get(s).push(normalizedPage); + } + }); + + } catch (error) { + console.error(`Failed to parse page ${pageEvent.path}:`, error); + // Continue with other pages + } + } + + return usageMap; +} + +async function fetchPageHtml(pagePath, org, repo, ref = 'main') { + const url = `https://${ref}--${repo}--${org}.aem.page${pagePath}`; + const response = await fetch(url); + if (!response.ok) { + throw new Error(`Failed to fetch ${url}: ${response.status}`); + } + return response.text(); +} + +function 
extractFragmentReferences(html) {
+  const fragmentPattern = /href="([^"]*\/fragments\/[^"]+)"/g;
+  const matches = [...html.matchAll(fragmentPattern)];
+  return matches.map(m => normalizePath(m[1]));
+}
+
+function extractLinks(html, pattern) {
+  const linkPattern = new RegExp(`(?:href|src)="([^"]*${pattern.source})"`, 'gi');
+  const matches = [...html.matchAll(linkPattern)];
+  return matches.map(m => normalizePath(m[1]));
+}
+
+async function quickParseForImages(pagePath, org, repo, ref = 'main') {
+  try {
+    const html = await fetchPageHtml(pagePath, org, repo, ref);
+    // Simple check: Does it contain <img> tags with media_ URLs?
+    return html.includes('media_') && html.includes('<img');
+  } catch (error) {
+    return null; // Could not verify (fetch/parse failure)
+  }
+}
+```
+
+### Helper Functions
+
+```javascript
+function normalizePath(path) {
+  // Strip query params and anchors
+  let cleanPath = path.split('?')[0].split('#')[0];
+
+  // Add .md extension for extensionless pages: /drafts/page -> /drafts/page.md
+  if (!cleanPath.includes('.') && !cleanPath.startsWith('/media/')) {
+    cleanPath = `${cleanPath}.md`;
+  }
+
+  return cleanPath;
+}
+
+function extractName(mediaEntry) {
+  // For "ingest" with originalFilename
+  if (mediaEntry.operation === 'ingest' && mediaEntry.originalFilename) {
+    return mediaEntry.originalFilename.split('/').pop();
+  }
+
+  // For "reuse" or "ingest" without originalFilename
+  const cleanPath = mediaEntry.path.split('?')[0].split('#')[0];
+  return cleanPath.split('/').pop();
+}
+
+function isPage(path) {
+  return (path.endsWith('.md') ||
+          (!path.includes('.') && !path.startsWith('/media/'))) &&
+         !path.includes('/fragments/');
+}
+
+function isPdfOrSvg(path) {
+  return path.endsWith('.pdf') || path.endsWith('.svg');
+}
+
+function isPdf(path) {
+  return path.endsWith('.pdf');
+}
+
+function isSvg(path) {
+  return path.endsWith('.svg');
+}
+
+function isFragment(path) {
+  return path.includes('/fragments/');
+}
+
+function isImage(path) {
+  const imageExts = ['.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp'];
+  return imageExts.some(ext => path.toLowerCase().endsWith(ext));
+}
+
+function getFileType(path) {
+  if (path.endsWith('.pdf')) return 'pdf';
+  if (path.endsWith('.svg')) return 'svg';
+  if (path.includes('/fragments/')) return 'fragment';
+  if 
(isImage(path)) return 'image'; + return 'unknown'; +} +``` + +--- + +## Edge Cases & Handling + +### 1. Missing Auditlog Entry for Medialog Events + +**Scenario:** Medialog has entries but no matching auditlog entry + +**Causes:** +- Processing delay (auditlog slower than medialog) +- Auditlog API failure +- Events outside fetched time range + +**Handling:** +```javascript +// Queue orphaned medialog entries for next processing cycle +const orphanedMedia = mediaLog.filter(m => { + return !auditLog.some(a => + normalizePath(a.path) === m.resourcePath && + Math.abs(a.timestamp - m.timestamp) < 10000 + ); +}); + +// Retry on next incremental update with wider time range +``` + +### 2. Timestamp Drift Beyond 5 Seconds + +**Scenario:** Medialog timestamp > 5 seconds after auditlog + +**Causes:** +- Heavy server load +- Batch processing delays +- Queue backlog + +**Handling:** +```javascript +// Adaptive time window +const timeWindow = calculateAdaptiveWindow(processingLoad); +// Start at 5s, increase to 10s or 15s if needed + +// Or use backup matching by sequence +matchByTimestampProximity(auditLog, mediaLog); +``` + +### 3. Duplicate Hash in Multiple Pages + +**Scenario:** Same image used in 5 different pages + +**Handling:** +```javascript +// Create separate index entry for each page reference +// Reference count = unique pages +const referenceCount = index.filter(e => e.hash === targetHash).length; + +// Each entry tracks its specific page usage +``` + +### 4. 
Page Deleted (No Preview Events) + +**Scenario:** User deletes page entirely via DA + +**Handling:** +```javascript +// After X days (e.g., 30) without preview events: +const staleThresholdMs = 30 * 24 * 60 * 60 * 1000; +const cutoffTime = Date.now() - staleThresholdMs; + +const staleEntries = index.filter(e => + e.timestamp < cutoffTime +); + +// Option A: Flag as potentially stale +staleEntries.forEach(e => e.status = 'potentially-stale'); + +// Option B: Verify by fetching actual document +// If 404, mark as unreferenced +``` + +### 5. Race Condition: Concurrent Previews + +**Scenario:** Two users preview same page simultaneously + +**Handling:** +```javascript +// Use "last-write-wins" with timestamp comparison +if (newEntry.timestamp > existingEntry.timestamp) { + // New entry is more recent, use it + replaceEntry(existingEntry, newEntry); +} else { + // Existing entry is more recent, skip + skipEntry(newEntry); +} + +// OR: Keep both entries and deduplicate later +// based on latest timestamp per hash+page combo +``` + +### 6. 
Ambiguous Empty Medialog + +**Scenario:** Auditlog shows page preview, medialog is empty, old state had media + +**Possible Causes:** +- All Media Bus items removed +- Page now has only PDFs/SVGs/Fragments +- Processing delay +- API failure + +**Handling:** +```javascript +async function handleAmbiguousCase(pageEvent, oldEntries) { + // Wait for processing delay + await sleep(3000); + + // Retry medialog fetch + const retryEntries = await fetchMediaLog(...); + + if (retryEntries.length > 0) { + return { action: 'update', entries: retryEntries }; + } + + // Still empty - parse to verify for high-value pages + const shouldVerify = oldEntries.length >= 5 || isImportantPage(pageEvent.path); + + if (shouldVerify) { + const hasImages = await quickParseForImages(pageEvent.path); + + if (hasImages === false) { + return { action: 'remove-all', confidence: 'high' }; + } else if (hasImages === true) { + return { action: 'flag-investigation', confidence: 'low' }; + } + } + + // Default: assume removed + return { action: 'remove-all', confidence: 'medium' }; +} +``` + +### 7. Parsing Failures + +**Scenario:** Cannot fetch or parse page HTML + +**Handling:** +```javascript +async function buildContentUsageMap(pageLogEntries, org, repo) { + const usageMap = { fragments: new Map(), pdfs: new Map(), svgs: new Map() }; + const failures = []; + + for (const pageEvent of pageLogEntries) { + try { + const html = await fetchPageHtml(pageEvent.path, org, repo); + // Extract content... 
+ } catch (error) { + failures.push({ page: pageEvent.path, error: error.message }); + // Continue with other pages + } + } + + // Log failures for investigation + if (failures.length > 0) { + console.warn('Failed to parse pages:', failures); + } + + return usageMap; +} +``` + +--- + +## Performance Considerations + +### Scaling Factors + +| Factor | Impact | Mitigation | +|--------|--------|------------| +| **Number of pages** | O(N) processing time | Batch processing, parallel processing | +| **Media per page** | O(M) comparison operations | Hash-based lookups instead of linear search | +| **Event frequency** | Incremental update frequency | Adaptive polling (more frequent when active) | +| **Index size** | Storage and read time | Compress, paginate, or archive old entries | +| **Time window size** | False matches | Optimize to 5s, expand only if needed | +| **Parsing pages** | Network and CPU cost | Cache parsed results, parse only changed pages | + +### Optimization Strategies + +```javascript +// 1. Use Map for O(1) lookups instead of Array.filter +const indexMap = new Map(); +existingIndex.forEach(e => { + const key = `${e.hash}|${e.page}`; + indexMap.set(key, e); +}); + +// 2. Group medialog entries by resourcePath first +const mediaByPage = groupBy(mediaLog, 'resourcePath'); + +// 3. Process only changed pages +const changedPages = new Set(newAuditLog.map(e => e.path)); + +// 4. Batch writes to DA +const BATCH_SIZE = 100; +await saveBatch(updatedEntries, BATCH_SIZE); + +// 5. Cache parsed HTML results +const parseCache = new Map(); +const getCachedHtml = async (page) => { + if (!parseCache.has(page)) { + parseCache.set(page, await fetchPageHtml(page)); + } + return parseCache.get(page); +}; + +// 6. 
Parallel parsing for multiple pages
+const htmlResults = await Promise.all(
+  pages.map(p => fetchPageHtml(p.path, org, repo))
+);
+```
+
+### Parsing Performance
+
+**Minimize parsing overhead:**
+```javascript
+// Parse only when necessary
+const shouldParse = (pageEvent, oldState) => {
+  // Always parse for initial build
+  if (!oldState) return true;
+
+  // Parse if page has new auditlog event
+  if (pageEvent.timestamp > oldState.lastParsed) return true;
+
+  // Skip if recently parsed
+  const cacheAge = Date.now() - oldState.lastParsed;
+  return cacheAge > (24 * 60 * 60 * 1000); // 24 hours
+};
+
+// Single-pass extraction
+const parsePageContent = (html) => {
+  return {
+    fragments: extractFragmentReferences(html),
+    pdfs: extractLinks(html, /\.pdf$/),
+    svgs: extractLinks(html, /\.svg$/),
+    hasImages: html.includes('media_') && html.includes('<img'),
+  };
+};
+```
+
+---
+
+## Query Patterns
+
+### Reference Count Calculation
+
+```javascript
+function calculateReferenceCounts(index) {
+  const counts = new Map();
+
+  index.forEach(entry => {
+    if (entry.status !== 'referenced') return; // Skip unreferenced
+
+    const key = entry.hash || entry.path;
+    if (!counts.has(key)) {
+      counts.set(key, {
+        key,
+        pages: new Set(),
+        lastUsed: 0,
+        type: entry.type
+      });
+    }
+
+    const count = counts.get(key);
+
+    // For Media Bus items (have page field)
+    if (entry.page) {
+      count.pages.add(entry.page);
+    }
+
+    // For linked content (have usedIn field)
+    if (entry.usedIn) {
+      entry.usedIn.forEach(p => count.pages.add(p));
+    }
+
+    if (entry.timestamp > count.lastUsed) {
+      count.lastUsed = entry.timestamp;
+    }
+  });
+
+  // Convert to array with reference counts
+  return Array.from(counts.values()).map(c => ({
+    key: c.key,
+    type: c.type,
+    referenceCount: c.pages.size,
+    pages: Array.from(c.pages),
+    lastUsed: c.lastUsed
+  }));
+}
+```
+
+### Filtering by Content Type
+
+```javascript
+function filterByType(index, type) {
+  return index.filter(e => e.type === type && e.status === 'referenced');
+}
+
+// Examples
+const images = filterByType(index, 'image');
+const pdfs = filterByType(index, 'pdf');
+const fragments = filterByType(index, 'fragment');
+```
+
+### 
Filtering Unreferenced Media + +```javascript +function getUnreferencedMedia(index) { + return index.filter(e => + e.status === 'unreferenced' || + e.status === 'uploaded-unused' || + e.status === 'file-unused' + ); +} +``` + +### Getting Usage Details for Media Info Panel + +```javascript +function getMediaUsage(index, identifier) { + // identifier can be hash (for images) or path (for PDFs/SVGs/fragments) + const usageEntries = index.filter(e => + (e.hash === identifier || e.path === identifier) && + e.status === 'referenced' + ); + + if (usageEntries.length === 0) return []; + + // For Media Bus items (images/videos) + if (usageEntries[0].hash) { + const byPage = groupBy(usageEntries, 'page'); + + return Object.entries(byPage).map(([page, entries]) => ({ + page, + previewCount: entries.length, + lastPreview: Math.max(...entries.map(e => e.timestamp)), + users: [...new Set(entries.map(e => e.user))] + })); + } + + // For linked content (PDFs/SVGs/fragments) + if (usageEntries[0].usedIn) { + return usageEntries[0].usedIn.map(page => ({ + page, + previewCount: 1, // Can't track individual previews for linked content + lastPreview: usageEntries[0].timestamp, + users: [usageEntries[0].user] + })); + } + + return []; +} +``` + +--- + +## Next Steps + +### 1. Infrastructure Setup + +- [ ] Verify medialog backfill CLI tool is production-ready +- [ ] Create `/tools/media-indexer` DA app for initial index population +- [ ] Set up `.da/mediaindex/` directory structure +- [ ] Document authentication requirements and token management +- [ ] Test on small pilot repository first + +### 2. 
Implementation Phase + +- [ ] Implement initial index build function (in media-indexer app) +- [ ] Implement incremental update function (in browser) +- [ ] Add content usage map builder (fragments, PDFs, SVGs) +- [ ] Implement distributed locking mechanism +- [ ] Add background auto-refresh with 10-minute interval +- [ ] Create user-triggered refresh UI +- [ ] Add error handling and retry logic +- [ ] Test with production data at scale + +### 3. Testing & Validation + +- [ ] Unit tests for matching logic +- [ ] Integration tests with real logs +- [ ] Performance testing with large datasets (10,000+ pages) +- [ ] Validate reference counts accuracy +- [ ] Test parsing extraction functions +- [ ] Test distributed lock under concurrent load +- [ ] Test browser crash recovery (lock timeout) +- [ ] Validate medialog backfill completeness + +### 4. Operational Readiness + +- [ ] Write operational runbooks for engineers +- [ ] Create monitoring dashboards for index health +- [ ] Document troubleshooting procedures +- [ ] Set up alerts for failures (lock timeouts, parse errors) +- [ ] Establish maintenance schedule (monthly rebuilds) +- [ ] Train support team on index operations + +### 5. Monitoring & Maintenance + +- [ ] Log processing metrics (time, entries, errors) +- [ ] Alert on anomalies (orphaned entries, large drifts) +- [ ] Periodic full rebuild (weekly/monthly) +- [ ] Dashboard for index health +- [ ] Track parsing failures and success rates +- [ ] Monitor lock contention and timeout rates +- [ ] Track refresh performance across site sizes + +### 6. 
Future Enhancements + +- [ ] Handle "delete" operations when API confirmed +- [ ] Add support for video tracking (similar to images) +- [ ] Implement pagination for large indexes +- [ ] Add caching layer for frequent queries +- [ ] Explore real-time updates via webhooks +- [ ] Optimize parsing performance (parallel processing, caching) +- [ ] Implement leader election for background refresh +- [ ] Add incremental medialog backfill for updated pages +- [ ] Explore worker-based refresh (replace browser-based) + +--- + +## References + +- **AEM Media Documentation:** https://www.aem.live/docs/media +- **Auditlog API:** https://www.aem.live/docs/admin.html#tag/log/operation/getLogs +- **Medialog API:** (Similar to auditlog, dedicated for Media Bus) +- **Testing Repository:** `kmurugulla/brightpath` +- **Test Date:** February 24, 2026 + +--- + +## Appendix: Test Data Summary + +### All Scenarios Tested + +| Scenario | Pages | Media | Key Learning | +|----------|-------|-------|--------------| +| A | scenario-a.md | 3 images | First-time ingest via markup | +| B | scenario-b.md | 0 (text) | Auditlog without medialog | +| H | Standalone files | 1 image, 1 PDF, 1 SVG | Standalone vs embedded | +| G | scenario-g.md | 2 images, links | Mixed media behavior, parsing required | +| Re-preview | All 3 pages | Same media | Duplicate event handling | + +### Timestamp Patterns Observed + +| Event Type | Typical Delay | Range Observed | +|------------|--------------|----------------| +| Auditlog to Medialog | 1.5-2 seconds | 800ms - 3400ms | +| Multi-page bulk preview | Nearly simultaneous | 4-9ms apart | +| Media on same page | Identical timestamp | 0ms (exact match) | + +### Content Type Tracking Summary + +| Content | Logs | Parsing | Usage Link | +|---------|------|---------|------------| +| Images/Videos | Medialog | No | resourcePath field | +| PDFs/SVGs/Fragments | Auditlog only | Yes | Parse HTML | + +--- + +**Last Updated:** February 24, 2026