diff --git a/tools/media/README.md b/tools/media/README.md
new file mode 100644
index 0000000..9d1a795
--- /dev/null
+++ b/tools/media/README.md
@@ -0,0 +1,123 @@
+# Media Indexer
+
+A tool for building and maintaining a media index from DA Live's medialog and auditlog APIs.
+
+## Architecture
+
+The indexer is split into focused modules for better maintainability and extensibility:
+
+```
+tools/media/
+├── indexer.html # Entry point HTML
+├── indexer.css # Styles
+├── indexer.js # Main entry (initialization & auth)
+├── indexer-old.js # Backup of monolithic version
+└── lib/
+ ├── config.js # Configuration & state management
+ ├── api.js # DA Admin API calls
+ ├── helpers.js # Utility functions (normalizePath, isPage, etc.)
+ ├── builder.js # Core index building logic
+ └── ui.js # UI rendering & event handling
+```
+
+## Modules
+
+### `lib/config.js`
+- URL parameter parsing (org, repo, ref)
+- Global state management
+- Constants (DA_ADMIN, sitePath)
+
+### `lib/api.js`
+- `fetchWithAuth()` - Authenticated fetch wrapper
+- `daFetch()` - DA API fetch wrapper
+- `loadMeta()` - Load metadata from DA
+- `createSheet()` - Create sheet format for DA
+- `saveMeta()` - Save metadata to DA
+- `fetchFromAdminAPI()` - Fetch from auditlog/medialog with pagination
+
+### `lib/helpers.js`
+- `normalizePath()` - Normalize paths (add .md, remove query params)
+- `isPage()` - Detect if path is a page vs media file
+- `extractName()` - Extract filename from medialog entry
+- `detectMediaType()` - Detect media type from contentType
+
+### `lib/builder.js`
+- `getIndexStatus()` - Get current index metadata
+- `buildInitialIndex()` - Core indexing logic:
+ 1. Fetch auditlog entries
+ 2. Fetch medialog entries
+ 3. Match media to pages (5-second time window)
+ 4. Deduplicate by hash
+ 5. Save index to DA
+
+### `lib/ui.js`
+- `render()` - Render UI with status, progress, logs, errors
+- `attachEventListeners()` - Handle button clicks
+
+### `indexer.js`
+- Main entry point
+- DA SDK authentication
+- Initialize UI
+
+## Index Schema
+
+Each entry in the media index:
+
+```javascript
+{
+ hash: "abc123", // Media hash (unique identifier)
+ pages: "/page1.md|/page2.md", // Pipe-separated list of pages using this media
+ url: "https://.../media_abc.jpg", // Full URL to media
+ name: "photo.jpg", // Filename (extracted from URL)
+ timestamp: 1771704070155, // Latest usage timestamp
+ user: "user@example.com", // User who uploaded/used it
+ operation: "reuse", // Latest operation (ingest/reuse)
+ type: "img > jpeg", // Media type (category > extension)
+ status: "referenced" // Status (referenced/unused)
+}
+```
+
+## Indexing Rules
+
+- **Latest event only:** For each page, use only the latest auditlog event. Skip all others. Multiple events in a batch are sorted by timestamp; only the most recent determines the current page state.
+
+## Phase 1 (Complete)
+
+✅ Media Bus items (images/videos) from medialog API
+✅ Deduplicated by hash
+✅ Pipe-separated pages for multi-page usage
+✅ Latest usage tracking
+
+## Phase 2 (Current)
+
+- Linked content (PDFs, SVGs, fragments) from auditlog
+- HTML parsing for usage detection (extractFragmentReferences, extractLinks)
+- Source: "auditlog-parsed"
+- Index entries: path, usedIn, timestamp, type, status
+
+## Phase 3 (Future)
+
+- Streaming architecture for large sites
+- Chunked processing
+- Memory optimization
+
+## Usage
+
+1. Open in browser: `https://main--repo--org.aem.page/tools/media/indexer.html?org=yourorg&repo=yourrepo`
+2. Authenticate with DA Live
+3. Click "Build Initial Index"
+4. Index saved to `/.da/mediaindex/media-index.json`
+
+## Development
+
+Run linting:
+```bash
+npm run lint:js
+npm run lint:css
+```
+
+Test locally:
+```bash
+npx @adobe/aem-cli up
+# Open http://localhost:3000/tools/media/indexer.html?org=yourorg&repo=yourrepo
+```
diff --git a/tools/media/indexer-old.js b/tools/media/indexer-old.js
new file mode 100644
index 0000000..1ca1414
--- /dev/null
+++ b/tools/media/indexer-old.js
@@ -0,0 +1,507 @@
+/* eslint-disable import/no-absolute-path, import/no-unresolved */
+/* The DA SDK is loaded from the da.live CDN and is required for authentication */
+import DA_SDK from 'https://da.live/nx/utils/sdk.js';
+
+// Parse URL parameters ('site' is accepted as an alias for 'repo')
+const params = new URLSearchParams(window.location.search);
+const org = params.get('org');
+const repo = params.get('repo') || params.get('site');
+// Branch is pinned to 'main'; there is no URL override
+const ref = 'main';
+const sitePath = `/${org}/${repo}`;
+
+// Mutable application/UI state shared by render(), the build flow, and auth
+const state = {
+ building: false, // true while an index build is running (re-entrancy guard)
+ progress: { stage: 'idle', message: '', percent: 0 },
+ errors: [], // { message } entries shown in the errors panel
+ logs: [], // { message, type } entries shown in the logs panel
+ status: null, // latest result of getIndexStatus()
+ daToken: null, // DA bearer token, populated during init()
+};
+
+/**
+ * fetch() wrapper that attaches the DA bearer token (when present) as an
+ * Authorization header.
+ * NOTE(review): mutates the caller's opts object in place, and is functionally
+ * identical to daFetch below — the two could be consolidated.
+ * @param {string} url - URL to fetch
+ * @param {object} [opts] - Fetch options (headers are added in place)
+ * @returns {Promise<Response>} The fetch response
+ */
+async function fetchWithAuth(url, opts = {}) {
+ opts.headers ||= {};
+ if (state.daToken) {
+ opts.headers.Authorization = `Bearer ${state.daToken}`;
+ }
+ return fetch(url, opts);
+}
+
+// Base URL of the DA Admin API (used for /source reads and writes)
+const DA_ADMIN = 'https://admin.da.live';
+
+/**
+ * fetch() wrapper that attaches the DA bearer token (when present).
+ * NOTE(review): byte-for-byte duplicate of fetchWithAuth above — keep one and
+ * delete the other when refactoring.
+ * @param {string} url - URL to fetch
+ * @param {object} [opts] - Fetch options (headers are added in place)
+ * @returns {Promise<Response>} The fetch response
+ */
+async function daFetch(url, opts = {}) {
+ opts.headers ||= {};
+ if (state.daToken) {
+ opts.headers.Authorization = `Bearer ${state.daToken}`;
+ }
+ return fetch(url, opts);
+}
+
+/**
+ * Load a metadata sheet from the DA source API.
+ * Returns the first data row when the response is in sheet format, the raw
+ * JSON object otherwise, or null on HTTP error / fetch failure.
+ * NOTE(review): failures are swallowed silently — callers cannot distinguish
+ * "file missing" from "network error".
+ * @param {string} path - DA source path (e.g. /org/repo/.da/...)
+ * @returns {Promise<object|null>} Metadata row, raw object, or null
+ */
+async function loadMeta(path) {
+ try {
+ const resp = await daFetch(`${DA_ADMIN}/source${path}`);
+ if (resp.ok) {
+ const data = await resp.json();
+ return data.data?.[0] || data;
+ }
+ } catch {
+ return null;
+ }
+ return null;
+}
+
+/**
+ * Wrap an array of rows in DA sheet JSON ({ total, limit, offset, data,
+ * ':type' }) and package it as FormData for POSTing to the DA source API.
+ * @param {Array<object>} data - Sheet rows
+ * @param {string} [type] - Value for the ':type' marker (default 'sheet')
+ * @returns {Promise<FormData>} FormData with the JSON blob under key 'data'
+ */
+async function createSheet(data, type = 'sheet') {
+ const sheetMeta = {
+ total: data.length,
+ limit: data.length,
+ offset: 0,
+ data,
+ ':type': type,
+ };
+ const blob = new Blob([JSON.stringify(sheetMeta, null, 2)], { type: 'application/json' });
+ const formData = new FormData();
+ formData.append('data', blob);
+ return formData;
+}
+
+/**
+ * Save metadata (single object or array of rows) as a sheet at a DA path.
+ * NOTE(review): the POST response is returned unchecked — callers should
+ * verify resp.ok if the write must succeed.
+ * @param {object|Array<object>} meta - Metadata row(s) to persist
+ * @param {string} path - DA source path to write to
+ * @returns {Promise<Response>} The POST response
+ */
+async function saveMeta(meta, path) {
+ const metaArray = Array.isArray(meta) ? meta : [meta];
+ const formData = await createSheet(metaArray);
+ return daFetch(`${DA_ADMIN}/source${path}`, {
+ method: 'POST',
+ body: formData,
+ });
+}
+
+/**
+ * Convert an absolute timestamp into a relative duration string for the
+ * admin API 'since' parameter, capped at 90 days.
+ * @param {number} timestamp - Epoch ms; falsy values yield the '90d' default
+ * @returns {string} Duration such as '5h' or '30d'
+ */
+function timestampToDuration(timestamp) {
+ if (!timestamp) return '90d';
+ const ageMs = Date.now() - timestamp;
+ const days = Math.ceil(ageMs / (24 * 60 * 60 * 1000));
+ if (days < 1) {
+ const hours = Math.ceil(ageMs / (60 * 60 * 1000));
+ // NOTE(review): hours > 0 always holds for a past timestamp, so the '1h'
+ // fallback only guards against clock skew (timestamps in the future)
+ return hours > 0 ? `${hours}h` : '1h';
+ }
+ return `${Math.min(days, 90)}d`;
+}
+
+/**
+ * Fetch all entries from an admin.hlx.page log endpoint ('log' or 'medialog'),
+ * following nextToken pagination recursively until exhausted.
+ * @param {string} endpoint - 'log' (auditlog) or 'medialog'
+ * @param {string} orgName - Organization name
+ * @param {string} repoName - Repository name
+ * @param {string} refName - Branch/ref name
+ * @param {number|null} since - Epoch ms lower bound; null -> '90d' default
+ * @param {number} limit - Page size passed to the API
+ * @param {function} [onPageLoaded] - Called with (entries, hasMore) per page
+ * @returns {Promise<Array<object>>} All fetched entries, in API order
+ * @throws {Error} When the first request returns a non-OK status
+ */
+async function fetchFromAdminAPI(endpoint, orgName, repoName, refName, since, limit, onPageLoaded) {
+ const fetchParams = new URLSearchParams();
+ fetchParams.append('limit', limit.toString());
+
+ const sinceDuration = since ? timestampToDuration(since) : '90d';
+ fetchParams.append('since', sinceDuration);
+
+ const baseUrl = `https://admin.hlx.page/${endpoint}/${orgName}/${repoName}/${refName}`;
+ // medialog requires a trailing slash before the query string
+ const separator = endpoint === 'medialog' ? '/' : '';
+ const url = `${baseUrl}${separator}?${fetchParams.toString()}`;
+
+ const resp = await fetchWithAuth(url);
+
+ if (!resp.ok) {
+ throw new Error(`${endpoint} API error: ${resp.status} ${resp.statusText}`);
+ }
+
+ const data = await resp.json();
+ const entries = data.entries || data.data || [];
+ const { nextToken } = data;
+
+ if (onPageLoaded && entries.length > 0) {
+ onPageLoaded(entries, !!nextToken);
+ }
+
+ // Recursively walk the pagination chain. NOTE(review): a non-OK or empty
+ // follow-up page silently truncates the result instead of raising.
+ async function fetchNextPage(token) {
+ if (!token) return [];
+
+ fetchParams.set('nextToken', token);
+ const nextUrl = `${baseUrl}${separator}?${fetchParams.toString()}`;
+ const nextResp = await fetchWithAuth(nextUrl);
+
+ if (!nextResp.ok) return [];
+
+ const nextData = await nextResp.json();
+ const nextEntries = nextData.entries || nextData.data || [];
+
+ if (!nextEntries || nextEntries.length === 0) return [];
+
+ if (onPageLoaded) {
+ // NOTE(review): this reports first-page entries plus only the current
+ // page, so the count shown to the callback is not cumulative once a
+ // third page exists — confirm whether a running total was intended.
+ onPageLoaded([...entries, ...nextEntries], !!nextData.nextToken);
+ }
+
+ const remainingEntries = await fetchNextPage(nextData.nextToken);
+ return [...nextEntries, ...remainingEntries];
+ }
+
+ const additionalEntries = await fetchNextPage(nextToken);
+ return [...entries, ...additionalEntries];
+}
+
+/**
+ * Normalize a path by removing query params/fragments and adding .md for pages
+ * @example normalizePath('/drafts/page?x=1') -> '/drafts/page.md'
+ * @param {string} path - The path to normalize
+ * @returns {string} Normalized path ('' for falsy input)
+ */
+function normalizePath(path) {
+ if (!path) return '';
+ let cleanPath = path.split('?')[0].split('#')[0];
+ // Add .md for pages: /drafts/page -> /drafts/page.md
+ // NOTE(review): any dot anywhere in the path suppresses the .md suffix
+ // (e.g. '/v1.2/page' would be left as-is) — confirm that is acceptable.
+ if (!cleanPath.includes('.') && !cleanPath.startsWith('/media/')) {
+ cleanPath = `${cleanPath}.md`;
+ }
+ return cleanPath;
+}
+
+/**
+ * Detect if a path represents a page (not a media file or fragment)
+ * A page either ends in .md or has no extension and is outside /media/;
+ * anything under /fragments/ is excluded even when it ends in .md.
+ * @param {string} path - The path to check
+ * @returns {boolean} True if path is a page
+ */
+function isPage(path) {
+ if (!path || typeof path !== 'string') return false;
+ return (path.endsWith('.md')
+ || (!path.includes('.') && !path.startsWith('/media/')))
+ && !path.includes('/fragments/');
+}
+
+/**
+ * Extract the filename from a medialog entry.
+ * Ingest entries prefer the human-readable originalFilename; all other
+ * entries fall back to the last segment of the (hashed) media path.
+ * @param {object} mediaEntry - The medialog entry
+ * @returns {string} The filename without query params or fragments
+ */
+function extractName(mediaEntry) {
+ if (!mediaEntry) return '';
+ if (mediaEntry.operation === 'ingest' && mediaEntry.originalFilename) {
+ return mediaEntry.originalFilename.split('/').pop();
+ }
+ if (!mediaEntry.path) return '';
+ // Remove query params (?...) and URL fragments (#...)
+ return mediaEntry.path.split('?')[0].split('#')[0].split('/').pop();
+}
+
+/**
+ * Detect media type from contentType in structured format.
+ * Only 'image/*' and 'video/*' content types are recognized; everything
+ * else (PDFs, missing contentType, ...) reports 'unknown'.
+ * @param {object} mediaEntry - The medialog entry
+ * @returns {string} Type in format "category > extension", or 'unknown'
+ */
+function detectMediaType(mediaEntry) {
+ const contentType = mediaEntry.contentType || '';
+ if (contentType.startsWith('image/')) {
+ const ext = contentType.split('/')[1];
+ return `img > ${ext}`;
+ }
+ if (contentType.startsWith('video/')) {
+ const ext = contentType.split('/')[1];
+ return `video > ${ext}`;
+ }
+ return 'unknown';
+}
+
+/**
+ * Read the indexer's own meta sheet to report when the index was last
+ * refreshed and how many entries it holds.
+ * @returns {Promise<{lastRefresh: number|null, entriesCount: number}>}
+ */
+async function getIndexStatus() {
+ const metaPath = `${sitePath}/.da/mediaindex/medialog-meta.json`;
+ const meta = await loadMeta(metaPath);
+
+ return {
+ lastRefresh: meta?.lastFetchTime || null,
+ entriesCount: meta?.entriesCount || 0,
+ };
+}
+
+/**
+ * Build the initial media index end to end:
+ * auditlog fetch -> medialog fetch -> page/media matching (5s time window) ->
+ * hash-based dedup -> standalone uploads -> save index + meta sheets to DA.
+ * @param {function} onProgress - Callback receiving { stage, message, percent }
+ * @returns {Promise<{entriesCount: number}>} Number of saved index entries
+ * @throws {Error} Propagated from fetchFromAdminAPI on the first failed fetch
+ */
+async function buildInitialIndex(onProgress) {
+ const index = [];
+
+ // Phase 1: Fetch auditlog entries
+ onProgress({ stage: 'fetching', message: 'Fetching auditlog entries...', percent: 10 });
+
+ const auditlogEntries = await fetchFromAdminAPI('log', org, repo, ref, null, 1000, (entries, hasMore) => {
+ onProgress({
+ stage: 'fetching',
+ message: `Fetched ${entries.length} auditlog entries${hasMore ? ' (more available)' : ''}...`,
+ percent: 20,
+ });
+ });
+
+ // Separate pages from files (filter out entries with invalid paths)
+ const validEntries = auditlogEntries.filter((e) => e && e.path);
+ const pages = validEntries.filter((e) => isPage(e.path));
+ const files = validEntries.filter((e) => !isPage(e.path));
+
+ onProgress({
+ stage: 'fetching',
+ message: `Identified ${pages.length} pages and ${files.length} files from auditlog`,
+ percent: 30,
+ });
+
+ // Phase 2: Fetch medialog entries
+ onProgress({ stage: 'fetching', message: 'Fetching medialog entries...', percent: 40 });
+
+ const medialogEntries = await fetchFromAdminAPI('medialog', org, repo, ref, null, 1000, (entries, hasMore) => {
+ onProgress({
+ stage: 'fetching',
+ message: `Fetched ${entries.length} medialog entries${hasMore ? ' (more available)' : ''}...`,
+ percent: 50,
+ });
+ });
+
+ onProgress({
+ stage: 'processing',
+ message: `Processing ${pages.length} pages with ${medialogEntries.length} medialog entries...`,
+ percent: 60,
+ });
+
+ // Phase 3: Build hash map (deduplicate by hash, track all pages)
+ const hashMap = new Map();
+
+ // Process page-referenced media.
+ // NOTE(review): the README's indexing rules say only the LATEST auditlog
+ // event per page should be used; this loop processes every page event —
+ // confirm which behavior is intended.
+ // NOTE(review): full scan of medialogEntries per page event is O(pages ×
+ // medialog); consider grouping medialog entries by resourcePath first.
+ pages.forEach((pageEvent) => {
+ const normalizedPath = normalizePath(pageEvent.path);
+
+ // Find matching medialog entries within 5-second time window
+ const pageMedia = medialogEntries.filter((m) => {
+ if (!m.resourcePath) return false;
+ if (m.resourcePath !== normalizedPath) return false;
+
+ const TIME_WINDOW_MS = 5000;
+ return m.timestamp >= pageEvent.timestamp
+ && m.timestamp < pageEvent.timestamp + TIME_WINDOW_MS;
+ });
+
+ // Add to hash map
+ pageMedia.forEach((media) => {
+ const hash = media.mediaHash;
+ if (!hashMap.has(hash)) {
+ // First time seeing this hash - initialize entry
+ hashMap.set(hash, {
+ hash,
+ pages: new Set([normalizedPath]),
+ url: media.path,
+ name: extractName(media),
+ timestamp: media.timestamp,
+ user: media.user,
+ operation: media.operation,
+ type: detectMediaType(media),
+ status: 'referenced',
+ });
+ } else {
+ // Hash exists - update with latest info
+ const entry = hashMap.get(hash);
+ entry.pages.add(normalizedPath);
+
+ // Keep latest timestamp (since logs are sorted newest first)
+ if (media.timestamp > entry.timestamp) {
+ entry.timestamp = media.timestamp;
+ entry.operation = media.operation;
+ }
+ }
+ });
+ });
+
+ onProgress({
+ stage: 'processing',
+ message: `Processed ${pages.length} pages, found ${hashMap.size} unique media items`,
+ percent: 70,
+ });
+
+ // Phase 4: Process standalone uploads (not on any page yet)
+ const standaloneUploads = medialogEntries.filter((m) => !m.resourcePath && m.originalFilename);
+
+ standaloneUploads.forEach((media) => {
+ const hash = media.mediaHash;
+ if (!hashMap.has(hash)) {
+ // Only add if not already referenced on a page
+ hashMap.set(hash, {
+ hash,
+ pages: new Set(),
+ url: media.path,
+ name: media.originalFilename.split('/').pop(),
+ timestamp: media.timestamp,
+ user: media.user,
+ operation: media.operation,
+ type: detectMediaType(media),
+ status: 'unused',
+ });
+ }
+ });
+
+ onProgress({
+ stage: 'processing',
+ message: `Added ${standaloneUploads.length} standalone uploads, total unique: ${hashMap.size}`,
+ percent: 80,
+ });
+
+ // Convert Map to array with pipe-separated pages
+ hashMap.forEach((entry) => {
+ index.push({
+ hash: entry.hash,
+ pages: Array.from(entry.pages).join('|'),
+ url: entry.url,
+ name: entry.name,
+ timestamp: entry.timestamp,
+ user: entry.user,
+ operation: entry.operation,
+ type: entry.type,
+ status: entry.status,
+ });
+ });
+
+ // Phase 5: Save index
+ onProgress({ stage: 'saving', message: `Saving ${index.length} entries...`, percent: 90 });
+
+ const indexPath = `${sitePath}/.da/mediaindex/media-index.json`;
+ const formData = await createSheet(index);
+ // NOTE(review): the save response is not checked for resp.ok — a failed
+ // write would still report 'complete' below.
+ await daFetch(`${DA_ADMIN}/source${indexPath}`, {
+ method: 'POST',
+ body: formData,
+ });
+
+ await saveMeta({
+ lastFetchTime: Date.now(),
+ entriesCount: index.length,
+ lastRefreshBy: 'media-indexer',
+ }, `${sitePath}/.da/mediaindex/medialog-meta.json`);
+
+ onProgress({ stage: 'complete', message: `Complete! ${index.length} entries indexed`, percent: 100 });
+
+ return { entriesCount: index.length };
+}
+
+/**
+ * Render the full UI (status panel, action button, progress, logs, errors)
+ * into #app via innerHTML.
+ * NOTE(review): the HTML template literals in this hunk appear corrupted —
+ * markup tags were stripped when the patch was captured. Restore the original
+ * markup before applying this diff; as written the template strings are
+ * syntactically broken.
+ * NOTE(review): log/error messages are interpolated into innerHTML — escape
+ * them if they can ever contain untrusted text.
+ */
+function render() {
+ const app = document.getElementById('app');
+
+ const statusHtml = state.status ? `
+
+
 Current Index Status
+
+
+
+ ${state.status.lastRefresh ? new Date(state.status.lastRefresh).toLocaleString() : 'Never'}
+
+
+
+ ${state.status.entriesCount || 0}
+
+
+
+ ` : 'Checking status...
';
+
+ const progressHtml = state.building || state.progress.stage !== 'idle' ? `
+
+
 Progress
+
+
+ ${state.progress.stage}
+ ${state.progress.message}
+
+
+ ` : '';
+
+ const logsHtml = state.logs.length > 0 ? `
+
+
 Logs (${state.logs.length})
+
+ ${state.logs.map((log) => `- ${log.message}
`).join('')}
+
+
+ ` : '';
+
+ const errorsHtml = state.errors.length > 0 ? `
+
+
 Errors (${state.errors.length})
+
+ ${state.errors.map((err) => `- ${err.message}
`).join('')}
+
+
+ ` : '';
+
+ app.innerHTML = `
+ Media Index Builder
+ Building index for: ${org}/${repo}
+
+ ${statusHtml}
+
+
+
+
+
+ ${progressHtml}
+ ${errorsHtml}
+ ${logsHtml}
+ `;
+}
+
+/**
+ * Wire up the Build button. Must be re-called after every render() because
+ * innerHTML replaces the button element (and its listeners).
+ * The state.building guard prevents attaching while a build is running;
+ * the click handler resets state, runs buildInitialIndex with live progress
+ * rendering, refreshes status on success, records errors on failure, and
+ * re-attaches listeners in finally.
+ */
+function attachEventListeners() {
+ if (!state.building) {
+ const buildBtn = document.getElementById('buildBtn');
+ if (buildBtn) {
+ buildBtn.addEventListener('click', () => {
+ state.building = true;
+ state.errors = [];
+ state.logs = [];
+ state.progress = { stage: 'starting', message: 'Starting build...', percent: 0 };
+ render();
+
+ buildInitialIndex((progress) => {
+ state.progress = progress;
+ state.logs.push({ message: progress.message, type: 'info' });
+ render();
+ })
+ .then((result) => {
+ state.logs.push({ message: `Index built successfully: ${result.entriesCount} entries`, type: 'success' });
+ return getIndexStatus();
+ })
+ .then((status) => {
+ state.status = status;
+ })
+ .catch((error) => {
+ state.errors.push({ message: error.message });
+ state.logs.push({ message: `Error: ${error.message}`, type: 'error' });
+ state.progress = { stage: 'error', message: error.message, percent: 0 };
+ })
+ .finally(() => {
+ state.building = false;
+ render();
+ attachEventListeners();
+ });
+ });
+ }
+ }
+}
+
+/**
+ * Entry point: validate URL params, resolve the DA SDK token (racing a 5s
+ * timeout), redirect to da.live login when no token is obtained, then load
+ * index status and render the UI.
+ * NOTE(review): the error-page template literal below appears corrupted in
+ * this patch (markup tags stripped) — restore before applying.
+ */
+async function init() {
+ if (!org || !repo) {
+ document.getElementById('app').innerHTML = `
+
+
 Missing Parameters
+
 Please provide org and repo parameters in the URL:
+
 ?org=yourorg&repo=yourrepo
+
+ `;
+ return;
+ }
+
+ // Get DA token with timeout
+ try {
+ const tokenPromise = DA_SDK;
+ const timeoutPromise = new Promise((_, reject) => {
+ setTimeout(() => reject(new Error('Authentication timeout')), 5000);
+ });
+
+ const result = await Promise.race([tokenPromise, timeoutPromise]);
+ state.daToken = result?.token;
+ } catch (error) {
+ state.errors.push({ message: `Failed to get DA token: ${error.message}` });
+ }
+
+ // No token -> bounce through da.live login, preserving the return URL
+ if (!state.daToken) {
+ const returnUrl = encodeURIComponent(window.location.href);
+ window.location.href = `https://da.live/?returnUrl=${returnUrl}`;
+ return;
+ }
+
+ state.status = await getIndexStatus();
+ render();
+ attachEventListeners();
+}
+
+// Kick off on module load (fire-and-forget; errors surface via state.errors)
+init();
diff --git a/tools/media/indexer.css b/tools/media/indexer.css
new file mode 100644
index 0000000..36a1647
--- /dev/null
+++ b/tools/media/indexer.css
@@ -0,0 +1,315 @@
+/* ========== Base & Variables ========== */
+
+:root {
+ /* Gray Scale */
+ --s2-gray-50: #f9fafb;
+ --s2-gray-100: #f3f4f6;
+ --s2-gray-200: #e5e7eb;
+ --s2-gray-300: #d1d5db;
+ --s2-gray-400: #9ca3af;
+ --s2-gray-500: #6b7280;
+ --s2-gray-600: #4b5563;
+ --s2-gray-700: #374151;
+ --s2-gray-900: #111827;
+
+ /* Blue Scale */
+ --s2-blue-50: #eff6ff;
+ --s2-blue-100: #dbeafe;
+ --s2-blue-200: #bfdbfe;
+ --s2-blue-300: #93c5fd;
+ --s2-blue-500: #3b82f6;
+ --s2-blue-600: #2563eb;
+ --s2-blue-700: #1d4ed8;
+ --s2-blue-900: #1e3a8a;
+
+ /* Green Scale */
+ --s2-green-100: rgb(215 247 225);
+ --s2-green-900: #065f46;
+
+ /* Red Scale */
+ --s2-red-100: rgb(255 214 209);
+ --s2-red-700: #991b1b;
+
+ /* Spacing */
+ --spacing-100: 4px;
+ --spacing-200: 8px;
+ --spacing-300: 12px;
+ --spacing-400: 16px;
+ --spacing-500: 24px;
+ --spacing-600: 32px;
+ --spacing-700: 40px;
+
+ /* Border Radius */
+ --s2-radius-100: 4px;
+ --s2-radius-200: 8px;
+ --s2-radius-300: 18px;
+
+ /* Typography */
+ --body-font-family: 'Adobe Clean', adobe-clean, 'Trebuchet MS', sans-serif;
+ --mono-font-family: 'Roboto Mono', menlo, consolas, 'Liberation Mono', monospace;
+ --s2-font-size-200: 14px;
+ --s2-font-size-300: 16px;
+ --s2-font-size-400: 16px;
+ --s2-font-size-600: 24px;
+ --s2-font-size-700: 32px;
+}
+
+* {
+ box-sizing: border-box;
+}
+
+body {
+ font-family: var(--body-font-family);
+ color: var(--s2-gray-900);
+ line-height: 1.6;
+ margin: 0;
+ padding: 0;
+ background: var(--s2-gray-50);
+}
+
+#app {
+ max-width: 1200px;
+ margin: var(--spacing-700) auto;
+ padding: 0 var(--spacing-400);
+}
+
+h1 {
+ font-size: var(--s2-font-size-700);
+ font-weight: 700;
+ line-height: 1.2;
+ margin: 0 0 var(--spacing-200);
+ color: var(--s2-gray-900);
+}
+
+h2 {
+ font-size: 20px;
+ font-weight: 600;
+ margin: 0 0 var(--spacing-400);
+ color: var(--s2-gray-900);
+}
+
+h3 {
+ font-size: 18px;
+ font-weight: 600;
+ margin: 0 0 var(--spacing-300);
+ color: var(--s2-gray-900);
+}
+
+p {
+ font-size: var(--s2-font-size-400);
+ color: var(--s2-gray-600);
+ margin: 0 0 var(--spacing-600);
+}
+
+/* ========== Status Panel ========== */
+
+.status-panel {
+ background: white;
+ border: 1px solid var(--s2-gray-200);
+ border-radius: var(--s2-radius-200);
+ padding: var(--spacing-500);
+ margin-bottom: var(--spacing-600);
+}
+
+.status-loading {
+ padding: var(--spacing-600);
+ text-align: center;
+ color: var(--s2-gray-600);
+ background: white;
+ border: 1px solid var(--s2-gray-200);
+ border-radius: var(--s2-radius-200);
+ margin-bottom: var(--spacing-600);
+}
+
+.status-grid {
+ display: grid;
+ grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
+ gap: var(--spacing-500);
+}
+
+.status-item {
+ display: flex;
+ flex-direction: column;
+ gap: var(--spacing-100);
+}
+
+.status-item label {
+ font-size: 11px;
+ font-weight: 600;
+ text-transform: uppercase;
+ color: var(--s2-gray-400);
+ letter-spacing: 1px;
+}
+
+.status-item span {
+ font-size: var(--s2-font-size-300);
+ color: var(--s2-gray-900);
+ font-weight: 500;
+}
+
+/* ========== Actions ========== */
+
+.actions {
+ margin-bottom: var(--spacing-600);
+}
+
+button,
+.button {
+ font-family: var(--body-font-family);
+ font-size: 15px;
+ font-weight: 700;
+ padding: 8px 24px;
+ line-height: 18px;
+ border: 2px solid #000;
+ color: #000;
+ border-radius: var(--s2-radius-300);
+ background: none;
+ cursor: pointer;
+ transition: all 0.2s;
+ text-align: center;
+}
+
+button:disabled {
+ background-color: #efefef;
+ border: 2px solid #efefef;
+ color: var(--s2-gray-700);
+ cursor: not-allowed;
+}
+
+button:hover:not(:disabled) {
+ background: var(--s2-gray-100);
+}
+
+button.accent,
+.btn-primary {
+ background: #3b63fb;
+ border: 2px solid #3b63fb;
+ color: #fff;
+}
+
+button.accent:hover:not(:disabled),
+.btn-primary:hover:not(:disabled) {
+ background: #2952e8;
+ border: 2px solid #2952e8;
+}
+
+/* ========== Progress Section ========== */
+
+.progress-section {
+ background: white;
+ border: 1px solid var(--s2-gray-200);
+ border-radius: var(--s2-radius-200);
+ padding: var(--spacing-500);
+ margin-bottom: var(--spacing-600);
+}
+
+.progress-bar {
+ width: 100%;
+ height: 8px;
+ background: var(--s2-gray-200);
+ border-radius: var(--s2-radius-100);
+ overflow: hidden;
+ margin-bottom: var(--spacing-300);
+}
+
+.progress-fill {
+ height: 100%;
+ background: #3b63fb;
+ transition: width 0.3s ease;
+}
+
+.progress-info {
+ display: flex;
+ gap: var(--spacing-300);
+ align-items: center;
+ font-size: var(--s2-font-size-200);
+}
+
+.progress-stage {
+ font-weight: 600;
+ color: var(--s2-gray-900);
+ text-transform: capitalize;
+}
+
+.progress-message {
+ color: var(--s2-gray-600);
+ flex: 1;
+}
+
+.progress-timing {
+ display: flex;
+ gap: var(--spacing-500);
+ margin-top: var(--spacing-200);
+ font-size: var(--s2-font-size-200);
+ color: var(--s2-gray-500);
+}
+
+/* ========== Logs Section ========== */
+
+.logs-section,
+.errors-section {
+ background: white;
+ border: 1px solid var(--s2-gray-200);
+ border-radius: var(--s2-radius-200);
+ padding: var(--spacing-500);
+ margin-bottom: var(--spacing-600);
+}
+
+.logs-list,
+.errors-list {
+ list-style: none;
+ margin: 0;
+ padding: 0;
+ max-height: 400px;
+ overflow-y: auto;
+}
+
+.logs-list li,
+.errors-list li {
+ padding: var(--spacing-200) var(--spacing-300);
+ margin-bottom: var(--spacing-100);
+ border-radius: var(--s2-radius-100);
+ font-size: 13px;
+ font-family: var(--mono-font-family);
+}
+
+.log-info {
+ background: var(--s2-gray-100);
+ color: var(--s2-gray-900);
+}
+
+.log-success {
+ background: var(--s2-green-100);
+ color: var(--s2-green-900);
+}
+
+.log-error,
+.errors-list li {
+ background: var(--s2-red-100);
+ color: var(--s2-red-700);
+}
+
+/* ========== Error Page ========== */
+
+.error {
+ background: white;
+ border: 1px solid var(--s2-gray-200);
+ border-radius: var(--s2-radius-200);
+ padding: var(--spacing-700);
+ text-align: center;
+}
+
+.error h1 {
+ color: var(--s2-red-700);
+}
+
+.error pre {
+ background: var(--s2-gray-100);
+ padding: var(--spacing-400);
+ border-radius: var(--s2-radius-100);
+ text-align: left;
+ display: inline-block;
+ margin-top: var(--spacing-400);
+ font-family: var(--mono-font-family);
+ font-size: 13px;
+}
diff --git a/tools/media/indexer.html b/tools/media/indexer.html
new file mode 100644
index 0000000..1c75061
--- /dev/null
+++ b/tools/media/indexer.html
@@ -0,0 +1,14 @@
+
+
+
+
+
+ Media Index Builder
+
+
+
+
+
+
+
+
diff --git a/tools/media/indexer.js b/tools/media/indexer.js
new file mode 100644
index 0000000..ede955e
--- /dev/null
+++ b/tools/media/indexer.js
@@ -0,0 +1,60 @@
+/* eslint-disable import/no-absolute-path, import/no-unresolved */
+/* The DA SDK is loaded from the da.live CDN and is required for authentication */
+import DA_SDK from 'https://da.live/nx/utils/sdk.js';
+
+import { state, org, repo } from './lib/config.js';
+import { getIndexStatus } from './lib/builder.js';
+import { render, attachEventListeners } from './lib/ui.js';
+
+/** Constants */
+const AUTH_TIMEOUT_MS = 5000; // Timeout for DA authentication
+
+/**
+ * Entry point for the modular indexer: validate org/repo (imported from
+ * config.js, which presumably validates the parameter format — confirm),
+ * resolve the DA SDK token racing AUTH_TIMEOUT_MS, redirect to da.live login
+ * when no token is obtained, then load index status and render the UI.
+ * NOTE(review): the error-page strings/template below appear corrupted in
+ * this patch (markup tags stripped) — restore before applying.
+ */
+async function init() {
+ if (!org || !repo) {
+ // Re-read raw params only to pick the right error message
+ const params = new URLSearchParams(window.location.search);
+ const rawOrg = params.get('org');
+ const rawRepo = params.get('repo') || params.get('site');
+
+ let errorMsg = 'Please provide valid org and repo parameters in the URL:
';
+ if (!rawOrg || !rawRepo) {
+ errorMsg += 'Missing required parameters.
';
+ } else {
+ errorMsg += 'Invalid parameter format. Names must be alphanumeric with optional hyphens, underscores, or dots.
';
+ }
+
+ document.getElementById('app').innerHTML = `
+
+
 Configuration Error
+ ${errorMsg}
+
 ?org=yourorg&repo=yourrepo
+
 Example: ?org=mycompany&repo=myproject
+
+ `;
+ return;
+ }
+
+ // Get DA token with timeout
+ try {
+ const tokenPromise = DA_SDK;
+ const timeoutPromise = new Promise((_, reject) => {
+ setTimeout(() => reject(new Error('Authentication timeout')), AUTH_TIMEOUT_MS);
+ });
+
+ const result = await Promise.race([tokenPromise, timeoutPromise]);
+ state.daToken = result?.token;
+ } catch (error) {
+ state.errors.push({ message: `Failed to get DA token: ${error.message}` });
+ }
+
+ // No token -> bounce through da.live login, preserving the return URL
+ if (!state.daToken) {
+ const returnUrl = encodeURIComponent(window.location.href);
+ window.location.href = `https://da.live/?returnUrl=${returnUrl}`;
+ return;
+ }
+
+ state.status = await getIndexStatus();
+ render();
+ attachEventListeners();
+}
+
+// Kick off on module load (fire-and-forget; errors surface via state.errors)
+init();
diff --git a/tools/media/lib/api.js b/tools/media/lib/api.js
new file mode 100644
index 0000000..f916f8c
--- /dev/null
+++ b/tools/media/lib/api.js
@@ -0,0 +1,314 @@
+/**
+ * DA Admin API functions for fetching logs and saving data
+ */
+
+import {
+ state, DA_ADMIN, org, repo, ref,
+} from './config.js';
+import * as logger from './logger.js';
+
+/** Constants */
+const RATE_LIMIT_DELAY_MS = 100; // Delay between paginated API requests
+// NOTE(review): RATE_LIMIT_DELAY_MS is not referenced anywhere in the visible
+// portion of this module — confirm the pagination loop actually applies it,
+// or remove it.
+
+/**
+ * Fetch with DA authentication token.
+ * NOTE(review): mutates the caller's opts object in place.
+ * @param {string} url - URL to fetch
+ * @param {object} [opts] - Fetch options (headers are added in place)
+ * @returns {Promise<Response>} Fetch response
+ */
+export async function daFetch(url, opts = {}) {
+ opts.headers ||= {};
+ if (state.daToken) {
+ opts.headers.Authorization = `Bearer ${state.daToken}`;
+ }
+ return fetch(url, opts);
+}
+
+/** CORS proxy for cross-origin fetches (same as media-library block) */
+const CORS_PROXY_URL = 'https://media-library-cors-proxy.aem-poc-lab.workers.dev/';
+
+/**
+ * Fetch with CORS proxy. Uses proxy first when cross-origin (e.g. localhost → aem.page)
+ * to avoid CORS errors; direct fetch when same-origin.
+ * NOTE(review): in the same-origin branch, ANY non-OK response (including a
+ * plain 404) and any 'Failed to fetch' network error is retried through the
+ * proxy — confirm that retrying non-CORS failures is intended.
+ * @param {string} url - URL to fetch
+ * @param {object} options - Fetch options
+ * @returns {Promise<Response>} Fetch response
+ */
+async function fetchWithCorsProxy(url, options = {}) {
+ const targetOrigin = url.startsWith('http') ? new URL(url).origin : null;
+ const isCrossOrigin = targetOrigin && window.location.origin !== targetOrigin;
+
+ if (isCrossOrigin) {
+ const proxyUrl = `${CORS_PROXY_URL}?url=${encodeURIComponent(url)}`;
+ return fetch(proxyUrl, options);
+ }
+
+ try {
+ const response = await fetch(url, options);
+ if (!response.ok) {
+ // Non-OK direct response: retry once via the proxy
+ const proxyUrl = `${CORS_PROXY_URL}?url=${encodeURIComponent(url)}`;
+ return fetch(proxyUrl, options);
+ }
+ return response;
+ } catch (directError) {
+ // Only CORS-shaped TypeErrors fall back to the proxy; other errors rethrow
+ if (directError.name === 'TypeError'
+ && (directError.message.includes('CORS')
+ || directError.message.includes('blocked')
+ || directError.message.includes('Access-Control-Allow-Origin')
+ || directError.message.includes('Failed to fetch'))) {
+ const proxyUrl = `${CORS_PROXY_URL}?url=${encodeURIComponent(url)}`;
+ return fetch(proxyUrl, options);
+ }
+ throw directError;
+ }
+}
+
+/**
+ * Load a metadata sheet from the DA source API.
+ * Returns the first data row when the response is in sheet format, the raw
+ * JSON object otherwise, or null on HTTP error / fetch failure (logged).
+ * @param {string} path - DA source path (e.g. /org/repo/.da/...)
+ * @returns {Promise<object|null>} Metadata row, raw object, or null
+ */
+export async function loadMeta(path) {
+ try {
+ const resp = await daFetch(`${DA_ADMIN}/source${path}`);
+ if (resp.ok) {
+ const data = await resp.json();
+ return data.data?.[0] || data;
+ }
+ } catch (error) {
+ logger.error(`Failed to load meta from ${path}:`, error.message);
+ return null;
+ }
+ return null;
+}
+
+/**
+ * Fetch page markdown from preview URL (org, repo from query params).
+ * Uses CORS proxy fallback when direct fetch fails (e.g. cross-origin).
+ * NOTE(review): non-OK responses and errors both return null — callers cannot
+ * distinguish "page missing" from "fetch failed" (failures are logged only).
+ * @param {string} pagePath - Path e.g. /drafts/page.md
+ * @returns {Promise} - Raw markdown or null
+ */
+export async function fetchPageMarkdown(pagePath) {
+ try {
+ if (!org || !repo) return null;
+ const path = pagePath.startsWith('/') ? pagePath : `/${pagePath}`;
+ const url = `https://${ref}--${repo}--${org}.aem.page${path}`;
+ const resp = await fetchWithCorsProxy(url);
+ if (!resp.ok) return null;
+ return resp.text();
+ } catch (error) {
+ logger.error(`Failed to fetch page markdown ${pagePath}:`, error.message);
+ return null;
+ }
+}
+
+/**
+ * Load media-index.json from DA (sheet format).
+ * NOTE(review): returns [] for both "index not yet created" and "load
+ * failed" — callers cannot tell the two apart.
+ * @param {string} path - Path to media-index.json
+ * @returns {Promise} - Array of index entries, or [] if not found
+ */
+export async function loadIndex(path) {
+ try {
+ const resp = await daFetch(`${DA_ADMIN}/source${path}`);
+ if (!resp.ok) return [];
+ const data = await resp.json();
+ const entries = data.data || [];
+ return Array.isArray(entries) ? entries : [];
+ } catch (error) {
+ logger.error(`Failed to load index from ${path}:`, error.message);
+ return [];
+ }
+}
+
+/**
+ * List children of a DA path using the DA Admin List API.
+ * Returns array of items; each item may have path, name, ext, props (with lastModified).
+ * Tolerates both response shapes: a bare array or { sources: [...] }.
+ * NOTE(review): non-OK responses return [] silently.
+ * @param {string} path - Path within org/repo (e.g. /.da/mediaindex)
+ * @returns {Promise>}
+ */
+export async function daList(path) {
+ const normalizedPath = path.replace(/^\//, '') || '';
+ const url = `${DA_ADMIN}/list/${org}/${repo}/${normalizedPath}`;
+ const resp = await daFetch(url);
+ if (!resp.ok) return [];
+ const data = await resp.json();
+ const items = Array.isArray(data) ? data : (data.sources || []);
+ return items;
+}
+
+/**
+ * Get media-index.json info from DA Admin List API (not Franklin Admin API).
+ * Uses DA List API since the index is stored in DA.
+ * @param {string} folderPath - Path to mediaindex folder within repo (e.g. .da/mediaindex)
+ * @returns {Promise<{exists: boolean, lastModified: number|null}>}
+ */
+export async function getMediaIndexInfo(folderPath = '.da/mediaindex') {
+ const items = await daList(folderPath);
+ const indexFile = items.find(
+ (item) => (item.name === 'media-index' && item.ext === 'json')
+ || (item.path && item.path.endsWith('/media-index.json')),
+ );
+ if (!indexFile) return { exists: false, lastModified: null };
+ // DA List API: lastModified is Unix timestamp (ms) on item (docs.da.live/developers/api/list)
+ // NOTE(review): the `!= null` check is redundant — typeof null is 'object',
+ // so `typeof lastMod === 'number'` alone suffices.
+ const lastMod = indexFile.lastModified ?? indexFile.props?.lastModified;
+ const ts = lastMod != null && typeof lastMod === 'number' ? lastMod : null;
+ return { exists: true, lastModified: ts };
+}
+
+/**
+ * Wrap an array of rows in DA sheet JSON ({ total, limit, offset, data,
+ * ':type' }) and package it as FormData for POSTing to the DA source API.
+ * @param {Array<object>} data - Sheet rows
+ * @param {string} [type] - Value for the ':type' marker (default 'sheet')
+ * @returns {Promise<FormData>} FormData with the JSON blob under key 'data'
+ */
+export async function createSheet(data, type = 'sheet') {
+ const sheetMeta = {
+ total: data.length,
+ limit: data.length,
+ offset: 0,
+ data,
+ ':type': type,
+ };
+ const blob = new Blob([JSON.stringify(sheetMeta, null, 2)], { type: 'application/json' });
+ const formData = new FormData();
+ formData.append('data', blob);
+ return formData;
+}
+
+/**
+ * Save metadata (single object or array of rows) as a sheet at a DA path.
+ * NOTE(review): the POST response is returned unchecked — callers should
+ * verify resp.ok if the write must succeed.
+ * @param {object|Array<object>} meta - Metadata row(s) to persist
+ * @param {string} path - DA source path to write to
+ * @returns {Promise<Response>} The POST response
+ */
+export async function saveMeta(meta, path) {
+ const metaArray = Array.isArray(meta) ? meta : [meta];
+ const formData = await createSheet(metaArray);
+ return daFetch(`${DA_ADMIN}/source${path}`, {
+ method: 'POST',
+ body: formData,
+ });
+}
+
+function timestampToDuration(timestamp) {
+ if (!timestamp) return '90d';
+ const ageMs = Date.now() - timestamp;
+ const days = Math.ceil(ageMs / (24 * 60 * 60 * 1000));
+ if (days < 1) {
+ const hours = Math.ceil(ageMs / (60 * 60 * 1000));
+ return hours > 0 ? `${hours}h` : '1h';
+ }
+ return `${Math.min(days, 90)}d`;
+}
+
+/**
+ * Fetch ALL entries from a Franklin Admin API log endpoint, following
+ * nextToken pagination recursively and accumulating every page in memory.
+ * For large histories prefer fetchFromAdminAPIStreaming, which does not
+ * accumulate.
+ * @param {string} endpoint - 'log' (auditlog) or 'medialog'
+ * @param {string} orgName - Organization name
+ * @param {string} repoName - Repository name
+ * @param {string} refName - Ref (e.g. 'main')
+ * @param {number|null} since - Epoch ms of last fetch; null requests max span
+ * @param {number} limit - Page size per request
+ * @param {Function} [onPageLoaded] - Called per page with (entries, hasMore)
+ * @returns {Promise<Array>} All entries across pages, in fetch order
+ * @throws {Error} Only when the FIRST page request fails; failures on later
+ *   pages silently stop pagination (best-effort partial result, see below).
+ */
+export async function fetchFromAdminAPI(
+  endpoint,
+  orgName,
+  repoName,
+  refName,
+  since,
+  limit,
+  onPageLoaded,
+) {
+  const fetchParams = new URLSearchParams();
+  fetchParams.append('limit', limit.toString());
+
+  // API default (no since) = from=now-15min, to=now. For initial index use max span.
+  const sinceDuration = since != null ? timestampToDuration(since) : '36500d';
+  fetchParams.append('since', sinceDuration);
+
+  const baseUrl = `https://admin.hlx.page/${endpoint}/${orgName}/${repoName}/${refName}`;
+  // The medialog endpoint is called with a trailing slash before the query
+  // string; the log endpoint is not.
+  const separator = endpoint === 'medialog' ? '/' : '';
+  const url = `${baseUrl}${separator}?${fetchParams.toString()}`;
+
+  const resp = await daFetch(url);
+
+  if (!resp.ok) {
+    throw new Error(`${endpoint} API error: ${resp.status} ${resp.statusText}`);
+  }
+
+  const data = await resp.json();
+  // Response shape varies: some responses use `entries`, others `data`.
+  const entries = data.entries || data.data || [];
+  const { nextToken } = data;
+
+  if (onPageLoaded && entries.length > 0) {
+    onPageLoaded(entries, !!nextToken);
+  }
+
+  // Pull remaining pages recursively via nextToken.
+  // NOTE(review): a non-ok response here returns [] — pagination stops early
+  // without surfacing the error, so callers may receive a truncated result.
+  async function fetchNextPage(token) {
+    if (!token) return [];
+
+    fetchParams.set('nextToken', token);
+    const nextUrl = `${baseUrl}${separator}?${fetchParams.toString()}`;
+    const nextResp = await daFetch(nextUrl);
+
+    if (!nextResp.ok) return [];
+
+    const nextData = await nextResp.json();
+    const nextEntries = nextData.entries || nextData.data || [];
+
+    if (onPageLoaded && nextEntries?.length > 0) {
+      onPageLoaded(nextEntries, !!nextData.nextToken);
+    }
+
+    const remainingEntries = nextData.nextToken
+      ? await fetchNextPage(nextData.nextToken)
+      : [];
+    return [...(nextEntries || []), ...remainingEntries];
+  }
+
+  const additionalEntries = await fetchNextPage(nextToken);
+  return [...entries, ...additionalEntries];
+}
+
+/** Delay helper for rate limiting */
+function sleep(ms) {
+ return new Promise((resolve) => {
+ setTimeout(resolve, ms);
+ });
+}
+
+/**
+ * Stream-fetch from an Admin API endpoint: each page is handed to onChunk
+ * and never accumulated, keeping memory flat for large histories.
+ * Pagination follows `links.next` when present, otherwise `nextToken`, and
+ * stops when neither is returned. A short pause (RATE_LIMIT_DELAY_MS, a
+ * module constant) separates page requests.
+ * @param {string} endpoint - 'log' or 'medialog'
+ * @param {string} orgName - Org
+ * @param {string} repoName - Repo
+ * @param {string} refName - Ref (e.g. 'main')
+ * @param {number|null} since - Timestamp for incremental, or null for full
+ * @param {number} limit - Page size
+ * @param {Function} onChunk - (entries: Array) => void|Promise - called per chunk
+ * @throws {Error} When ANY page request returns a non-ok status (unlike
+ *   fetchFromAdminAPI, mid-stream failures are not swallowed here).
+ */
+export async function fetchFromAdminAPIStreaming(
+  endpoint,
+  orgName,
+  repoName,
+  refName,
+  since,
+  limit,
+  onChunk,
+) {
+  const fetchParams = new URLSearchParams();
+  fetchParams.append('limit', limit.toString());
+  // API default (no since) covers only the last 15 min; '36500d' = max span.
+  const sinceDuration = since != null ? timestampToDuration(since) : '36500d';
+  fetchParams.append('since', sinceDuration);
+
+  const baseUrl = `https://admin.hlx.page/${endpoint}/${orgName}/${repoName}/${refName}`;
+  // medialog is called with a trailing slash before the query; log is not.
+  const separator = endpoint === 'medialog' ? '/' : '';
+  let nextUrl = `${baseUrl}${separator}?${fetchParams.toString()}`;
+
+  /* eslint-disable no-await-in-loop -- sequential fetch required for pagination */
+  while (nextUrl) {
+    const resp = await daFetch(nextUrl);
+
+    if (!resp.ok) {
+      throw new Error(`${endpoint} API error: ${resp.status} ${resp.statusText}`);
+    }
+
+    const data = await resp.json();
+    // Response shape varies: some responses use `entries`, others `data`.
+    const entries = data.entries || data.data || [];
+
+    // Chunk callback may be async (e.g. to apply backpressure); await it.
+    if (entries.length > 0 && onChunk) {
+      await onChunk(entries);
+    }
+
+    const nextLink = data.links?.next;
+    const token = data.nextToken;
+    logger.debug(`[${endpoint}] page: ${entries.length} entries | response keys: ${Object.keys(data).join(', ')} | nextToken=${token ?? 'null'} | links.next=${nextLink ?? 'null'}`);
+
+    // Prefer an explicit next link (absolute or relative) over nextToken.
+    if (nextLink && typeof nextLink === 'string' && nextLink.trim()) {
+      const base = `${baseUrl}${separator}`;
+      nextUrl = nextLink.startsWith('http') ? nextLink : new URL(nextLink, base).href;
+    } else if (token) {
+      fetchParams.set('nextToken', token);
+      nextUrl = `${baseUrl}${separator}?${fetchParams.toString()}`;
+    } else {
+      nextUrl = null;
+    }
+
+    // Pause between pages to avoid hammering the Admin API.
+    if (nextUrl) await sleep(RATE_LIMIT_DELAY_MS);
+  }
+  /* eslint-enable no-await-in-loop */
+}
diff --git a/tools/media/lib/builder.js b/tools/media/lib/builder.js
new file mode 100644
index 0000000..7457bbb
--- /dev/null
+++ b/tools/media/lib/builder.js
@@ -0,0 +1,880 @@
+/**
+ * Core index building logic
+ */
+
+import {
+ org, repo, ref, sitePath, DA_ADMIN,
+} from './config.js';
+import {
+ fetchFromAdminAPI, fetchFromAdminAPIStreaming, createSheet, daFetch, saveMeta, loadMeta,
+ loadIndex, getMediaIndexInfo, fetchPageMarkdown,
+} from './api.js';
+import {
+ normalizePath, isPage, extractName, detectMediaType,
+ isPdf, isSvg, isFragment, isPdfOrSvg, getFileType,
+ isLinkedContentPath, normalizeFilePath,
+ extractFragmentReferences, extractLinks, extractIconReferences,
+} from './helpers.js';
+import * as logger from './logger.js';
+
+/** Constants */
+// Tolerance (2 min) when checking that media-index.json's lastModified in DA
+// aligns with the meta sheet's lastFetchTime before allowing incremental runs.
+const INDEX_ALIGNMENT_TOLERANCE_MS = 120_000;
+// 5s window for matching media events to a preceding page preview (full build)
+const MEDIA_ASSOCIATION_WINDOW_MS = 5000;
+// 10s window for matching media to the latest preview (incremental updates)
+const INCREMENTAL_WINDOW_MS = 10000;
+// Default page size for Admin API requests
+const API_PAGE_SIZE = 1000;
+// Max concurrent page markdown fetches to avoid overwhelming browser/server
+const MAX_CONCURRENT_FETCHES = 10;
+
+export async function getIndexStatus() {
+ const metaPath = `${sitePath}/.da/mediaindex/medialog-meta.json`;
+ const meta = await loadMeta(metaPath);
+ const { exists: indexExists, lastModified: indexLastModified } = await getMediaIndexInfo('.da/mediaindex');
+
+ return {
+ lastRefresh: meta?.lastFetchTime || null,
+ entriesCount: meta?.entriesCount || 0,
+ lastBuildMode: meta?.lastBuildMode || null,
+ indexExists,
+ indexLastModified,
+ };
+}
+
+/**
+ * Determine if we can do incremental re-index instead of full build.
+ * Re-index when: meta has lastFetchTime, index exists, and index lastModified aligns with meta.
+ * @returns {Promise<{shouldReindex: boolean, reason?: string}>}
+ */
+export async function shouldReindex() {
+ const metaPath = `${sitePath}/.da/mediaindex/medialog-meta.json`;
+ const meta = await loadMeta(metaPath);
+ const { exists: indexExists, lastModified: indexLastModified } = await getMediaIndexInfo('.da/mediaindex');
+
+ if (!meta?.lastFetchTime) {
+ return { shouldReindex: false, reason: 'No previous fetch (meta missing lastFetchTime)' };
+ }
+ if (!indexExists) {
+ return { shouldReindex: false, reason: 'Index file does not exist in DA' };
+ }
+ if (indexLastModified == null) {
+ return { shouldReindex: false, reason: 'DA List API did not return lastModified for media-index.json' };
+ }
+
+ const lastFetch = meta.lastFetchTime;
+ const diff = Math.abs(lastFetch - indexLastModified);
+ if (diff > INDEX_ALIGNMENT_TOLERANCE_MS) {
+ return {
+ shouldReindex: false,
+ reason: `Index lastModified (${indexLastModified}) does not align with meta lastFetchTime (${lastFetch})`,
+ };
+ }
+
+ return { shouldReindex: true };
+}
+
+/**
+ * Execute async tasks with concurrency limit
+ * @param {Array} items - Items to process
+ * @param {Function} fn - Async function to execute per item
+ * @param {number} concurrency - Max concurrent operations
+ * @returns {Promise} Results in order
+ */
+async function processConcurrently(items, fn, concurrency) {
+ const results = [];
+ const executing = [];
+
+ for (let i = 0; i < items.length; i += 1) {
+ const item = items[i];
+ const promise = Promise.resolve().then(() => fn(item, i));
+ results.push(promise);
+
+ if (concurrency <= items.length) {
+ const executingPromise = promise.then(() => {
+ executing.splice(executing.indexOf(executingPromise), 1);
+ });
+ executing.push(executingPromise);
+
+ if (executing.length >= concurrency) {
+ // eslint-disable-next-line no-await-in-loop
+ await Promise.race(executing);
+ }
+ }
+ }
+
+ return Promise.all(results);
+}
+
+/**
+ * Build a usage map for linked content (PDFs, SVGs, fragments) by fetching
+ * each unique page's markdown and scanning it for link and icon references.
+ * Fetches run concurrently (capped at MAX_CONCURRENT_FETCHES); pages whose
+ * markdown cannot be fetched are logged and skipped.
+ * @param {Array<{path: string}>} pageEntries - Auditlog entries for pages
+ * @param {Function} onProgress - Progress callback ({ message })
+ * @returns {Promise<{pdfs: Map, svgs: Map, fragments: Map}>} Maps of
+ *   content path -> array of unique referencing page paths
+ */
+async function buildContentUsageMap(pageEntries, onProgress) {
+  const usageMap = {
+    fragments: new Map(),
+    pdfs: new Map(),
+    svgs: new Map(),
+  };
+
+  // Group events per normalized page path; only the unique keys are used.
+  const pagesByPath = new Map();
+  pageEntries.forEach((e) => {
+    const p = normalizePath(e.path);
+    if (!pagesByPath.has(p)) pagesByPath.set(p, []);
+    pagesByPath.get(p).push(e);
+  });
+  // NOTE(review): this newest-first sort's result appears unused below
+  // (only map keys are read) — confirm before removing.
+  pagesByPath.forEach((events) => {
+    events.sort((a, b) => b.timestamp - a.timestamp);
+  });
+
+  const uniquePages = [...pagesByPath.keys()];
+  logger.debug(`[buildContentUsageMap] parsing ${uniquePages.length} unique pages: [${uniquePages.slice(0, 10).join(', ')}${uniquePages.length > 10 ? '...' : ''}]`);
+
+  // Fetch each page's markdown with a concurrency cap.
+  const results = await processConcurrently(
+    uniquePages,
+    async (normalizedPath, i) => {
+      onProgress?.({ message: `Parsing page ${i + 1}/${uniquePages.length}: ${normalizedPath}` });
+      const md = await fetchPageMarkdown(normalizedPath);
+      return { normalizedPath, md };
+    },
+    MAX_CONCURRENT_FETCHES,
+  );
+
+  const failed = results.filter((r) => !r.md);
+  if (failed.length > 0) {
+    logger.warn(`[buildContentUsageMap] failed to fetch markdown for ${failed.length} pages: [${failed.map((r) => r.normalizedPath).join(', ')}]`);
+  }
+
+  results.forEach(({ normalizedPath, md }) => {
+    if (!md) return;
+
+    const fragments = extractFragmentReferences(md);
+    const pdfs = extractLinks(md, /\.pdf$/);
+    const svgs = extractLinks(md, /\.svg$/);
+    const icons = extractIconReferences(md);
+
+    // Record this page under each referenced path, deduplicating pages.
+    const addToMap = (map, path) => {
+      if (!map.has(path)) map.set(path, []);
+      if (!map.get(path).includes(normalizedPath)) {
+        map.get(path).push(normalizedPath);
+      }
+    };
+
+    fragments.forEach((f) => addToMap(usageMap.fragments, f));
+    pdfs.forEach((p) => addToMap(usageMap.pdfs, p));
+    svgs.forEach((s) => addToMap(usageMap.svgs, s));
+    // Icon references are SVGs too; they share the svgs map.
+    icons.forEach((s) => addToMap(usageMap.svgs, s));
+  });
+
+  const iconPathsFromUsage = [...usageMap.svgs.keys()].filter((p) => p.includes('/icons/'));
+  logger.debug(`[buildContentUsageMap] usageMap: pdfs=${usageMap.pdfs.size}, svgs=${usageMap.svgs.size}, fragments=${usageMap.fragments.size} | icon paths from parsing: [${iconPathsFromUsage.join(', ') || 'none'}]`);
+
+  return usageMap;
+}
+
+function noop() {}
+
+/**
+ * Find page events matching media within time window
+ * Matches media to page events that occurred BEFORE media timestamp within window
+ * Time window: MEDIA_ASSOCIATION_WINDOW_MS (5s for full build)
+ * Rationale: Media operations typically follow page preview within seconds
+ * Example: Page preview at T, media upload at T+2s → matched (within 5s window)
+ * Edge case: Media uploaded, then page previewed → not matched (preview must come first)
+ * @param {Map} pagesByPath - Map of normalized path to page events
+ * @param {string} resourcePath - Media resource path
+ * @param {number} mediaTimestamp - Media operation timestamp
+ * @returns {Array} Matching page events
+ */
+function findMatchingPageEvents(pagesByPath, resourcePath, mediaTimestamp) {
+ const events = pagesByPath.get(resourcePath);
+ if (!events || events.length === 0) return [];
+ const minTs = mediaTimestamp - MEDIA_ASSOCIATION_WINDOW_MS;
+ return events.filter(
+ (e) => e.timestamp <= mediaTimestamp && e.timestamp > minTs,
+ );
+}
+
+/** Check memory (Chrome/Edge); returns { warning, usedMB, limitMB } or { warning: false } */
+function checkMemory() {
+ if (typeof performance !== 'undefined' && performance.memory) {
+ const used = performance.memory.usedJSHeapSize / (1024 * 1024);
+ const limit = performance.memory.jsHeapSizeLimit / (1024 * 1024);
+ return { warning: used > limit * 0.8, usedMB: used, limitMB: limit };
+ }
+ return { warning: false };
+}
+
+/**
+ * Remove media entry from index; handle orphaned media
+ * Strategy: If removing last reference to a hash, mark as "unused" vs deleting
+ * Exception: Don't add "unused" if medialog has explicit "delete" for this hash
+ * Rationale: Media files persist in storage when unreferenced; track for cleanup
+ * Example: Media on 2 pages, remove from 1 → still referenced
+ * Remove from both → becomes "unused"
+ * @param {Array} idx - Index array
+ * @param {object} entry - Entry to remove
+ * @param {string} path - Page path
+ * @param {Array} medialog - Medialog entries for delete detection
+ * @returns {number} removed count (0 or 1)
+ */
+function removeMediaMaybeAddOrphan(idx, entry, path, medialog) {
+ const i = idx.findIndex((e) => e.hash === entry.hash && e.page === path);
+ if (i === -1) return 0;
+ const { hash } = entry;
+ const hasDelete = medialog.some((m) => m.mediaHash === hash && m.operation === 'delete');
+ idx.splice(i, 1);
+ const stillHasEntry = idx.some((e) => e.hash === hash);
+ const alreadyUnused = idx.some((e) => e.hash === hash && !e.page);
+ if (!stillHasEntry && !hasDelete && !alreadyUnused) {
+ idx.push({
+ hash,
+ page: '',
+ url: entry.url,
+ name: entry.name,
+ timestamp: entry.timestamp,
+ user: entry.user,
+ operation: entry.operation,
+ type: entry.type,
+ status: 'unused',
+ });
+ }
+ return 1;
+}
+
+/**
+ * Create a linked-content index entry using the same schema as media entries
+ * so the DA sheet stores all rows correctly (no column misalignment).
+ * @param {string} filePath - Path e.g. /icons/headset.svg
+ * @param {string[]} linkedPages - Pages that reference this file
+ * @param {{timestamp: number, user?: string}} fileEvent - Auditlog event
+ * @param {string} status - 'referenced' or 'file-unused'
+ * @returns {object} Entry matching media schema (hash, page, url, name, etc.)
+ */
+function toLinkedContentEntry(filePath, linkedPages, fileEvent, status) {
+ const pageVal = linkedPages.length > 0 ? linkedPages.join(',') : '';
+ return {
+ hash: filePath,
+ page: pageVal,
+ url: '',
+ name: filePath.split('/').pop() || filePath,
+ timestamp: fileEvent.timestamp,
+ user: fileEvent.user || '',
+ operation: 'auditlog-parsed',
+ type: getFileType(filePath),
+ status,
+ source: 'auditlog-parsed',
+ };
+}
+
+/**
+ * Reconcile per-page media for incremental indexing.
+ * For each page, compares index entries with medialog events that fall inside
+ * INCREMENTAL_WINDOW_MS after the page's LATEST preview, then applies the
+ * diff: removes stale references, adds new ones, refreshes timestamps of
+ * unchanged ones.
+ * @param {Array} updatedIndex - Index being built (mutated in place)
+ * @param {Map} pagesByPath - Page path -> events, sorted newest-first
+ * @param {Array} medialogEntries - New medialog entries since last fetch
+ * @param {Function} onLog - Logging callback for per-page diagnostics
+ * @returns {{added: number, removed: number}} Counts of applied changes
+ */
+function processPageMediaUpdates(updatedIndex, pagesByPath, medialogEntries, onLog) {
+  let added = 0;
+  let removed = 0;
+
+  pagesByPath.forEach((pageEvents, normalizedPath) => {
+    // pageEvents is sorted newest-first; only the latest preview is used.
+    const latestEvent = pageEvents[0];
+    const latestTs = latestEvent.timestamp;
+    const windowStart = latestTs;
+    const windowEnd = latestTs + INCREMENTAL_WINDOW_MS;
+
+    onLog(`--- Page: ${normalizedPath} ---`);
+    onLog(` Latest preview: ${latestTs} (${new Date(latestTs).toISOString()})`);
+    onLog(` Window: [${windowStart}-${windowEnd}] (${INCREMENTAL_WINDOW_MS / 1000}s)`);
+
+    // Split this page's medialog into in-window (counted) and outside (logged).
+    const matchesPage = (m) => m.resourcePath && m.resourcePath === normalizedPath;
+    const pageMedialogAll = medialogEntries.filter(matchesPage);
+    const inWindow = (m) => m.timestamp >= windowStart && m.timestamp < windowEnd;
+    const newPageMedia = pageMedialogAll.filter(inWindow);
+    const outsideWindow = pageMedialogAll.filter((m) => !newPageMedia.includes(m));
+
+    if (pageMedialogAll.length > 0) {
+      onLog(` Medialog for page: ${pageMedialogAll.length} total, ${newPageMedia.length} in window, ${outsideWindow.length} outside`);
+      if (outsideWindow.length > 0) {
+        outsideWindow.slice(0, 3).forEach((m) => {
+          onLog(` Outside: hash=${m.mediaHash} ts=${m.timestamp} (${new Date(m.timestamp).toISOString()})`);
+        });
+      }
+    }
+
+    const oldPageEntries = updatedIndex.filter((e) => e.page === normalizedPath);
+    const oldHashes = new Set(oldPageEntries.map((e) => e.hash));
+    const newHashes = new Set(newPageMedia.map((m) => m.mediaHash));
+
+    onLog(` Old (index): ${oldHashes.size} hashes ${[...oldHashes].slice(0, 5).join(', ')}${oldHashes.size > 5 ? '...' : ''}`);
+    onLog(` New (medialog in window): ${newHashes.size} hashes ${[...newHashes].slice(0, 5).join(', ')}${newHashes.size > 5 ? '...' : ''}`);
+
+    /**
+     * Edge case: Page was previewed but no media in the time window
+     * Scenario: User previewed page, removed all media, then previewed again
+     * Decision: Remove all old media entries for this page (assume removal intended)
+     * Alternative considered: Keep old entries (assume no change)
+     * Rationale: Preview action signals intent to update; empty medialog = intentional removal
+     * Assumption: Events are processed in timestamp order
+     */
+    if (newPageMedia.length === 0 && oldPageEntries.length > 0) {
+      onLog(' Edge case: Page previewed with no media in window - removing old entries');
+      const rm = removeMediaMaybeAddOrphan;
+      oldPageEntries.forEach((oldEntry) => {
+        removed += rm(updatedIndex, oldEntry, normalizedPath, medialogEntries);
+      });
+      return;
+    }
+
+    // Three-way diff of old vs new hash sets for this page.
+    const toRemove = [...oldHashes].filter((h) => !newHashes.has(h));
+    const toAdd = [...newHashes].filter((h) => !oldHashes.has(h));
+    const unchanged = [...newHashes].filter((h) => oldHashes.has(h));
+
+    if (toRemove.length || toAdd.length) {
+      onLog(` Diff: remove ${toRemove.length} (${toRemove.slice(0, 3).join(', ')}${toRemove.length > 3 ? '...' : ''}), add ${toAdd.length}`);
+    }
+
+    const rm = removeMediaMaybeAddOrphan;
+    toRemove.forEach((hash) => {
+      const oldEntry = oldPageEntries.find((e) => e.hash === hash);
+      if (oldEntry) {
+        removed += rm(updatedIndex, oldEntry, normalizedPath, medialogEntries);
+      }
+    });
+
+    toAdd.forEach((hash) => {
+      const media = newPageMedia.find((m) => m.mediaHash === hash);
+      if (media) {
+        updatedIndex.push({
+          hash: media.mediaHash,
+          page: normalizedPath,
+          url: media.path,
+          name: extractName(media),
+          timestamp: media.timestamp,
+          user: media.user,
+          operation: media.operation,
+          type: detectMediaType(media),
+          status: 'referenced',
+        });
+        added += 1;
+      }
+    });
+
+    // Refresh timestamps of entries present in both old index and new window.
+    unchanged.forEach((hash) => {
+      const idx = updatedIndex.findIndex((e) => e.hash === hash && e.page === normalizedPath);
+      const media = newPageMedia.find((m) => m.mediaHash === hash);
+      if (idx !== -1 && media) {
+        updatedIndex[idx].timestamp = media.timestamp;
+      }
+    });
+  });
+
+  return { added, removed };
+}
+
+/**
+ * Process standalone media uploads (no page association)
+ * @param {Array} updatedIndex - Index being built (mutated)
+ * @param {Array} medialogEntries - New medialog entries
+ * @param {Set} referencedHashes - Already referenced media hashes
+ * @returns {number} Added count
+ */
+function processStandaloneUploads(updatedIndex, medialogEntries, referencedHashes) {
+ let added = 0;
+ const standaloneUploads = medialogEntries.filter((m) => !m.resourcePath && m.originalFilename);
+
+ standaloneUploads.forEach((media) => {
+ if (!referencedHashes.has(media.mediaHash)) {
+ const exists = updatedIndex.some((e) => e.hash === media.mediaHash && !e.page);
+ if (!exists) {
+ updatedIndex.push({
+ hash: media.mediaHash,
+ page: '',
+ url: media.path,
+ name: media.originalFilename.split('/').pop(),
+ timestamp: media.timestamp,
+ user: media.user,
+ operation: media.operation,
+ type: detectMediaType(media),
+ status: 'unused',
+ });
+ added += 1;
+ }
+ }
+ });
+
+ return added;
+}
+
+/**
+ * Reconcile linked content (PDFs, SVGs, fragments) during an incremental run.
+ * Removes rows whose latest auditlog event is a DELETE, re-parses the changed
+ * pages to rebuild usage, then upserts one row per surviving linked path.
+ * @param {Array} updatedIndex - Index being built (mutated in place)
+ * @param {Array} files - Non-page auditlog events since last fetch
+ * @param {Array} pages - Page auditlog events since last fetch
+ * @param {Function} onProgress - Progress callback
+ * @param {Function} onLog - Log callback
+ * @returns {Promise<{added: number, removed: number}>} Row change counts
+ */
+async function processLinkedContentIncremental(updatedIndex, files, pages, onProgress, onLog) {
+  let added = 0;
+  let removed = 0;
+
+  // Keep only the LATEST event per linked-content path (PDF/SVG/fragment).
+  const filesByPath = new Map();
+  files.forEach((e) => {
+    if (!isPdfOrSvg(e.path) && !isFragment(e.path)) return;
+    const p = e.path;
+    const existing = filesByPath.get(p);
+    if (!existing || e.timestamp > existing.timestamp) filesByPath.set(p, e);
+  });
+
+  // A path counts as deleted only when its latest event is a DELETE.
+  const deletedPaths = new Set();
+  filesByPath.forEach((event, path) => {
+    if (event.method === 'DELETE') deletedPaths.add(path);
+  });
+
+  // Remove deleted linked content
+  deletedPaths.forEach((path) => {
+    const idx = updatedIndex.findIndex(
+      (e) => (e.operation === 'auditlog-parsed' || e.source === 'auditlog-parsed') && e.hash === path,
+    );
+    if (idx !== -1) {
+      updatedIndex.splice(idx, 1);
+      removed += 1;
+      onLog(`Removed linked content (DELETE): ${path}`);
+    }
+  });
+
+  // Build usage map
+  onProgress({ stage: 'processing', message: 'Building usage map for linked content...', percent: 83 });
+  const usageMap = await buildContentUsageMap(pages, (p) => onProgress(p));
+
+  // Candidate set = files seen in auditlog plus every path found by parsing.
+  const allLinkedPaths = new Set(filesByPath.keys());
+  ['pdfs', 'svgs', 'fragments'].forEach((key) => {
+    usageMap[key]?.forEach((_, path) => allLinkedPaths.add(path));
+  });
+
+  // Add existing linked content paths whose pages were parsed — their usage
+  // may have changed even though the file itself did not.
+  const parsedPages = new Set(pages.map((p) => normalizePath(p.path)));
+  updatedIndex.forEach((e) => {
+    const isLinkedContent = e.operation === 'auditlog-parsed' || e.source === 'auditlog-parsed';
+    if (!isLinkedContent) return;
+    const entryPages = (e.page || '').split(',').map((p) => p.trim()).filter(Boolean);
+    if (entryPages.some((p) => parsedPages.has(p))) {
+      allLinkedPaths.add(e.hash);
+    }
+  });
+
+  // Upsert one row per surviving linked path with fresh usage and status.
+  allLinkedPaths.forEach((filePath) => {
+    if (deletedPaths.has(filePath)) return;
+
+    let key = 'fragments';
+    if (isPdf(filePath)) key = 'pdfs';
+    else if (isSvg(filePath)) key = 'svgs';
+    const linkedPages = usageMap[key]?.get(filePath) || [];
+    const status = linkedPages.length > 0 ? 'referenced' : 'file-unused';
+    // Paths discovered only by parsing have no auditlog event; use timestamp 0.
+    const fileEvent = filesByPath.get(filePath) || { timestamp: 0, user: '' };
+
+    const isLinked = (e) => (e.operation === 'auditlog-parsed' || e.source === 'auditlog-parsed')
+      && e.hash === filePath;
+    const existingIdx = updatedIndex.findIndex(isLinked);
+
+    if (existingIdx !== -1) {
+      updatedIndex[existingIdx].page = linkedPages.length > 0 ? linkedPages.join(',') : '';
+      updatedIndex[existingIdx].timestamp = fileEvent.timestamp;
+      updatedIndex[existingIdx].status = status;
+    } else {
+      updatedIndex.push(toLinkedContentEntry(filePath, linkedPages, fileEvent, status));
+      added += 1;
+    }
+  });
+
+  return { added, removed };
+}
+
+/**
+ * Incremental re-index: fetch auditlog + medialog entries since the meta
+ * sheet's lastFetchTime, merge them into the existing index, and save the
+ * result plus refreshed metadata back to DA.
+ * Detects additions, removals, and updates per page, then standalone uploads,
+ * then linked content (PDFs/SVGs/fragments).
+ * @param {Function} onProgress - Progress callback ({ stage, message, percent })
+ * @param {Function} [onLog] - Optional debug log callback for per-page details
+ * @returns {Promise<Array>} The updated index (also persisted to DA)
+ * @throws {Error} When meta has no lastFetchTime (incremental not possible)
+ */
+export async function buildIncrementalIndex(onProgress, onLog = noop) {
+  const metaPath = `${sitePath}/.da/mediaindex/medialog-meta.json`;
+  const indexPath = `${sitePath}/.da/mediaindex/media-index.json`;
+  const meta = await loadMeta(metaPath);
+  const lastFetchTime = meta?.lastFetchTime;
+
+  if (!lastFetchTime) {
+    throw new Error('Cannot run incremental: meta missing lastFetchTime');
+  }
+
+  onLog(`lastFetchTime: ${lastFetchTime} (${new Date(lastFetchTime).toISOString()})`);
+  onProgress({
+    stage: 'starting',
+    message: 'Mode: Incremental re-index (since last build)',
+    percent: 5,
+  });
+
+  onProgress({ stage: 'loading', message: 'Loading existing index...', percent: 8 });
+  const existingIndex = await loadIndex(indexPath);
+
+  onLog(`Fetching auditlog since ${new Date(lastFetchTime).toISOString()}`);
+  onProgress({ stage: 'fetching', message: 'Fetching new auditlog entries...', percent: 15 });
+  const auditlogEntries = await fetchFromAdminAPI('log', org, repo, ref, lastFetchTime, API_PAGE_SIZE, (entries, hasMore) => {
+    onProgress({
+      stage: 'fetching',
+      message: `Fetched ${entries.length} auditlog entries${hasMore ? ' (more available)' : ''}...`,
+      percent: 25,
+    });
+  });
+
+  // Only preview-route events with a path are considered.
+  const validEntries = auditlogEntries.filter((e) => e && e.path && e.route === 'preview');
+  const pages = validEntries.filter((e) => isPage(e.path));
+
+  onProgress({ stage: 'fetching', message: 'Fetching new medialog entries...', percent: 35 });
+  const medialogEntries = await fetchFromAdminAPI('medialog', org, repo, ref, lastFetchTime, API_PAGE_SIZE, (entries, hasMore) => {
+    onProgress({
+      stage: 'fetching',
+      message: `Fetched ${entries.length} medialog entries${hasMore ? ' (more available)' : ''}...`,
+      percent: 45,
+    });
+  });
+
+  // Fast path: nothing happened since the last build, keep index untouched.
+  if (pages.length === 0 && medialogEntries.length === 0) {
+    onProgress({
+      stage: 'complete',
+      message: 'No new activity since last build - index unchanged',
+      percent: 100,
+    });
+    return existingIndex;
+  }
+
+  onLog(`Auditlog: ${auditlogEntries.length} entries, ${pages.length} pages`);
+  onLog(`Medialog: ${medialogEntries.length} entries (all since lastFetchTime)`);
+  onProgress({
+    stage: 'processing',
+    message: `Processing ${pages.length} pages with ${medialogEntries.length} medialog entries...`,
+    percent: 55,
+  });
+
+  // Work on a shallow copy; entry objects are shared with existingIndex.
+  const updatedIndex = [...existingIndex];
+
+  const pagesByPath = new Map();
+  pages.forEach((e) => {
+    const p = normalizePath(e.path);
+    if (!pagesByPath.has(p)) pagesByPath.set(p, []);
+    pagesByPath.get(p).push(e);
+  });
+
+  /**
+   * Indexing strategy for multiple preview events per page
+   * Rule: Process only the LATEST preview event per page, skip others
+   * Rationale: Latest preview represents current state; earlier previews are superseded
+   * Example: Page previewed at T1, T2, T3 → only process T3's media associations
+   * Trade-off: Simpler logic, potential to miss media if window misaligned (acceptable)
+   */
+  pagesByPath.forEach((events) => {
+    events.sort((a, b) => b.timestamp - a.timestamp);
+  });
+  onLog(`Time window: ${INCREMENTAL_WINDOW_MS / 1000}s (medialog within window of latest preview)`);
+  onLog(`Pages to process: ${pagesByPath.size} (${[...pagesByPath.keys()].join(', ')})`);
+  onLog(`Medialog entries since lastFetch: ${medialogEntries.length}`);
+
+  // Process page-level media updates
+  const pageResults = processPageMediaUpdates(updatedIndex, pagesByPath, medialogEntries, onLog);
+  let { added, removed } = pageResults;
+
+  // Calculate referenced hashes for standalone upload processing
+  // NOTE(review): flatMap on a scalar hash behaves like map here; plain
+  // .map((e) => e.hash) would be the idiomatic equivalent.
+  const referencedHashes = new Set(
+    updatedIndex.filter((e) => e.page).flatMap((e) => e.hash),
+  );
+
+  // Process standalone uploads
+  const standaloneAdded = processStandaloneUploads(updatedIndex, medialogEntries, referencedHashes);
+  added += standaloneAdded;
+
+  // Process linked content (non-page auditlog events: PDFs, SVGs, fragments)
+  const files = validEntries.filter((e) => !isPage(e.path));
+  const linkedResults = await processLinkedContentIncremental(
+    updatedIndex,
+    files,
+    pages,
+    onProgress,
+    onLog,
+  );
+  added += linkedResults.added;
+  removed += linkedResults.removed;
+
+  onProgress({
+    stage: 'processing',
+    message: `Incremental: +${added} added, -${removed} removed, total: ${updatedIndex.length}`,
+    percent: 85,
+  });
+
+  onProgress({ stage: 'saving', message: `Saving ${updatedIndex.length} entries...`, percent: 90 });
+
+  // Persist the index sheet, then the meta sheet recording this build.
+  const formData = await createSheet(updatedIndex);
+  await daFetch(`${DA_ADMIN}/source${indexPath}`, {
+    method: 'POST',
+    body: formData,
+  });
+
+  await saveMeta({
+    lastFetchTime: Date.now(),
+    entriesCount: updatedIndex.length,
+    lastRefreshBy: 'media-indexer',
+    lastBuildMode: 'incremental',
+  }, metaPath);
+
+  onProgress({
+    stage: 'complete',
+    message: `Incremental complete! ${updatedIndex.length} entries (${added} added, ${removed} removed)`,
+    percent: 100,
+  });
+
+  return updatedIndex;
+}
+
+export async function buildInitialIndex(onProgress) {
+ const index = [];
+ const buildMode = 'full'; // incremental not yet implemented
+
+ onProgress({
+ stage: 'starting',
+ message: 'Mode: Full build (rebuilding from auditlog + medialog)',
+ percent: 5,
+ });
+
+ // Phase 1: Stream auditlog, build maps (no full accumulation)
+ onProgress({ stage: 'fetching', message: 'Fetching auditlog (streaming)...', percent: 10 });
+
+ const pagesByPath = new Map(); // normalizedPath -> [events] sorted desc
+ const filesByPath = new Map(); // path -> latest event
+ const deletedPaths = new Set();
+ let auditlogCount = 0;
+
+ await fetchFromAdminAPIStreaming('log', org, repo, ref, null, API_PAGE_SIZE, (chunk) => {
+ const rawCount = chunk.length;
+ const droppedNoPath = chunk.filter((e) => !e?.path).length;
+ const droppedRoute = chunk.filter((e) => e?.path && e.route !== 'preview').length;
+ if (droppedNoPath > 0 || droppedRoute > 0) {
+ logger.debug(`[auditlog chunk] raw=${rawCount}, dropped(no path)=${droppedNoPath}, dropped(route!==preview)=${droppedRoute}`);
+ }
+ chunk.forEach((e) => {
+ if (!e?.path || e.route !== 'preview') return;
+ auditlogCount += 1;
+ if (isPage(e.path)) {
+ const p = normalizePath(e.path);
+ if (!pagesByPath.has(p)) pagesByPath.set(p, []);
+ pagesByPath.get(p).push(e);
+ } else {
+ const fp = normalizeFilePath(e.path);
+ const existing = filesByPath.get(fp);
+ if (!existing || e.timestamp > existing.timestamp) {
+ filesByPath.set(fp, e);
+ }
+ }
+ });
+ onProgress({
+ stage: 'fetching',
+ message: `Auditlog: ${auditlogCount} entries, ${pagesByPath.size} pages...`,
+ percent: 15,
+ });
+ });
+
+ pagesByPath.forEach((events) => events.sort((a, b) => b.timestamp - a.timestamp));
+
+ const pages = [];
+ pagesByPath.forEach((events) => pages.push(...events));
+
+ /**
+ * Deletion detection strategy: Only mark as deleted if LATEST event is DELETE
+ * Rationale: If a file was deleted then re-added, the latest event reflects current state
+ * Assumption: filesByPath contains only the latest event per path (maintained above)
+ * Example timeline: DELETE at T1, POST at T2 → latest=POST → not deleted (correct)
+ */
+ filesByPath.forEach((event, path) => {
+ if (isLinkedContentPath(path) && event.method === 'DELETE') {
+ deletedPaths.add(path);
+ }
+ });
+
+ const iconPathsFromAuditlog = [...filesByPath.keys()].filter((p) => p.includes('/icons/'));
+ const iconPathsInDeleted = [...deletedPaths].filter((p) => p.includes('/icons/'));
+ logger.debug(`[auditlog done] total=${auditlogCount}, pages=${pagesByPath.size}, files=${filesByPath.size}, deleted=${deletedPaths.size}`);
+ logger.debug(` icon paths from auditlog: [${iconPathsFromAuditlog.join(', ') || 'none'}]`);
+ logger.debug(` icon paths in deletedPaths: [${iconPathsInDeleted.join(', ') || 'none'}]`);
+
+ onProgress({
+ stage: 'fetching',
+ message: `Identified ${pages.length} page events, ${filesByPath.size} files`,
+ percent: 25,
+ });
+
+ // Phase 2: Stream medialog, process each chunk (no full accumulation)
+ onProgress({ stage: 'fetching', message: 'Fetching medialog (streaming)...', percent: 30 });
+
+ const entryMap = new Map();
+ const referencedHashes = new Set();
+ const standaloneBuffer = [];
+ let medialogCount = 0;
+
+ await fetchFromAdminAPIStreaming('medialog', org, repo, ref, null, API_PAGE_SIZE, (chunk) => {
+ logger.debug(`[medialog chunk] ${chunk.length} entries`);
+ chunk.forEach((media) => {
+ medialogCount += 1;
+ if (media.resourcePath) {
+ const matches = findMatchingPageEvents(pagesByPath, media.resourcePath, media.timestamp);
+ matches.forEach((pageEvent) => {
+ const normalizedPath = normalizePath(pageEvent.path);
+ const hash = media.mediaHash;
+ const key = `${hash}|${normalizedPath}`;
+ const existing = entryMap.get(key);
+ if (!existing || media.timestamp > existing.timestamp) {
+ entryMap.set(key, {
+ hash,
+ page: normalizedPath,
+ url: media.path,
+ name: extractName(media),
+ timestamp: media.timestamp,
+ user: media.user,
+ operation: media.operation,
+ type: detectMediaType(media),
+ status: 'referenced',
+ });
+ }
+ referencedHashes.add(hash);
+ });
+ } else if (media.originalFilename) {
+ standaloneBuffer.push(media);
+ }
+ });
+ const mem = checkMemory();
+ if (mem.warning) {
+ onProgress({
+ stage: 'processing',
+ message: `Memory: ${mem.usedMB.toFixed(0)}MB / ${mem.limitMB.toFixed(0)}MB`,
+ percent: 35,
+ });
+ } else {
+ onProgress({
+ stage: 'fetching',
+ message: `Medialog: ${medialogCount} entries processed...`,
+ percent: 35,
+ });
+ }
+ });
+
+ onProgress({
+ stage: 'processing',
+ message: `Processed ${medialogCount} medialog, ${entryMap.size} page refs`,
+ percent: 60,
+ });
+
+ // Phase 3: Process standalone uploads
+ standaloneBuffer.forEach((media) => {
+ const hash = media.mediaHash;
+ if (!referencedHashes.has(hash)) {
+ const key = `${hash}|`;
+ const existing = entryMap.get(key);
+ if (!existing || media.timestamp > existing.timestamp) {
+ entryMap.set(key, {
+ hash,
+ page: '',
+ url: media.path,
+ name: media.originalFilename.split('/').pop(),
+ timestamp: media.timestamp,
+ user: media.user,
+ operation: media.operation,
+ type: detectMediaType(media),
+ status: 'unused',
+ });
+ }
+ }
+ });
+
+ onProgress({
+ stage: 'processing',
+ message: `Standalone: ${standaloneBuffer.length}, total: ${entryMap.size}`,
+ percent: 70,
+ });
+
+ // Convert Map to array
+ entryMap.forEach((entry) => {
+ index.push(entry);
+ });
+
+ // Phase 5: Linked content (PDFs, SVGs, fragments) - parse pages for usage
+ onProgress({ stage: 'processing', message: 'Building content usage map (parsing pages)...', percent: 78 });
+ const usageMap = await buildContentUsageMap(pages, (p) => onProgress(p));
+
+ const linkedFilesByPath = new Map();
+ filesByPath.forEach((e, p) => {
+ if (!isPdfOrSvg(p) && !isFragment(p)) return;
+ linkedFilesByPath.set(p, e);
+ });
+
+ const usageKey = (path) => {
+ if (isPdf(path)) return 'pdfs';
+ if (isSvg(path)) return 'svgs';
+ return 'fragments';
+ };
+
+ const allLinkedPaths = new Set(linkedFilesByPath.keys());
+ ['pdfs', 'svgs', 'fragments'].forEach((key) => {
+ usageMap[key]?.forEach((_, path) => allLinkedPaths.add(path));
+ });
+
+ const iconPathsInAllLinked = [...allLinkedPaths].filter((p) => p.includes('/icons/'));
+ logger.debug(`[linked content] linkedFilesByPath=${linkedFilesByPath.size}, allLinkedPaths=${allLinkedPaths.size} (after merge with usageMap) | icon paths: [${iconPathsInAllLinked.join(', ') || 'none'}]`);
+
+ allLinkedPaths.forEach((filePath) => {
+ if (deletedPaths.has(filePath)) {
+ if (filePath.includes('/icons/')) {
+ logger.debug(`[linked content] SKIP (in deletedPaths): ${filePath}`);
+ }
+ return;
+ }
+ const key = usageKey(filePath);
+ const linkedPages = usageMap[key]?.get(filePath) || [];
+ const status = linkedPages.length > 0 ? 'referenced' : 'file-unused';
+ const fileEvent = linkedFilesByPath.get(filePath) || { timestamp: 0, user: '' };
+ index.push(toLinkedContentEntry(filePath, linkedPages, fileEvent, status));
+ });
+
+ const linkedContentCount = index.length - entryMap.size;
+ const iconEntriesInIndex = index.filter((e) => e.hash?.includes?.('/icons/'));
+ logger.debug(`[full build done] media=${entryMap.size}, linked content=${linkedContentCount}, total=${index.length} | icon entries in index: [${iconEntriesInIndex.map((e) => e.hash).join(', ') || 'none'}]`);
+
+ onProgress({
+ stage: 'processing',
+ message: `Added ${allLinkedPaths.size} linked content entries (PDFs, SVGs, fragments)`,
+ percent: 82,
+ });
+
+ onProgress({ stage: 'saving', message: `Saving ${index.length} entries...`, percent: 90 });
+
+ const indexPath = `${sitePath}/.da/mediaindex/media-index.json`;
+ const formData = await createSheet(index);
+ await daFetch(`${DA_ADMIN}/source${indexPath}`, {
+ method: 'POST',
+ body: formData,
+ });
+
+ await saveMeta({
+ lastFetchTime: Date.now(),
+ entriesCount: index.length,
+ lastRefreshBy: 'media-indexer',
+ lastBuildMode: buildMode,
+ }, `${sitePath}/.da/mediaindex/medialog-meta.json`);
+
+ onProgress({ stage: 'complete', message: `Complete! ${index.length} entries indexed`, percent: 100 });
+
+ return index;
+}
diff --git a/tools/media/lib/config.js b/tools/media/lib/config.js
new file mode 100644
index 0000000..621094a
--- /dev/null
+++ b/tools/media/lib/config.js
@@ -0,0 +1,39 @@
+/**
+ * Configuration and state management for media indexer
+ */
+
/**
 * Validate a GitHub org/repo name to guard against injection via URL params.
 * Accepts standard GitHub naming: alphanumeric plus dots, hyphens and
 * underscores, with alphanumeric first/last characters, max 100 chars.
 * @param {string} name - Candidate org or repo name
 * @returns {string|null} The name unchanged when valid, otherwise null
 */
function validateGitHubName(name) {
  if (typeof name !== 'string' || name.length === 0) return null;
  // 1 leading + up to 98 middle + 1 trailing character = 100-char maximum.
  const GITHUB_NAME_RE = /^[a-zA-Z0-9]([a-zA-Z0-9._-]{0,98}[a-zA-Z0-9])?$/;
  if (GITHUB_NAME_RE.test(name)) return name;
  return null;
}
+
// Parse URL parameters
// The tool is driven entirely by query params: ?org=...&repo=... (`site` is
// accepted as an alias for `repo`).
const params = new URLSearchParams(window.location.search);
const rawOrg = params.get('org');
const rawRepo = params.get('repo') || params.get('site');

// org/repo are null when the param is missing or fails validation;
// consumers must handle the null case (sitePath below does).
export const org = validateGitHubName(rawOrg);
export const repo = validateGitHubName(rawRepo);
// Branch/ref is fixed to main; no URL param overrides it.
export const ref = 'main';
// Base DA path for this site, e.g. "/myorg/myrepo"; null when org/repo invalid.
export const sitePath = org && repo ? `/${org}/${repo}` : null;

// Base URL for the DA Admin API.
export const DA_ADMIN = 'https://admin.da.live';

// Shared mutable UI/build state, read and rendered by lib/ui.js.
export const state = {
 building: false, // true while a build is in flight
 progress: { stage: 'idle', message: '', percent: 0 }, // current build progress
 buildStartTime: null, // Date.now() when the current build started
 errors: [], // { message } entries shown in the Errors panel
 logs: [], // { message, type } entries shown in the Logs panel
 status: null, // index status object; populated elsewhere (e.g. getIndexStatus)
 daToken: null, // DA auth token — presumably set during SDK init; confirm in indexer.js
};
diff --git a/tools/media/lib/helpers.js b/tools/media/lib/helpers.js
new file mode 100644
index 0000000..23a0c5f
--- /dev/null
+++ b/tools/media/lib/helpers.js
@@ -0,0 +1,177 @@
+/**
+ * Helper functions for path normalization, type detection, and name extraction
+ */
+
+import * as logger from './logger.js';
+
/**
 * Normalize a page path: strip query string and hash, then append `.md` to
 * extensionless page paths (`/` and `''` become `/index.md`).
 * Paths containing a dot and paths under `/media/` pass through untouched.
 * @param {string} path - Raw path, possibly with query string or fragment
 * @returns {string} Normalized path ('' for falsy input)
 */
export function normalizePath(path) {
  if (!path) return '';
  const bare = path.split('?')[0].split('#')[0];
  const looksLikeFile = bare.includes('.') || bare.startsWith('/media/');
  if (looksLikeFile) return bare;
  if (bare === '' || bare === '/') return '/index.md';
  return `${bare}.md`;
}
+
/**
 * Decide whether a path refers to a page rather than a media file or fragment.
 * Pages end in `.md`, or are extensionless paths outside `/media/`;
 * anything under `/fragments/` is never a page.
 * @param {string} path - The path to classify
 * @returns {boolean} True when path is a page
 */
export function isPage(path) {
  if (typeof path !== 'string' || path === '') return false;
  if (path.includes('/fragments/')) return false;
  if (path.endsWith('.md')) return true;
  return !path.includes('.') && !path.startsWith('/media/');
}
+
/**
 * Pull a display filename out of a medialog entry.
 * Prefers `originalFilename` (present on standalone uploads); otherwise falls
 * back to the last segment of `path` with query string and hash stripped.
 * @param {object} mediaEntry - The medialog entry
 * @returns {string} Extracted filename, or '' when nothing usable exists
 */
export function extractName(mediaEntry) {
  const original = mediaEntry?.originalFilename;
  if (original) return original.split('/').pop();
  const rawPath = mediaEntry?.path;
  if (!rawPath) return '';
  const cleaned = rawPath.split('?')[0].split('#')[0];
  return cleaned.split('/').pop();
}
+
/** Phase 2: Linked content type detection */

/**
 * True when the path ends in `.pdf` (case-insensitive).
 * @param {string} path - Path to test
 * @returns {boolean}
 */
export function isPdf(path) {
  // Boolean() so falsy inputs return false instead of leaking null/'' to callers.
  return Boolean(path && path.toLowerCase().endsWith('.pdf'));
}

/**
 * True when the path ends in `.svg` (case-insensitive).
 * @param {string} path - Path to test
 * @returns {boolean}
 */
export function isSvg(path) {
  return Boolean(path && path.toLowerCase().endsWith('.svg'));
}

/**
 * True when the path lives under a `/fragments/` directory.
 * @param {string} path - Path to test
 * @returns {boolean}
 */
export function isFragment(path) {
  return Boolean(path && path.includes('/fragments/'));
}

/**
 * True if the path is linked content tracked via the auditlog only:
 * a PDF, an SVG, or a fragment.
 * @param {string} path - Path to test
 * @returns {boolean}
 */
export function isLinkedContentPath(path) {
  return Boolean(path) && (isPdf(path) || isSvg(path) || isFragment(path));
}

/**
 * Normalize a file path for matching: strip query string and hash,
 * trim whitespace, and guarantee a leading slash.
 * @param {string} path - Raw path
 * @returns {string} Normalized path ('' for falsy input)
 */
export function normalizeFilePath(path) {
  if (!path) return '';
  const p = path.split('?')[0].split('#')[0].trim();
  return p.startsWith('/') ? p : `/${p}`;
}

/**
 * True when the path is a PDF or an SVG.
 * @param {string} path - Path to test
 * @returns {boolean}
 */
export function isPdfOrSvg(path) {
  return isPdf(path) || isSvg(path);
}
+
/**
 * Describe a linked-content file's type in the same "category > extension"
 * format used for media entries.
 * @param {string} path - File path
 * @returns {string} "document > pdf", "image > svg", "content > fragment", or "unknown"
 */
export function getFileType(path) {
  // Ordered classifier table: first predicate that matches wins.
  const classifiers = [
    [isPdf, 'document > pdf'],
    [isSvg, 'image > svg'],
    [isFragment, 'content > fragment'],
  ];
  const hit = classifiers.find(([test]) => test(path));
  return hit ? hit[1] : 'unknown';
}
+
+function toPath(href) {
+ if (!href) return '';
+ try {
+ if (href.startsWith('http')) {
+ return new URL(href).pathname;
+ }
+ return href.startsWith('/') ? href : `/${href}`;
+ } catch (error) {
+ logger.error(`Failed to parse URL ${href}:`, error.message);
+ return href;
+ }
+}
+
+/** Markdown link regex: [text](url) or  - captures URL in group 1 */
+const MD_LINK_RE = /\[[^\]]*\]\(([^)]+)\)/gi;
+
+/** Markdown autolink: - captures URL in group 1 */
+const MD_AUTOLINK_RE = /<(https?:\/\/[^>]+|\/[^>\s]*)>/g;
+
+/** Icon shorthand: :iconname: → /icons/iconname.svg */
+const ICON_RE = /:([a-zA-Z0-9-]+):/g;
+/** Exclude doc terms like "with :svg: syntax" to avoid false positives */
+const ICON_DOC_EXCLUDE = new Set(['svg', 'pdf', 'image', 'link', 'syntax']);
+
+/**
+ * Extract all URLs from markdown: [text](url), , and autolinks
+ * @param {string} md - Raw markdown
+ * @returns {string[]} - URLs from link syntax
+ */
+function extractUrlsFromMarkdown(md) {
+ if (!md || typeof md !== 'string') return [];
+ const fromLinks = [...md.matchAll(MD_LINK_RE)].map((m) => m[1].trim());
+ const fromAutolinks = [...md.matchAll(MD_AUTOLINK_RE)].map((m) => m[1].trim());
+ return [...fromLinks, ...fromAutolinks];
+}
+
+/**
+ * Extract icon references from :iconname: shorthand (resolves to /icons/iconname.svg)
+ * @param {string} md - Raw markdown
+ * @returns {string[]} - Normalized paths like /icons/headset.svg
+ */
+export function extractIconReferences(md) {
+ if (!md || typeof md !== 'string') return [];
+ const matches = [...md.matchAll(ICON_RE)];
+ return [...new Set(
+ matches
+ .filter((m) => !ICON_DOC_EXCLUDE.has(m[1].toLowerCase()))
+ .map((m) => `/icons/${m[1]}.svg`),
+ )];
+}
+
+/**
+ * Extract fragment references from markdown (links to /fragments/...)
+ * @param {string} md - Raw markdown
+ * @returns {string[]} - Normalized paths
+ */
+export function extractFragmentReferences(md) {
+ const urls = extractUrlsFromMarkdown(md);
+ return [...new Set(urls.filter((u) => u.includes('/fragments/')).map((u) => toPath(u)))];
+}
+
+/**
+ * Extract links matching pattern (e.g. .pdf, .svg) from markdown
+ * @param {string} md - Raw markdown
+ * @param {RegExp} pattern - Pattern to match (e.g. /\.pdf$/)
+ * @returns {string[]} - Normalized paths
+ */
+export function extractLinks(md, pattern) {
+ const urls = extractUrlsFromMarkdown(md);
+ const pathPart = (u) => u.split('?')[0].split('#')[0];
+ return [...new Set(urls.filter((u) => pattern.test(pathPart(u))).map((u) => toPath(u)))];
+}
+
/**
 * Detect media type from a medialog entry's contentType.
 * @param {object} mediaEntry - The medialog entry (may be null/undefined)
 * @returns {string} Type as "category > extension" (e.g. "img > jpeg",
 *   "video > mp4"), or "unknown" for anything else
 */
export function detectMediaType(mediaEntry) {
  // Optional chaining: a missing entry behaves like a missing contentType
  // (the previous version threw a TypeError on null/undefined).
  const contentType = mediaEntry?.contentType || '';
  // Map MIME top-level types to the index's category labels.
  const categories = { image: 'img', video: 'video' };
  const [prefix, ext] = contentType.split('/');
  const category = categories[prefix];
  // Require a non-empty subtype so degenerate values like "image/" don't
  // produce a junk "img > " label.
  if (category && ext) return `${category} > ${ext}`;
  return 'unknown';
}
diff --git a/tools/media/lib/logger.js b/tools/media/lib/logger.js
new file mode 100644
index 0000000..5d7f752
--- /dev/null
+++ b/tools/media/lib/logger.js
@@ -0,0 +1,79 @@
+/**
+ * Logging utility with configurable log levels
+ */
+
/**
 * Supported log levels, lowest (most verbose) first.
 * Frozen so the shared constant cannot be mutated by consumers.
 */
const LOG_LEVELS = Object.freeze({
  DEBUG: 0,
  INFO: 1,
  WARN: 2,
  ERROR: 3,
  NONE: 4,
});

/**
 * Logger configuration.
 * Messages below `level` are dropped. Default is DEBUG;
 * change to LOG_LEVELS.INFO or LOG_LEVELS.WARN for production.
 */
const config = {
  level: LOG_LEVELS.DEBUG,
  prefix: '[MediaIndexer]',
};

/**
 * Set the active log level.
 * Invalid values (non-integer or outside DEBUG..NONE) are ignored with a
 * warning, so logging cannot be silently misconfigured (the previous version
 * accepted any value).
 * @param {number} level - Log level from LOG_LEVELS
 */
export function setLogLevel(level) {
  if (!Number.isInteger(level) || level < LOG_LEVELS.DEBUG || level > LOG_LEVELS.NONE) {
    // eslint-disable-next-line no-console
    console.warn(`${config.prefix}[WARN]`, `Ignoring invalid log level: ${level}`);
    return;
  }
  config.level = level;
}

/**
 * Debug logging - verbose details for development
 * @param {string} message - Log message
 * @param {...any} args - Additional arguments
 */
export function debug(message, ...args) {
  if (config.level <= LOG_LEVELS.DEBUG) {
    // eslint-disable-next-line no-console
    console.log(`${config.prefix}[DEBUG]`, message, ...args);
  }
}

/**
 * Info logging - general information
 * @param {string} message - Log message
 * @param {...any} args - Additional arguments
 */
export function info(message, ...args) {
  if (config.level <= LOG_LEVELS.INFO) {
    // eslint-disable-next-line no-console
    console.log(`${config.prefix}[INFO]`, message, ...args);
  }
}

/**
 * Warning logging - potential issues
 * @param {string} message - Log message
 * @param {...any} args - Additional arguments
 */
export function warn(message, ...args) {
  if (config.level <= LOG_LEVELS.WARN) {
    // eslint-disable-next-line no-console
    console.warn(`${config.prefix}[WARN]`, message, ...args);
  }
}

/**
 * Error logging - failures and exceptions
 * @param {string} message - Log message
 * @param {...any} args - Additional arguments
 */
export function error(message, ...args) {
  if (config.level <= LOG_LEVELS.ERROR) {
    // eslint-disable-next-line no-console
    console.error(`${config.prefix}[ERROR]`, message, ...args);
  }
}

export { LOG_LEVELS };
diff --git a/tools/media/lib/ui.js b/tools/media/lib/ui.js
new file mode 100644
index 0000000..2e4e0cc
--- /dev/null
+++ b/tools/media/lib/ui.js
@@ -0,0 +1,175 @@
+/**
+ * UI rendering and event handling
+ */
+
+import { state, org, repo } from './config.js';
+import {
+ buildInitialIndex, buildIncrementalIndex, shouldReindex, getIndexStatus,
+} from './builder.js';
+
/**
 * Render the entire indexer UI into the #app element from the shared `state`
 * (see lib/config.js). Re-invoked after every state change.
 * NOTE(review): the HTML markup inside the template literals below appears
 * mangled/stripped in this copy of the file — verify against the working
 * version before shipping.
 */
export function render() {
 const app = document.getElementById('app');

 // Status panel: last refresh, entry count, last build mode, index mtime.
 const statusHtml = state.status ? `


Current Index Status




 ${state.status.lastRefresh ? new Date(state.status.lastRefresh).toLocaleString() : 'Never'}



 ${state.status.entriesCount || 0}

 ${state.status.lastBuildMode ? `


 ${state.status.lastBuildMode === 'full' ? 'Full rebuild' : 'Incremental'}

 ` : ''}
 ${state.status.indexLastModified != null ? `


 ${new Date(state.status.indexLastModified).toLocaleString()}

 ` : ''}


 ` : 'Checking status...
';

 // Elapsed time plus a naive linear ETA extrapolated from percent complete.
 const elapsedMs = state.buildStartTime ? Date.now() - state.buildStartTime : 0;
 const elapsedStr = elapsedMs >= 1000 ? `${(elapsedMs / 1000).toFixed(1)}s` : `${elapsedMs}ms`;
 const pct = state.progress.percent;
 const etaMs = pct > 0 && pct < 100 ? (elapsedMs / pct) * (100 - pct) : 0;
 const etaStr = etaMs > 0 ? `~${(etaMs / 1000).toFixed(1)}s` : '';
 const { totalMs } = state.progress;
 let totalStr = '';
 if (totalMs != null) {
 totalStr = totalMs >= 1000 ? `${(totalMs / 1000).toFixed(1)}s` : `${totalMs}ms`;
 }

 // Show total once complete; otherwise elapsed (+ ETA when computable).
 const timingHtml = state.progress.stage === 'complete' && totalStr
 ? `Total: ${totalStr}`
 : `Elapsed: ${elapsedStr}${etaStr ? `ETA: ${etaStr}` : ''}`;

 // Progress panel: visible while building or whenever stage is not idle.
 const progressHtml = state.building || state.progress.stage !== 'idle' ? `


Progress


 ${state.progress.stage}
 ${state.progress.message}


 ${timingHtml}


 ` : '';

 // Log list, one line per recorded progress/log message.
 const logsHtml = state.logs.length > 0 ? `


Logs (${state.logs.length})

 ${state.logs.map((log) => `- ${log.message}
`).join('')}


 ` : '';

 // Error list, populated by the build flow's catch handler.
 const errorsHtml = state.errors.length > 0 ? `


Errors (${state.errors.length})

 ${state.errors.map((err) => `- ${err.message}
`).join('')}


 ` : '';

 // Full re-render each call; interpolated org/repo were validated in config.js.
 app.innerHTML = `
 Media Index Builder
 Building index for: ${org}/${repo}

 ${statusHtml}




 ${progressHtml}
 ${errorsHtml}
 ${logsHtml}
 `;
}
+
/**
 * Wire up the Build button. Re-invoked after each render; does nothing while
 * a build is already in flight (the button listener is only attached when
 * `state.building` is false at attach time).
 */
export function attachEventListeners() {
  if (state.building) return;
  const buildBtn = document.getElementById('buildBtn');
  if (!buildBtn) return;

  buildBtn.addEventListener('click', async () => {
    // Reset build-related state and show the starting screen.
    state.building = true;
    state.buildStartTime = Date.now();
    state.errors = [];
    state.logs = [];
    state.progress = { stage: 'starting', message: 'Checking build mode...', percent: 0 };
    render();

    // Extra log callback used only by incremental builds.
    const onLog = (msg) => {
      state.logs.push({ message: msg, type: 'info' });
      render();
    };

    // Progress callback: annotate the final "complete" update with total time.
    const onProgress = (progress) => {
      let finalProgress = progress;
      if (progress.stage === 'complete' && state.buildStartTime) {
        const totalMs = Date.now() - state.buildStartTime;
        const totalStr = totalMs >= 1000 ? `${(totalMs / 1000).toFixed(1)}s` : `${totalMs}ms`;
        finalProgress = { ...progress, message: `${progress.message} (${totalStr})`, totalMs };
      }
      state.progress = finalProgress;
      state.logs.push({ message: finalProgress.message, type: 'info' });
      render();
    };

    try {
      const { shouldReindex: useIncremental, reason } = await shouldReindex();
      if (reason && !useIncremental) {
        state.logs.push({ message: `Full build: ${reason}`, type: 'info' });
      }
      const buildFn = useIncremental ? buildIncrementalIndex : buildInitialIndex;
      const result = await buildFn(onProgress, useIncremental ? onLog : undefined);

      const totalMs = state.buildStartTime ? Date.now() - state.buildStartTime : 0;
      const totalStr = totalMs >= 1000 ? `${(totalMs / 1000).toFixed(1)}s` : `${totalMs}ms`;
      state.logs.push({
        message: `Index built successfully: ${result.length} entries (${totalStr})`,
        type: 'success',
      });
      state.status = await getIndexStatus();
    } catch (error) {
      state.errors.push({ message: error.message });
      state.logs.push({ message: `Error: ${error.message}`, type: 'error' });
      state.progress = { stage: 'error', message: error.message, percent: 0 };
    } finally {
      // Always clear the in-flight flag, re-render, and re-attach the listener
      // (render() recreates the button element).
      state.building = false;
      state.buildStartTime = null;
      render();
      attachEventListeners();
    }
  });
}
diff --git a/tools/media/media-indexing-strategy.md b/tools/media/media-indexing-strategy.md
new file mode 100644
index 0000000..752d5da
--- /dev/null
+++ b/tools/media/media-indexing-strategy.md
@@ -0,0 +1,2160 @@
+# Media Indexing Strategy for AEM Sites
+**Date:** February 24, 2026
+**Author:** Testing & Analysis with Claude Code
+**Project:** Media Library Integration with Medialog & Auditlog
+
+---
+
+## Summary
+
+Strategy for building a media index by combining **AEM Auditlog** and **Medialog** APIs, including operational architecture for backfilling historical data, initial index population, and ongoing refresh mechanisms.
+
+### Key Findings
+
+- **Two Log Sources:** Auditlog tracks page/file previews; Medialog tracks Media Bus items (images/videos)
+- **Temporal Relationship:** Auditlog entries precede medialog by 1.5-2 seconds
+- **Path Matching:** Critical to normalize paths (`/drafts/page` vs `/drafts/page.md`)
+- **Media Bus vs Content Delivery:** Images/videos use Media Bus (tracked in medialog), PDFs/SVGs/Fragments use content delivery (auditlog only)
+- **Parsing Required:** Fragments, PDFs, and SVGs require parsing page content to determine usage relationships
+- **Duplicate Events:** Every preview creates new log entries, even without content changes
+- **Time Window:** Use 5-second window to match medialog entries to auditlog events
+
+### Operational Architecture
+
+- **Two-Tier Backfill:** Separate CLI tool for medialog backfill, separate DA app for index population
+- **Historical Coverage:** Supports sites created before medialog existed (2023+) via Status API parsing
+- **Initial Setup:** One-time engineer-run process per repository (30-60 minutes total)
+- **Ongoing Refresh:** Browser-based auto-refresh every 10 minutes with distributed locking
+- **Multi-User Support:** Distributed lock prevents race conditions across concurrent users
+- **Scalability:** Handles sites with 10,000+ pages and 50,000+ media items
+
+---
+
+## Log Relationships
+
+### Auditlog vs Medialog
+
+| Aspect | Auditlog | Medialog |
+|--------|----------|----------|
+| **Purpose** | Tracks all preview/publish actions | Tracks Media Bus activity only |
+| **Scope** | Pages, PDFs, SVGs, Fragments, images, videos | Images and videos only |
+| **Timing** | Logged first (T) | Logged ~1.5-2s later (T+1500ms) |
+| **Path Format** | `/drafts/page` | `/drafts/page.md` |
+| **Contains** | Page-level events | Media-level events with `resourcePath` |
+
+### Linking Strategy
+
+```
+Auditlog Entry Medialog Entries
+┌─────────────────────┐ ┌──────────────────────────┐
+│ path: /drafts/page │ ───────>│ resourcePath: /drafts/ │
+│ timestamp: 1000 │ match │ page.md │
+│ │ by: │ timestamp: 1001-1005 │
+└─────────────────────┘ └──────────────────────────┘
+ │
+ │ Multiple media
+ ▼
+ All have same timestamp
+```
+
+**Matching Rules:**
+1. Normalize paths: `auditlog.path` + `.md` = `medialog.resourcePath`
+2. Time window: `medialog.timestamp` within 5 seconds after `auditlog.timestamp`
+3. Group medialog entries by `(resourcePath, timestamp)` to find page's media
+
+---
+
+## Content Types & Tracking Methods
+
+### Complete Matrix
+
+| Content Type | Delivery Method | Auditlog | Medialog | Tracking Method | Usage Relationship |
+|-------------|-----------------|----------|----------|-----------------|-------------------|
+| **Images (embedded)** | Media Bus | Page event | Entry with resourcePath | Medialog linking | From resourcePath field |
+| **Images (standalone)** | Media Bus | File event | Entry with originalFilename | Medialog linking | No usage (standalone) |
+| **Videos (embedded)** | Media Bus | Page event | Entry with resourcePath | Medialog linking | From resourcePath field |
+| **Videos (standalone)** | Media Bus | File event | Entry with originalFilename | Medialog linking | No usage (standalone) |
+| **PDFs** | Content delivery | File event | None | Auditlog + Parse | Parse page HTML for links |
+| **SVGs** | Content delivery | File event | None | Auditlog + Parse | Parse page HTML for links |
+| **Fragments** | Content delivery | File event | None | Auditlog + Parse | Parse page HTML for references |
+| **Icons** | Icons folder | None | None | Not tracked | N/A |
+
+### Why Different Tracking Methods?
+
+**Media Bus Items (Images, Videos):**
+- Deduplicated, content-addressed storage
+- Hash-based URLs: `media_/`
+- Tracked in medialog with `resourcePath` linking to pages
+- **No parsing required** - logs provide complete relationships
+
+**Content Delivery Items (PDFs, SVGs, Fragments):**
+- Regular file delivery
+- Standard preview/publish lifecycle
+- Not content-addressed
+- **Parsing required** - logs don't link to containing pages
+
+**Source:** [AEM Media Documentation](https://www.aem.live/docs/media)
+
+---
+
+## Test Scenarios & Results
+
+### Scenario A: Page with 3 Images
+
+**Action:** Created `/drafts/scenario-a.md` with 3 embedded images, previewed once
+
+**Auditlog Response:**
+```json
+{
+ "entries": [
+ {
+ "path": "/drafts/scenario-a",
+ "timestamp": 1771936397105,
+ "route": "preview",
+ "user": "user@example.com",
+ "status": 200
+ }
+ ]
+}
+```
+
+**Medialog Response:**
+```json
+{
+ "entries": [
+ {
+ "path": "https://main--repo--org.aem.live/media_a1b2c3/image1.jpg",
+ "operation": "ingest",
+ "timestamp": 1771936400523,
+ "resourcePath": "/drafts/scenario-a.md",
+ "contentType": "image/jpeg",
+ "user": "user@example.com",
+ "mediaHash": "a1b2c3",
+ "width": "2000",
+ "height": "1333"
+ },
+ {
+ "path": "https://main--repo--org.aem.live/media_d4e5f6/image2.jpg",
+ "operation": "ingest",
+ "timestamp": 1771936400523,
+ "resourcePath": "/drafts/scenario-a.md",
+ "contentType": "image/jpeg",
+ "user": "user@example.com",
+ "mediaHash": "d4e5f6",
+ "width": "2000",
+ "height": "1500"
+ },
+ {
+ "path": "https://main--repo--org.aem.live/media_g7h8i9/image3.jpg",
+ "operation": "ingest",
+ "timestamp": 1771936400523,
+ "resourcePath": "/drafts/scenario-a.md",
+ "contentType": "image/jpeg",
+ "user": "user@example.com",
+ "mediaHash": "g7h8i9",
+ "width": "2000",
+ "height": "1600"
+ }
+ ]
+}
+```
+
+**Learning:**
+- All media on same page share identical timestamp
+- Media added through markup: `operation: "ingest"` WITHOUT `originalFilename`
+- 3.4 second processing delay between logs
+- `resourcePath` links media to page
+
+---
+
+### Scenario B: Text-Only Page
+
+**Action:** Created `/drafts/scenario-b.md` with only text, previewed
+
+**Auditlog Response:**
+```json
+{
+ "entries": [
+ {
+ "path": "/drafts/scenario-b",
+ "timestamp": 1771936758410,
+ "route": "preview",
+ "user": "user@example.com",
+ "duration": 1112,
+ "status": 200
+ }
+ ]
+}
+```
+
+**Medialog Response:**
+```json
+{
+ "entries": []
+}
+```
+
+**Learning:**
+- Auditlog logs text-only pages
+- Empty medialog = no Media Bus items on page
+- Can detect "all media removed" pattern (with caveats)
+
+---
+
+### Scenario H: Standalone Media Preview
+
+**Action:** Uploaded and previewed 3 standalone files:
+- `/media/standalone-image.jpg` (image)
+- `/media/standalone-doc.pdf` (PDF)
+- `/media/standalone-graphic.svg` (SVG)
+
+**Auditlog Response:**
+```json
+{
+ "entries": [
+ {
+ "path": "/media/standalone-image.jpg",
+ "timestamp": 1771937123456,
+ "route": "preview",
+ "user": "user@example.com",
+ "status": 200
+ },
+ {
+ "path": "/media/standalone-doc.pdf",
+ "timestamp": 1771937125789,
+ "route": "preview",
+ "user": "user@example.com",
+ "status": 200
+ },
+ {
+ "path": "/media/standalone-graphic.svg",
+ "timestamp": 1771937128012,
+ "route": "preview",
+ "user": "user@example.com",
+ "status": 200
+ }
+ ]
+}
+```
+
+**Medialog Response:**
+```json
+{
+ "entries": [
+ {
+ "path": "https://main--repo--org.aem.live/media_j1k2l3/standalone-image.jpg",
+ "operation": "ingest",
+ "timestamp": 1771937124567,
+ "originalFilename": "/media/standalone-image.jpg",
+ "contentType": "image/jpeg",
+ "user": "user@example.com",
+ "mediaHash": "j1k2l3",
+ "owner": "2d0fcd52abc",
+ "repo": "2d0fcd52abc",
+ "width": "1920",
+ "height": "1080"
+ }
+ ]
+}
+```
+
+**Learning:**
+- Images: Appear in BOTH logs
+- PDFs/SVGs: Auditlog ONLY (not on Media Bus)
+- Standalone uploads have `originalFilename` + `owner` + `repo`
+- NO `resourcePath` (not linked to page)
+
+---
+
+### Scenario G: Page with Mixed Media
+
+**Action:** Created `/drafts/scenario-g.md` with:
+- 2 embedded images
+- 1 PDF preview link
+- 1 SVG preview link
+- 1 icon (`:headset:`)
+
+**Auditlog Response (page preview):**
+```json
+{
+ "entries": [
+ {
+ "path": "/drafts/scenario-g",
+ "timestamp": 1771937500000,
+ "route": "preview",
+ "user": "user@example.com",
+ "status": 200
+ }
+ ]
+}
+```
+
+**Auditlog Response (when user clicks PDF/SVG links):**
+```json
+{
+ "entries": [
+ {
+ "path": "/media/standalone-doc.pdf",
+ "timestamp": 1771937510000,
+ "route": "preview",
+ "user": "user@example.com",
+ "status": 200
+ },
+ {
+ "path": "/media/standalone-graphic.svg",
+ "timestamp": 1771937515000,
+ "route": "preview",
+ "user": "user@example.com",
+ "status": 200
+ }
+ ]
+}
+```
+
+**Medialog Response:**
+```json
+{
+ "entries": [
+ {
+ "path": "https://main--repo--org.aem.live/media_m4n5o6/image1.jpg",
+ "operation": "ingest",
+ "timestamp": 1771937501500,
+ "resourcePath": "/drafts/scenario-g.md",
+ "contentType": "image/jpeg",
+ "user": "user@example.com",
+ "mediaHash": "m4n5o6",
+ "width": "1800",
+ "height": "1200"
+ },
+ {
+ "path": "https://main--repo--org.aem.live/media_p7q8r9/image2.jpg",
+ "operation": "ingest",
+ "timestamp": 1771937501500,
+ "resourcePath": "/drafts/scenario-g.md",
+ "contentType": "image/jpeg",
+ "user": "user@example.com",
+ "mediaHash": "p7q8r9",
+ "width": "2000",
+ "height": "1500"
+ }
+ ]
+}
+```
+
+**Learning:**
+- Only embedded images tracked in medialog
+- Icons: Not tracked
+- PDF/SVG links: Create separate auditlog entries when clicked (unrelated timestamps)
+- **Cannot determine page→PDF/SVG relationships from logs**
+- **Parsing required** to find which pages reference PDFs/SVGs/Fragments
+
+---
+
+### Re-Preview Test: Duplicate Events
+
+**Action:** Re-previewed scenario-a, scenario-b, scenario-g without changes
+
+**Auditlog Response:**
+```json
+{
+ "entries": [
+ {
+ "path": "/drafts/scenario-b",
+ "timestamp": 1771938338331,
+ "route": "preview",
+ "user": "user@example.com",
+ "status": 200
+ },
+ {
+ "path": "/drafts/scenario-a",
+ "timestamp": 1771938338335,
+ "route": "preview",
+ "user": "user@example.com",
+ "status": 200
+ },
+ {
+ "path": "/drafts/scenario-g",
+ "timestamp": 1771938338340,
+ "route": "preview",
+ "user": "user@example.com",
+ "status": 200
+ }
+ ]
+}
+```
+
+**Medialog Response:**
+```json
+{
+ "entries": [
+ {
+ "path": "https://main--repo--org.aem.live/media_m4n5o6/image1.jpg",
+ "operation": "reuse",
+ "timestamp": 1771938339903,
+ "resourcePath": "/drafts/scenario-g.md",
+ "contentType": "image/jpeg",
+ "user": "user@example.com",
+ "mediaHash": "m4n5o6"
+ },
+ {
+ "path": "https://main--repo--org.aem.live/media_p7q8r9/image2.jpg",
+ "operation": "reuse",
+ "timestamp": 1771938339903,
+ "resourcePath": "/drafts/scenario-g.md",
+ "contentType": "image/jpeg",
+ "user": "user@example.com",
+ "mediaHash": "p7q8r9"
+ },
+ {
+ "path": "https://main--repo--org.aem.live/media_a1b2c3/image1.jpg",
+ "operation": "reuse",
+ "timestamp": 1771938340350,
+ "resourcePath": "/drafts/scenario-a.md",
+ "contentType": "image/jpeg",
+ "user": "user@example.com",
+ "mediaHash": "a1b2c3"
+ },
+ {
+ "path": "https://main--repo--org.aem.live/media_d4e5f6/image2.jpg",
+ "operation": "reuse",
+ "timestamp": 1771938340350,
+ "resourcePath": "/drafts/scenario-a.md",
+ "contentType": "image/jpeg",
+ "user": "user@example.com",
+ "mediaHash": "d4e5f6"
+ },
+ {
+ "path": "https://main--repo--org.aem.live/media_g7h8i9/image3.jpg",
+ "operation": "reuse",
+ "timestamp": 1771938340350,
+ "resourcePath": "/drafts/scenario-a.md",
+ "contentType": "image/jpeg",
+ "user": "user@example.com",
+ "mediaHash": "g7h8i9"
+ }
+ ]
+}
+```
+
+**Learning:**
+- Every preview creates new events (even without changes)
+- `operation: "reuse"` indicates media already exists
+- Cannot rely on "new events = new content"
+- Must compare current state vs previous state
+
+---
+
+## Parsing Strategy for Linked Content
+
+### Why Parsing is Needed
+
+**Media Bus items** (images/videos) have `resourcePath` in medialog that directly links them to pages. **Content delivery items** (PDFs/SVGs/Fragments) do NOT have this linking - you must parse page HTML to find references.
+
+### Current Implementation
+
+The codebase already parses for fragments:
+
+```javascript
+// From media-library.js (existing code)
+const [fragmentLogEntries, pageLogEntries] = await Promise.all([
+ fetchFragments(this.org, this.repo, 'main', since),
+ fetchPages(this.org, this.repo, 'main', since),
+]);
+
+// Parse pages to build fragment usage map
+const usageMap = await buildFragmentUsageMap(pageLogEntries, this.sitePath);
+
+const mergedData = mergeFragmentEntries(
+ fragmentsData,
+ fragmentLogEntries,
+ this.org,
+ this.repo,
+ usageMap // Usage map from parsing
+);
+```
+
+### Unified Parsing Approach
+
+Extend the existing fragment parsing to include PDFs and SVGs:
+
+```javascript
+async function buildContentUsageMap(pageLogEntries, org, repo) {
+ const usageMap = {
+ fragments: new Map(), // fragment path -> [page paths]
+ pdfs: new Map(), // pdf path -> [page paths]
+ svgs: new Map(), // svg path -> [page paths]
+ };
+
+ for (const pageEvent of pageLogEntries) {
+ // Fetch page HTML (single fetch per page)
+ const html = await fetchPageHtml(pageEvent.path, org, repo);
+
+ // Extract all content types in one pass
+ const fragments = extractFragmentReferences(html);
+ const pdfs = extractLinks(html, /\.pdf$/);
+ const svgs = extractLinks(html, /\.svg$/);
+
+ // Build usage maps
+ fragments.forEach(f => {
+ if (!usageMap.fragments.has(f)) {
+ usageMap.fragments.set(f, []);
+ }
+ usageMap.fragments.get(f).push(pageEvent.path);
+ });
+
+ pdfs.forEach(p => {
+ if (!usageMap.pdfs.has(p)) {
+ usageMap.pdfs.set(p, []);
+ }
+ usageMap.pdfs.get(p).push(pageEvent.path);
+ });
+
+ svgs.forEach(s => {
+ if (!usageMap.svgs.has(s)) {
+ usageMap.svgs.set(s, []);
+ }
+ usageMap.svgs.get(s).push(pageEvent.path);
+ });
+ }
+
+ return usageMap;
+}
+```
+
+### Extraction Helper Functions
+
+```javascript
+function extractFragmentReferences(html) {
+ // Fragments typically appear in href attributes
+ const fragmentPattern = /href="([^"]*\/fragments\/[^"]+)"/g;
+ const matches = [...html.matchAll(fragmentPattern)];
+ return matches.map(m => m[1]).map(normalizePath);
+}
+
+function extractLinks(html, pattern) {
+  // Extract href or src attributes matching pattern.
+  // Strip a trailing "$" anchor from the pattern: embedded mid-regex it would
+  // assert end-of-input before the closing quote and never match. Allow an
+  // optional ?query or #hash between the extension and the closing quote.
+  const ext = pattern.source.replace(/\$$/, '');
+  const linkPattern = new RegExp(`(?:href|src)="([^"?#]*${ext})(?:[?#][^"]*)?"`, 'gi');
+  const matches = [...html.matchAll(linkPattern)];
+  return matches.map(m => m[1]).map(normalizePath);
+}
+
+function normalizePath(path) {
+ // Remove query params, hashes
+ return path.split('?')[0].split('#')[0];
+}
+```
+
+### Performance Optimization
+
+**Single-pass parsing:**
+```javascript
+// GOOD: Fetch once, extract all
+const html = await fetchPageHtml(page);
+const allContent = {
+ fragments: extractFragments(html),
+ pdfs: extractPdfs(html),
+ svgs: extractSvgs(html),
+};
+
+// BAD: Multiple fetches
+const fragments = extractFragments(await fetchPageHtml(page));
+const pdfs = extractPdfs(await fetchPageHtml(page)); // Duplicate fetch!
+```
+
+### When to Parse
+
+**Parse frequency:**
+- **Initial build:** Parse all pages to establish complete usage map
+- **Incremental update:** Parse only pages with new auditlog events
+- **Validation:** Periodic full re-parse (weekly/monthly) to catch any drift
+
+---
+
+## Operational Architecture
+
+### Overview
+
+Two-tier approach: historical data backfill (Tier 1) + ongoing incremental updates (Tier 2). Separates concerns, handles large sites efficiently, no server infrastructure for ongoing operations.
+
+### Architecture Diagram
+
+```
+Historical Data (2023-2026) Live Data (2026+)
+┌──────────────────────┐ ┌────────────────────┐
+│ Status API │ │ Auditlog API │
+│ (all pages) │ │ Medialog API │
+└──────────┬───────────┘ └─────────┬──────────┘
+ │ │
+ v v
+ ┌──────────────┐ ┌──────────────┐
+ │ Tier 1: │ │ Incremental │
+ │ Medialog │ │ Refresh │
+ │ Backfill CLI │ │ (10-min) │
+ └──────┬───────┘ └──────┬───────┘
+ │ │
+ v │
+ ┌──────────────┐ │
+ │ Medialog API │ │
+ │ (populated) │ │
+ └──────┬───────┘ │
+ │ │
+ └─────────────┬────────────────────┘
+ v
+ ┌──────────────┐
+ │ Tier 2: │
+ │ Index │
+ │ Population │
+ └──────┬───────┘
+ v
+ ┌──────────────┐
+ │ .da/ │
+ │ mediaindex/ │
+ │ index.json │
+ └──────────────┘
+```
+
+---
+
+### Tier 1: Medialog Backfill (One-Time)
+
+**Purpose:** Populate medialog API with historical data for sites created before medialog existed
+
+**Implementation:** CLI tool at `/media-log-ingestor`
+
+**Process:**
+1. Engineer runs CLI tool with org/repo credentials
+2. Tool fetches all pages via Status API
+3. Parses markdown content to extract media references
+4. Sends entries to Medialog API in batches
+5. Deduplicates based on media hash (first = ingest, subsequent = reuse)
+6. Enriches with user information from preview logs
+
+**Command:**
+```bash
+logmedia ingest --org myorg --repo myrepo --token <your-auth-token>
+```
+
+**Characteristics:**
+- One-time operation per repository
+- Takes 5-30 minutes depending on site size
+- Handles rate limits (10 req/sec)
+- Resumable on failure
+- Creates historical medialog entries with "ingest" operations
+
+**Output:**
+- Medialog API populated with historical media references
+- All pages analyzed, media tracked back to 2023
+- Ready for Tier 2 index building
+
+---
+
+### Tier 2: Index Population & Refresh
+
+#### Initial Population (One-Time per Site)
+
+**Purpose:** Build complete media index from medialog + auditlog data
+
+**Implementation:** Separate DA app at `/tools/media-indexer` (to be created)
+
+**Process:**
+1. Engineer navigates to `/tools/media-indexer`
+2. Clicks "Build Initial Index" button
+3. Server-side process:
+ - Fetches all medialog entries (from Tier 1 backfill)
+ - Fetches all auditlog entries (last 90 days)
+ - Processes and combines data
+ - Parses pages for PDFs/SVGs/Fragments
+ - Deduplicates and sorts
+ - Writes to `.da/mediaindex/index.json`
+4. Displays progress (X of Y pages processed)
+5. Completes in 30-60 seconds
+
+**Characteristics:**
+- One-time per repository
+- Server-side execution (handles large datasets)
+- Shows progress indicator
+- Atomic operation (succeeds or fails completely)
+- Creates complete index with all historical + recent data
+
+**Output:**
+```
+.da/mediaindex/
+├── index.json # Complete media index
+├── medialog-meta.json # { lastFetchTime, entriesCount, lastRefreshBy }
+└── lock.json # Distributed lock (initially unlocked)
+```
+
+---
+
+#### Incremental Refresh (Ongoing)
+
+**Purpose:** Keep index up-to-date with new preview activity
+
+**Two Modes:**
+
+**1. User-Triggered Refresh**
+- User clicks "Refresh" button in media library
+- Acquires distributed lock
+- Fetches logs since last update (incremental)
+- Merges with existing index
+- Updates UI immediately
+- Takes 2-5 seconds
+
+**2. Background Auto-Refresh**
+- Runs every 10 minutes from any open browser
+- Checks if lock is available
+- Checks if index is stale (> 5 minutes old)
+- If both true, performs incremental refresh
+- Silent operation, no UI disruption
+- Dispatches event for UI refresh when complete
+
+---
+
+### Distributed Locking Strategy
+
+**Problem:** Multiple users may have media library open simultaneously, each browser trying to refresh every 10 minutes. Without coordination, this creates race conditions and corrupts the index.
+
+**Solution:** Distributed lock using `.da/mediaindex/lock.json`
+
+#### Lock Structure
+
+```json
+{
+ "locked": true,
+ "lockedBy": "user@example.com|session-abc123",
+ "lockedAt": 1709567890000,
+ "operation": "auto-refresh",
+ "timeout": 300000
+}
+```
+
+#### Lock Behavior
+
+**Acquiring Lock:**
+```javascript
+1. Read current lock.json
+2. If locked=false OR (now - lockedAt) > timeout:
+ - Write new lock with your identity
+ - Wait 500ms
+ - Re-read to verify (race condition check)
+ - If lockedBy matches yours, lock acquired
+ - Else, retry (max 3 attempts with 2s delay)
+3. If locked by someone else:
+ - Return "lock held by X"
+```
+
+**Releasing Lock:**
+```javascript
+1. Read current lock.json
+2. If lockedBy matches your identity:
+ - Write { locked: false, releasedAt: now }
+3. Else, skip (don't own lock)
+```
+
+**Lock Timeout:**
+- Default: 5 minutes (300000ms)
+- After 5 minutes, lock considered expired
+- Next process treats expired lock as unlocked
+- Handles browser crashes gracefully
+
+**Priority:**
+- User-triggered refresh > Background refresh
+- User clicks button: Attempts lock immediately
+- Background timer: Checks lock first, skips if held
+
+---
+
+### File Structure & Metadata
+
+#### .da/mediaindex/index.json
+
+Main index file containing processed media entries:
+
+```json
+[
+ {
+ "hash": "abc123",
+ "url": "https://main--repo--org.aem.live/media_abc123/image.jpg",
+ "name": "image.jpg",
+ "page": "/drafts/my-page.md",
+ "timestamp": 1709567890000,
+ "user": "user@example.com",
+ "operation": "ingest",
+ "type": "img > jpg",
+ "source": "medialog"
+ },
+ {
+ "path": "/media/doc.pdf",
+ "usedIn": ["/drafts/page1.md", "/drafts/page2.md"],
+ "timestamp": 1709567890000,
+ "user": "user@example.com",
+ "type": "document > pdf",
+ "status": "referenced",
+ "source": "auditlog-parsed"
+ }
+]
+```
+
+#### .da/mediaindex/medialog-meta.json
+
+Metadata tracking last refresh:
+
+```json
+{
+ "lastFetchTime": 1709567890000,
+ "entriesCount": 1523,
+ "lastRefreshBy": "user@example.com"
+}
+```
+
+#### .da/mediaindex/lock.json
+
+Distributed lock state:
+
+```json
+{
+ "locked": false,
+ "lockedBy": null,
+ "lockedAt": null,
+ "operation": null,
+ "releasedAt": 1709567890000
+}
+```
+
+---
+
+### Operational Procedures
+
+#### Initial Setup (One-Time per Repository)
+
+**Step 1: Backfill Medialog (if site existed before 2026)**
+
+```bash
+# Clone backfill tool
+git clone <media-log-ingestor repository URL>
+cd media-log-ingestor
+
+# Install dependencies
+npm install
+
+# Get authentication token
+npm run token # Shows instructions
+
+# Run backfill
+npm run ingest -- --org myorg --repo myrepo --token <your-auth-token>
+
+# Wait for completion (5-30 minutes)
+```
+
+**Step 2: Build Initial Index**
+
+```
+1. Navigate to: https://main--repo--org.aem.live/tools/media-indexer
+2. Click "Build Initial Index"
+3. Wait for progress bar to complete (30-60 seconds)
+4. Verify: "Index built successfully: 1523 entries"
+```
+
+**Step 3: Enable Auto-Refresh**
+
+```
+1. Open media library: https://main--repo--org.aem.live/tools/media
+2. Auto-refresh starts automatically (10-minute interval)
+3. Verify in console: "Background auto-refresh started"
+```
+
+---
+
+#### Ongoing Operations
+
+**User-Triggered Refresh:**
+- User clicks "Refresh" button
+- Shows progress: "Fetching logs..." → "Processing..." → "Complete"
+- Updates UI with latest media
+- Frequency: As needed (typically when adding new content)
+
+**Background Auto-Refresh:**
+- Runs silently every 10 minutes
+- Logs to console: "Auto-refresh starting..." or "Index fresh, skipping"
+- No user interaction required
+- Handles multiple users gracefully via locking
+
+**Manual Unlock (Admin Only):**
+- If lock stuck (rare), admin can force unlock
+- Click "Force Unlock" button in index manager
+- Confirms before releasing lock
+- Use only when certain no other process is running
+
+---
+
+#### Monitoring & Troubleshooting
+
+**Check Index Status:**
+
+```javascript
+// In browser console
+const status = await getIndexStatus(sitePath);
+console.log(status);
+// {
+// lastRefresh: 1709567890000,
+// entriesCount: 1523,
+// isStale: false,
+// locked: false
+// }
+```
+
+**Common Issues:**
+
+| Issue | Symptom | Solution |
+|-------|---------|----------|
+| Index not updating | Old timestamps | Check lock status, force unlock if stuck |
+| Missing recent media | Added but not shown | Trigger manual refresh |
+| Duplicate entries | Same media appears multiple times | Full rebuild (weekly maintenance) |
+| Lock timeout | "Cannot acquire lock" errors | Wait 5 minutes or force unlock |
+| Parse failures | PDFs/SVGs not tracked | Check page accessibility, retry |
+
+**Maintenance Tasks:**
+
+- **Daily:** None (auto-refresh handles updates)
+- **Weekly:** Review error logs, check for parse failures
+- **Monthly:** Full index rebuild to eliminate drift
+- **Quarterly:** Verify backfill integrity, re-run if needed
+
+---
+
+### Performance & Scalability
+
+#### Incremental Refresh Performance
+
+| Site Size | Pages | Media | Refresh Time |
+|-----------|-------|-------|--------------|
+| Small | < 100 | < 500 | 1-2 seconds |
+| Medium | 100-1000 | 500-5000 | 2-5 seconds |
+| Large | 1000-10000 | 5000-50000 | 5-10 seconds |
+| Very Large | > 10000 | > 50000 | 10-30 seconds |
+
+**Optimization Techniques:**
+- Hash-based lookups (O(1) instead of O(N))
+- Group medialog by resourcePath (reduce iterations)
+- Parse only changed pages (not entire site)
+- Batch writes to DA (reduce API calls)
+- Cache parsed HTML (avoid re-fetching)
+
+#### Lock Contention
+
+**Scenario:** 10 users have media library open
+
+- Each browser runs auto-refresh every 10 minutes
+- On average, 1 lock attempt per minute across all users
+- Lock held for 2-5 seconds during refresh
+- Contention rate: < 10% (most attempts succeed)
+
+**Mitigation:**
+- Lock timeout ensures stale locks don't block
+- Retry logic with exponential backoff
+- Background refresh skips if locked (no retry spam)
+- User-triggered refresh has higher priority
+
+---
+
+### Future Enhancements
+
+**Tier 1 Improvements:**
+- Incremental medialog backfill (only new pages)
+- Scheduled re-backfill for updated pages
+- Webhook integration for real-time updates
+
+**Tier 2 Improvements:**
+- Worker-based refresh (dedicated service vs browser-based)
+- Streaming updates (websocket for live refresh)
+- Index compression for very large sites
+- Pagination for index loading
+
+**Locking Improvements:**
+- Leader election (one browser becomes "leader" for all refreshes)
+- Heartbeat mechanism (detect crashed processes faster)
+- Lock priority queue (order competing processes)
+
+---
+
+## Indexing Flowcharts
+
+### 1. Initial Index Build (First Pull)
+
+```mermaid
+flowchart TD
+ Start([Start: First Pull / Backfill]) --> FetchLogs[Fetch ALL available logs:
- Auditlog
- Medialog
Note: Medialog is new, get all available]
+ FetchLogs --> FilterAudit{Filter auditlog
by file type}
+
+ FilterAudit -->|.md files| ProcessPages[Group: Pages to process]
+ FilterAudit -->|.pdf, .svg, /fragments/| ProcessFiles[Group: Content delivery files]
+
+ ProcessPages --> ParseContent[Parse page HTML:
- Extract fragment references
- Extract PDF links
- Extract SVG links]
+
+ ParseContent --> BuildUsageMap[Build usage maps:
- fragments -> pages
- pdfs -> pages
- svgs -> pages]
+
+ ProcessPages --> LoopPages{For each page}
+ LoopPages --> NormalizePath[Normalize: page -> page.md]
+ NormalizePath --> FindMedia[Find medialog entries WHERE:
- resourcePath = normalized path
- timestamp within 5s of audit timestamp]
+
+ FindMedia --> HasMedia{Media found?}
+
+ HasMedia -->|Yes| CreateRefs[Create index entries:
hash, page, timestamp, status='referenced']
+ HasMedia -->|No| SkipPage[Page has no Media Bus items
May have PDFs/SVGs/fragments]
+
+ CreateRefs --> MorePages{More pages?}
+ SkipPage --> MorePages
+ MorePages -->|Yes| LoopPages
+ MorePages -->|No| ProcessFiles
+
+ ProcessFiles --> LoopFiles{For each file}
+ LoopFiles --> FileType{File type?}
+
+ FileType -->|PDF/SVG/Fragment| CheckUsage[Check in usage map:
Referenced by any page?]
+ FileType -->|Image standalone| FindStandalone[Find in medialog:
originalFilename present?]
+
+ CheckUsage -->|Referenced| CreateLinkedEntry[Create index entry:
path, usedIn pages, type, status='referenced']
+ CheckUsage -->|Not referenced| CreateStandaloneFile[Create index entry:
path, type, status='file-unused']
+
+ FindStandalone -->|Found| CreateStandaloneEntry[Create index entry:
hash, originalFilename, status='uploaded-unused']
+ FindStandalone -->|Not found| SkipFile[Skip: Not Media Bus item]
+
+ CreateLinkedEntry --> MoreFiles{More files?}
+ CreateStandaloneFile --> MoreFiles
+ CreateStandaloneEntry --> MoreFiles
+ SkipFile --> MoreFiles
+
+ MoreFiles -->|Yes| LoopFiles
+ MoreFiles -->|No| MergeUsage[Merge usage map into index]
+
+ BuildUsageMap --> MergeUsage
+    MergeUsage --> SaveIndex[Save index to DA:
/.da/mediaindex/index.json]
+
+ SaveIndex --> SaveMeta[Save metadata:
lastFetchTime, itemCount]
+ SaveMeta --> End([End: Index Built])
+```
+
+---
+
+### 2. Incremental Update Flow
+
+```mermaid
+flowchart TD
+ Start([Start: Incremental Update]) --> LoadMeta[Load index metadata:
Get lastFetchTime]
+ LoadMeta --> FetchNew[Fetch NEW logs since lastFetchTime:
- Auditlog
- Medialog]
+
+ FetchNew --> HasNewEvents{New events exist?}
+ HasNewEvents -->|No| EndEarly([End: No updates needed])
+ HasNewEvents -->|Yes| LoadIndex[Load existing index from DA]
+
+ LoadIndex --> ParseNewPages[Parse newly previewed pages:
Update usage maps for:
- Fragments
- PDFs
- SVGs]
+
+ ParseNewPages --> LoopNewPages{For each page
in new auditlog}
+
+ LoopNewPages --> NormalizePath[Normalize: page -> page.md]
+ NormalizePath --> FindNewMedia[Find NEW medialog entries:
- resourcePath = normalized path
- timestamp within 5s of audit timestamp]
+
+ FindNewMedia --> LoadOldState[Load OLD index entries
for this page]
+
+ LoadOldState --> CompareState{Compare:
Old vs New}
+
+ CompareState --> ExtractHashes[Extract:
- oldHashes from index
- newHashes from medialog]
+
+ ExtractHashes --> CheckChanges{Content
changed?}
+
+ CheckChanges -->|newHashes empty| CheckAmbiguous{Old state
had media?}
+ CheckChanges -->|newHashes exist| CompareHashes[Compare hash sets]
+
+ CheckAmbiguous -->|Yes| AmbiguousCase[Ambiguous: May have PDFs/SVGs
or all removed or processing delay]
+ CheckAmbiguous -->|No| StillText[Still text-only
No action needed]
+
+ AmbiguousCase --> VerifyParse{Parse to verify?}
+ VerifyParse -->|Yes| QuickParse[Quick parse: Check for img tags]
+ VerifyParse -->|No| AssumeRemoved[Assume removed
Mark as medium confidence]
+
+ QuickParse --> HasImages{Images found?}
+ HasImages -->|No| ConfirmedRemoved[Confirmed: All removed]
+ HasImages -->|Yes| DataInconsistency[Data inconsistency
Flag for investigation]
+
+ ConfirmedRemoved --> MarkUnreferenced[DELETE or UPDATE all old entries:
status = 'unreferenced']
+ AssumeRemoved --> MarkUnreferenced
+
+ CompareHashes --> FindAdded[Added = newHashes NOT IN oldHashes]
+ CompareHashes --> FindRemoved[Removed = oldHashes NOT IN newHashes]
+ CompareHashes --> FindUnchanged[Unchanged = intersection]
+
+ FindAdded --> HasAdded{Additions?}
+ HasAdded -->|Yes| InsertNew[INSERT new index entries:
hash, page, timestamp, status='referenced']
+ HasAdded -->|No| CheckRemoved
+
+ FindRemoved --> CheckRemoved{Removals?}
+ CheckRemoved -->|Yes| DeleteOld[DELETE or UPDATE removed entries:
status = 'unreferenced']
+ CheckRemoved -->|No| CheckUnchanged
+
+ FindUnchanged --> CheckUnchanged{Unchanged?}
+ CheckUnchanged -->|Yes| UpdateTimestamp[UPDATE timestamp only
for unchanged entries]
+ CheckUnchanged -->|No| NextPage
+
+ InsertNew --> NextPage
+ DeleteOld --> NextPage
+ UpdateTimestamp --> NextPage
+ MarkUnreferenced --> NextPage
+ StillText --> NextPage
+ DataInconsistency --> NextPage
+
+ NextPage{More pages?}
+ NextPage -->|Yes| LoopNewPages
+ NextPage -->|No| UpdateLinkedContent[Update linked content from usage maps:
- Add new fragment references
- Remove old references
- Update PDF/SVG usage]
+
+ UpdateLinkedContent --> SaveUpdated[Save updated index to DA]
+
+ SaveUpdated --> UpdateMeta[Update metadata:
lastFetchTime = now
itemCount = index.length]
+
+ UpdateMeta --> End([End: Index Updated])
+```
+
+---
+
+### 3. Page State Detection Logic
+
+```mermaid
+flowchart TD
+ Start([Page Preview Event]) --> GetAudit[Auditlog entry:
page X at time T]
+
+ GetAudit --> SearchMedia[Search medialog for entries:
- resourcePath = X.md
- timestamp in T, T+5000ms]
+
+ SearchMedia --> MediaCount{Count of
media entries}
+
+ MediaCount -->|0 entries| CheckHistory1{Check index:
Page existed before?}
+ MediaCount -->|N entries| HasMedia[Scenario: Page has N Media Bus items
May also have PDFs/SVGs/fragments]
+
+ CheckHistory1 -->|No| NewTextPage[New text-only page
OR page with only PDFs/SVGs/fragments
Action: Check parsed content]
+ CheckHistory1 -->|Yes, had media| Ambiguous[Ambiguous scenario:
- All Media Bus items removed?
- Or page has PDFs/SVGs only?
- Or processing delay?]
+ CheckHistory1 -->|Yes, no media| StillText[Still text-only or non-Media Bus
Action: Update timestamp only]
+
+ Ambiguous --> DecideParse{Parse to verify?}
+ DecideParse -->|Yes| ParseCheck[Parse HTML for img tags]
+ DecideParse -->|No| AssumeRemoved[Assume removed
Medium confidence]
+
+ ParseCheck --> ImagesFound{Images in HTML?}
+ ImagesFound -->|No| ConfirmRemoved[Confirmed: All removed
Action: Mark old entries unreferenced]
+ ImagesFound -->|Yes| Inconsistent[Inconsistency detected
Action: Flag for investigation]
+
+ HasMedia --> GroupByTimestamp[Group media by timestamp:
All should have same timestamp]
+
+ GroupByTimestamp --> ExtractHashes[Extract: List of mediaHashes]
+
+ ExtractHashes --> CheckPrevious{Check index:
Page existed before?}
+
+ CheckPrevious -->|No| NewPage[New page with media
Action: Create all entries]
+ CheckPrevious -->|Yes| CompareHashes[Compare old vs new hashes]
+
+ CompareHashes --> DiffResult{Difference?}
+
+ DiffResult -->|Same hashes| NoChange[No content change
Action: Update timestamps]
+ DiffResult -->|Added hashes| MediaAdded[Media added
Action: Insert new entries]
+ DiffResult -->|Removed hashes| SomeRemoved[Media removed
Action: Delete/flag entries]
+ DiffResult -->|Both added & removed| MediaChanged[Media changed
Action: Insert + Delete]
+
+ NewTextPage --> End([End])
+ ConfirmRemoved --> End
+ AssumeRemoved --> End
+ Inconsistent --> End
+ StillText --> End
+ NewPage --> End
+ NoChange --> End
+ MediaAdded --> End
+ SomeRemoved --> End
+ MediaChanged --> End
+```
+
+---
+
+### 4. Medialog Entry Classification
+
+```mermaid
+flowchart TD
+ Start([Medialog Entry]) --> CheckOperation{Check:
operation field}
+
+ CheckOperation -->|"ingest"| CheckPath{Has
resourcePath?}
+ CheckOperation -->|"reuse"| ReuseCase[Reuse Operation]
+ CheckOperation -->|"delete"| DeleteCase[Delete Operation
Future: TBD by API team]
+
+ CheckPath -->|Yes| IngestInPage[Ingest via Markup
New media added to page]
+ CheckPath -->|No| CheckOriginal{Has
originalFilename?}
+
+ CheckOriginal -->|Yes| StandaloneUpload[Standalone Upload
Media previewed alone]
+ CheckOriginal -->|No| AnomalyCase[Anomaly: No path, no filename
Should not occur]
+
+ IngestInPage --> ExtractName1[Extract name from URL path:
media_hash.jpg]
+ IngestInPage --> UseResource1[Use resourcePath for doc field]
+ IngestInPage --> NoOwner1[NO owner/repo fields]
+
+ StandaloneUpload --> ExtractName2[Extract name from originalFilename:
clay-banks-cabin.jpg]
+ StandaloneUpload --> NoDoc2[Doc field = empty/null]
+ StandaloneUpload --> HasOwner2[HAS owner/repo fields]
+
+ ReuseCase --> ExtractName3[Extract name from URL path:
media_hash.jpg]
+ ReuseCase --> UseResource3[Use resourcePath for doc field]
+ ReuseCase --> NoOwner3[NO owner/repo fields]
+
+ DeleteCase --> WaitSpec[Wait for API spec confirmation]
+
+ ExtractName1 --> CreateIndex1[Create index entry:
source='medialog-ingest-page']
+ UseResource1 --> CreateIndex1
+ NoOwner1 --> CreateIndex1
+
+ ExtractName2 --> CreateIndex2[Create index entry:
source='medialog-ingest-standalone']
+ NoDoc2 --> CreateIndex2
+ HasOwner2 --> CreateIndex2
+
+ ExtractName3 --> CreateIndex3[Create index entry:
source='medialog-reuse']
+ UseResource3 --> CreateIndex3
+ NoOwner3 --> CreateIndex3
+
+ CreateIndex1 --> End([Process Complete])
+ CreateIndex2 --> End
+ CreateIndex3 --> End
+ AnomalyCase --> End
+ WaitSpec --> End
+```
+
+---
+
+## Decision Tables
+
+### Table 1: Matching Auditlog to Medialog
+
+| Auditlog Entry | Expected Medialog | Action |
+|----------------|-------------------|--------|
+| Page preview @ T | N entries with resourcePath=page.md, timestamp in [T, T+5000] | Link entries to page, extract media list |
+| Page preview @ T | 0 entries matching | Page is text-only OR all media removed OR has only PDFs/SVGs/fragments |
+| PDF/SVG/Fragment preview @ T | 0 entries | Expected - Not on Media Bus |
+| Image preview @ T | 1 entry with originalFilename=image.jpg | Standalone image upload |
+
+### Table 2: Index Update Actions
+
+| Old Index State | New Medialog State | Action | Index Update |
+|-----------------|-------------------|--------|--------------|
+| Page not in index | Medialog has N hashes | New page | INSERT N entries with status='referenced' |
+| Page has [A,B] | Medialog has [A,B] | No change | UPDATE timestamps only |
+| Page has [A,B] | Medialog has [A,B,C] | Media added | INSERT entry for C |
+| Page has [A,B,C] | Medialog has [A,B] | Media removed | DELETE or FLAG entry for C as 'unreferenced' |
+| Page has [A,B] | Medialog empty | Ambiguous | Parse to verify or assume all removed |
+| Page has [A,B] | Medialog has [C,D] | Complete change | DELETE [A,B], INSERT [C,D] |
+
+### Table 3: Processing Optimization
+
+| Condition | Optimization | Benefit |
+|-----------|--------------|---------|
+| Event timestamp < lastFetchTime | Skip event | Avoid reprocessing old data |
+| Same page, multiple events in batch | Process only latest | Reduce redundant work |
+| No changes detected in comparison | Skip write operation | Reduce DA API calls |
+| medialog entries have same timestamp | Batch process as single page state | Improve efficiency |
+| Parsing multiple content types | Single fetch, extract all | Minimize network calls |
+
+### Table 4: Content Type Decision Matrix
+
+| Content Type | Found In | Requires Parsing | Usage Tracking Method |
+|-------------|----------|------------------|----------------------|
+| Images | Medialog | No | resourcePath field provides direct link |
+| Videos | Medialog | No | resourcePath field provides direct link |
+| PDFs | Auditlog only | Yes | Parse page HTML for PDF links |
+| SVGs | Auditlog only | Yes | Parse page HTML for SVG links |
+| Fragments | Auditlog only | Yes | Parse page HTML for fragment references |
+| Icons | Not tracked | N/A | Not included in index |
+
+---
+
+## Implementation Pseudocode
+
+### Initial Build Algorithm
+
+```javascript
+async function buildInitialIndex(org, repo, ref = 'main') {
+ const index = [];
+
+ // 1. Fetch all available logs (medialog is new, backfill what exists)
+  const auditLogEntries = await fetchAuditLog(org, repo, ref, null); // since = null: full history
+  const mediaLogEntries = await fetchMediaLog(org, repo, ref, null);
+
+ // 2. Separate pages from files
+ const pages = auditLogEntries.filter(e => isPage(e.path));
+  const files = auditLogEntries.filter(e => !isPage(e.path));
+
+ // 3. Parse pages to build usage maps for linked content
+ const usageMap = await buildContentUsageMap(pages, org, repo);
+
+ // 4. Process each page
+ for (const pageEvent of pages) {
+ const normalizedPath = normalizePath(pageEvent.path); // Add .md if needed
+
+ // Find matching medialog entries within 5-second window
+ const pageMedia = mediaLogEntries.filter(m =>
+ m.resourcePath === normalizedPath &&
+ m.timestamp >= pageEvent.timestamp &&
+ m.timestamp < pageEvent.timestamp + 5000
+ );
+
+ // Create index entries for Media Bus items
+ for (const media of pageMedia) {
+ index.push({
+ hash: media.mediaHash,
+ page: normalizedPath,
+ url: media.path,
+ name: extractName(media),
+ timestamp: media.timestamp,
+ user: media.user,
+ operation: media.operation,
+ status: 'referenced',
+ source: 'medialog',
+ type: 'image' // or 'video'
+ });
+ }
+ }
+
+ // 5. Process standalone files and linked content
+ for (const fileEvent of files) {
+ const filePath = fileEvent.path;
+
+ if (isPdfOrSvg(filePath)) {
+ // Check if referenced by any page
+ const linkedPages = usageMap.pdfs.get(filePath) || usageMap.svgs.get(filePath) || [];
+
+ index.push({
+ path: filePath,
+ usedIn: linkedPages,
+ timestamp: fileEvent.timestamp,
+ user: fileEvent.user,
+ type: getFileType(filePath),
+ status: linkedPages.length > 0 ? 'referenced' : 'file-unused',
+ source: 'auditlog-parsed'
+ });
+
+ } else if (isFragment(filePath)) {
+ // Check if referenced by any page
+ const linkedPages = usageMap.fragments.get(filePath) || [];
+
+ index.push({
+ path: filePath,
+ usedIn: linkedPages,
+ timestamp: fileEvent.timestamp,
+ user: fileEvent.user,
+ type: 'fragment',
+ status: linkedPages.length > 0 ? 'referenced' : 'file-unused',
+ source: 'auditlog-parsed'
+ });
+
+ } else if (isImage(filePath)) {
+ // Check if in medialog (standalone upload)
+ const mediaEntry = mediaLogEntries.find(m =>
+ m.originalFilename === filePath &&
+ Math.abs(m.timestamp - fileEvent.timestamp) < 5000
+ );
+
+ if (mediaEntry) {
+ index.push({
+ hash: mediaEntry.mediaHash,
+ url: mediaEntry.path,
+ name: extractFromOriginalFilename(mediaEntry.originalFilename),
+ originalFilename: mediaEntry.originalFilename,
+ timestamp: mediaEntry.timestamp,
+ user: mediaEntry.user,
+ status: 'uploaded-unused',
+ source: 'medialog',
+ type: 'image'
+ });
+ }
+ }
+ }
+
+ // 6. Save index
+ await saveMediaSheet(index, `/${org}/${repo}`);
+ await saveLogMeta(`/${org}/${repo}`, {
+ lastFetchTime: Date.now(),
+ processedItems: index.length
+ });
+
+ return index;
+}
+```
+
+### Incremental Update Algorithm
+
+```javascript
+async function updateIndex(org, repo, ref = 'main') {
+ // 1. Load existing state
+ const meta = await loadLogMeta(`/${org}/${repo}`);
+ const existingIndex = await loadMediaSheet(`/${org}/${repo}`);
+ const lastFetchTime = meta?.lastFetchTime || null;
+
+ // 2. Fetch new events since last update
+  const newAuditLog = await fetchAuditLog(org, repo, ref, lastFetchTime); // since = lastFetchTime
+  const newMediaLog = await fetchMediaLog(org, repo, ref, lastFetchTime);
+
+ if (newAuditLog.length === 0 && newMediaLog.length === 0) {
+ return existingIndex; // No updates needed
+ }
+
+ // 3. Parse newly previewed pages
+ const newPages = newAuditLog.filter(e => isPage(e.path));
+ const newUsageMap = await buildContentUsageMap(newPages, org, repo);
+
+ // 4. Process each new page event
+ const updatedIndex = [...existingIndex];
+
+ for (const pageEvent of newPages) {
+ const normalizedPath = normalizePath(pageEvent.path);
+
+ // Find new medialog entries for this page
+ const newPageMedia = newMediaLog.filter(m =>
+ m.resourcePath === normalizedPath &&
+ m.timestamp >= pageEvent.timestamp &&
+ m.timestamp < pageEvent.timestamp + 5000
+ );
+
+ // Get old state from index
+ const oldPageEntries = existingIndex.filter(e =>
+ e.page === normalizedPath && e.source === 'medialog'
+ );
+ const oldHashes = new Set(oldPageEntries.map(e => e.hash));
+ const newHashes = new Set(newPageMedia.map(m => m.mediaHash));
+
+ // Handle ambiguous case: no medialog entries
+ if (newPageMedia.length === 0 && oldPageEntries.length > 0) {
+ // Option A: Parse to verify (recommended for high-value pages)
+ const shouldVerify = oldPageEntries.length >= 5 || isImportantPage(normalizedPath);
+
+ if (shouldVerify) {
+ const hasImages = await quickParseForImages(pageEvent.path, org, repo);
+
+ if (hasImages === false) {
+ // Confirmed: All removed
+ for (const oldEntry of oldPageEntries) {
+ const idx = updatedIndex.indexOf(oldEntry);
+ if (idx !== -1) updatedIndex.splice(idx, 1);
+ }
+ } else if (hasImages === true) {
+ // Data inconsistency - log for investigation
+ console.warn('Data inconsistency detected:', normalizedPath);
+ }
+ } else {
+ // Option B: Trust logs, assume removed
+ for (const oldEntry of oldPageEntries) {
+ const idx = updatedIndex.indexOf(oldEntry);
+ if (idx !== -1) updatedIndex.splice(idx, 1);
+ }
+ }
+
+ continue;
+ }
+
+ // Detect changes
+ const added = [...newHashes].filter(h => !oldHashes.has(h));
+ const removed = [...oldHashes].filter(h => !newHashes.has(h));
+ const unchanged = [...newHashes].filter(h => oldHashes.has(h));
+
+ // Apply changes
+
+ // 1. Remove deleted media
+ for (const hash of removed) {
+ const idx = updatedIndex.findIndex(e =>
+ e.hash === hash && e.page === normalizedPath
+ );
+ if (idx !== -1) {
+ updatedIndex.splice(idx, 1);
+ }
+ }
+
+ // 2. Add new media
+ for (const hash of added) {
+ const mediaEntry = newPageMedia.find(m => m.mediaHash === hash);
+ updatedIndex.push({
+ hash: mediaEntry.mediaHash,
+ page: normalizedPath,
+ url: mediaEntry.path,
+ name: extractName(mediaEntry),
+ timestamp: mediaEntry.timestamp,
+ user: mediaEntry.user,
+ operation: mediaEntry.operation,
+ status: 'referenced',
+ source: 'medialog',
+ type: 'image'
+ });
+ }
+
+ // 3. Update timestamps for unchanged media
+ for (const hash of unchanged) {
+ const idx = updatedIndex.findIndex(e =>
+ e.hash === hash && e.page === normalizedPath
+ );
+ if (idx !== -1) {
+ const mediaEntry = newPageMedia.find(m => m.mediaHash === hash);
+ updatedIndex[idx].timestamp = mediaEntry.timestamp;
+ }
+ }
+ }
+
+ // 5. Update linked content (PDFs, SVGs, Fragments) from usage map
+ for (const fileEvent of newAuditLog.filter(e => !isPage(e.path))) {
+ const filePath = fileEvent.path;
+
+ if (isPdfOrSvg(filePath) || isFragment(filePath)) {
+ const usageKey = isPdf(filePath) ? 'pdfs' :
+ isSvg(filePath) ? 'svgs' : 'fragments';
+ const linkedPages = newUsageMap[usageKey].get(filePath) || [];
+
+ // Update or create entry
+ const existingIdx = updatedIndex.findIndex(e => e.path === filePath);
+
+ if (existingIdx !== -1) {
+ // Update existing entry
+ updatedIndex[existingIdx].usedIn = linkedPages;
+ updatedIndex[existingIdx].timestamp = fileEvent.timestamp;
+ updatedIndex[existingIdx].status = linkedPages.length > 0 ? 'referenced' : 'file-unused';
+ } else {
+ // Create new entry
+ updatedIndex.push({
+ path: filePath,
+ usedIn: linkedPages,
+ timestamp: fileEvent.timestamp,
+ user: fileEvent.user,
+ type: getFileType(filePath),
+ status: linkedPages.length > 0 ? 'referenced' : 'file-unused',
+ source: 'auditlog-parsed'
+ });
+ }
+ }
+ }
+
+ // 6. Save updated index
+ await saveMediaSheet(updatedIndex, `/${org}/${repo}`);
+ await saveLogMeta(`/${org}/${repo}`, {
+ lastFetchTime: Date.now(),
+ processedItems: updatedIndex.length
+ });
+
+ return updatedIndex;
+}
+```
+
+### Content Usage Map Builder
+
+```javascript
+async function buildContentUsageMap(pageLogEntries, org, repo) {
+ const usageMap = {
+ fragments: new Map(),
+ pdfs: new Map(),
+ svgs: new Map(),
+ };
+
+ for (const pageEvent of pageLogEntries) {
+ try {
+ // Fetch page HTML
+ const html = await fetchPageHtml(pageEvent.path, org, repo);
+
+ // Extract all content types in single pass
+ const fragments = extractFragmentReferences(html);
+ const pdfs = extractLinks(html, /\.pdf$/);
+ const svgs = extractLinks(html, /\.svg$/);
+
+ const normalizedPage = normalizePath(pageEvent.path);
+
+ // Build usage maps
+ fragments.forEach(f => {
+ if (!usageMap.fragments.has(f)) {
+ usageMap.fragments.set(f, []);
+ }
+ if (!usageMap.fragments.get(f).includes(normalizedPage)) {
+ usageMap.fragments.get(f).push(normalizedPage);
+ }
+ });
+
+ pdfs.forEach(p => {
+ if (!usageMap.pdfs.has(p)) {
+ usageMap.pdfs.set(p, []);
+ }
+ if (!usageMap.pdfs.get(p).includes(normalizedPage)) {
+ usageMap.pdfs.get(p).push(normalizedPage);
+ }
+ });
+
+ svgs.forEach(s => {
+ if (!usageMap.svgs.has(s)) {
+ usageMap.svgs.set(s, []);
+ }
+ if (!usageMap.svgs.get(s).includes(normalizedPage)) {
+ usageMap.svgs.get(s).push(normalizedPage);
+ }
+ });
+
+ } catch (error) {
+ console.error(`Failed to parse page ${pageEvent.path}:`, error);
+ // Continue with other pages
+ }
+ }
+
+ return usageMap;
+}
+
+async function fetchPageHtml(pagePath, org, repo, ref = 'main') {
+ const url = `https://${ref}--${repo}--${org}.aem.page${pagePath}`;
+ const response = await fetch(url);
+ if (!response.ok) {
+ throw new Error(`Failed to fetch ${url}: ${response.status}`);
+ }
+ return response.text();
+}
+
+function extractFragmentReferences(html) {
+ const fragmentPattern = /href="([^"]*\/fragments\/[^"]+)"/g;
+ const matches = [...html.matchAll(fragmentPattern)];
+ return matches.map(m => normalizePath(m[1]));
+}
+
+function extractLinks(html, pattern) {
+ const linkPattern = new RegExp(`(?:href|src)="([^"]*${pattern.source})"`, 'gi');
+ const matches = [...html.matchAll(linkPattern)];
+ return matches.map(m => normalizePath(m[1]));
+}
+
+async function quickParseForImages(pagePath, org, repo, ref = 'main') {
+ try {
+ const html = await fetchPageHtml(pagePath, org, repo, ref);
+ // Simple check: does it contain <img> tags with media_ URLs?
+ return html.includes('media_') && html.includes('<img');
+ } catch (error) {
+ return null; // Cannot determine (fetch/parse failure)
+ }
+}
+
+function normalizePath(path) {
+ // Remove query params and hash: /drafts/page.md?x=1 -> /drafts/page.md
+ let cleanPath = path.split('?')[0].split('#')[0];
+
+ // Add .md extension for extensionless pages: /drafts/page -> /drafts/page.md
+ if (!cleanPath.includes('.') && !cleanPath.startsWith('/media/')) {
+ cleanPath = `${cleanPath}.md`;
+ }
+
+ return cleanPath;
+}
+
+function extractName(mediaEntry) {
+ // For "ingest" with originalFilename
+ if (mediaEntry.operation === 'ingest' && mediaEntry.originalFilename) {
+ return mediaEntry.originalFilename.split('/').pop();
+ }
+
+ // For "reuse" or "ingest" without originalFilename
+ const cleanPath = mediaEntry.path.split('?')[0].split('#')[0];
+ return cleanPath.split('/').pop();
+}
+
+function isPage(path) {
+ return (path.endsWith('.md') ||
+ (!path.includes('.') && !path.startsWith('/media/'))) &&
+ !path.includes('/fragments/');
+}
+
+function isPdfOrSvg(path) {
+ return path.endsWith('.pdf') || path.endsWith('.svg');
+}
+
+function isPdf(path) {
+ return path.endsWith('.pdf');
+}
+
+function isSvg(path) {
+ return path.endsWith('.svg');
+}
+
+function isFragment(path) {
+ return path.includes('/fragments/');
+}
+
+function isImage(path) {
+ const imageExts = ['.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp'];
+ return imageExts.some(ext => path.toLowerCase().endsWith(ext));
+}
+
+function getFileType(path) {
+ if (path.endsWith('.pdf')) return 'pdf';
+ if (path.endsWith('.svg')) return 'svg';
+ if (path.includes('/fragments/')) return 'fragment';
+ if (isImage(path)) return 'image';
+ return 'unknown';
+}
+```
+
+---
+
+## Edge Cases & Handling
+
+### 1. Missing Auditlog Entry for Medialog Events
+
+**Scenario:** Medialog has entries but no matching auditlog entry
+
+**Causes:**
+- Processing delay (auditlog slower than medialog)
+- Auditlog API failure
+- Events outside fetched time range
+
+**Handling:**
+```javascript
+// Queue orphaned medialog entries for next processing cycle
+const orphanedMedia = mediaLog.filter(m => {
+ return !auditLog.some(a =>
+ normalizePath(a.path) === m.resourcePath &&
+ Math.abs(a.timestamp - m.timestamp) < 10000
+ );
+});
+
+// Retry on next incremental update with wider time range
+```
+
+### 2. Timestamp Drift Beyond 5 Seconds
+
+**Scenario:** Medialog timestamp > 5 seconds after auditlog
+
+**Causes:**
+- Heavy server load
+- Batch processing delays
+- Queue backlog
+
+**Handling:**
+```javascript
+// Adaptive time window
+const timeWindow = calculateAdaptiveWindow(processingLoad);
+// Start at 5s, increase to 10s or 15s if needed
+
+// Or use backup matching by sequence
+matchByTimestampProximity(auditLog, mediaLog);
+```
+
+### 3. Duplicate Hash in Multiple Pages
+
+**Scenario:** Same image used in 5 different pages
+
+**Handling:**
+```javascript
+// Create separate index entry for each page reference
+// Reference count = unique pages
+const referenceCount = index.filter(e => e.hash === targetHash).length;
+
+// Each entry tracks its specific page usage
+```
+
+### 4. Page Deleted (No Preview Events)
+
+**Scenario:** User deletes page entirely via DA
+
+**Handling:**
+```javascript
+// After X days (e.g., 30) without preview events:
+const staleThresholdMs = 30 * 24 * 60 * 60 * 1000;
+const cutoffTime = Date.now() - staleThresholdMs;
+
+const staleEntries = index.filter(e =>
+ e.timestamp < cutoffTime
+);
+
+// Option A: Flag as potentially stale
+staleEntries.forEach(e => e.status = 'potentially-stale');
+
+// Option B: Verify by fetching actual document
+// If 404, mark as unreferenced
+```
+
+### 5. Race Condition: Concurrent Previews
+
+**Scenario:** Two users preview same page simultaneously
+
+**Handling:**
+```javascript
+// Use "last-write-wins" with timestamp comparison
+if (newEntry.timestamp > existingEntry.timestamp) {
+ // New entry is more recent, use it
+ replaceEntry(existingEntry, newEntry);
+} else {
+ // Existing entry is more recent, skip
+ skipEntry(newEntry);
+}
+
+// OR: Keep both entries and deduplicate later
+// based on latest timestamp per hash+page combo
+```
+
+### 6. Ambiguous Empty Medialog
+
+**Scenario:** Auditlog shows page preview, medialog is empty, old state had media
+
+**Possible Causes:**
+- All Media Bus items removed
+- Page now has only PDFs/SVGs/Fragments
+- Processing delay
+- API failure
+
+**Handling:**
+```javascript
+async function handleAmbiguousCase(pageEvent, oldEntries) {
+ // Wait for processing delay
+ await sleep(3000);
+
+ // Retry medialog fetch
+ const retryEntries = await fetchMediaLog(...);
+
+ if (retryEntries.length > 0) {
+ return { action: 'update', entries: retryEntries };
+ }
+
+ // Still empty - parse to verify for high-value pages
+ const shouldVerify = oldEntries.length >= 5 || isImportantPage(pageEvent.path);
+
+ if (shouldVerify) {
+ const hasImages = await quickParseForImages(pageEvent.path);
+
+ if (hasImages === false) {
+ return { action: 'remove-all', confidence: 'high' };
+ } else if (hasImages === true) {
+ return { action: 'flag-investigation', confidence: 'low' };
+ }
+ }
+
+ // Default: assume removed
+ return { action: 'remove-all', confidence: 'medium' };
+}
+```
+
+### 7. Parsing Failures
+
+**Scenario:** Cannot fetch or parse page HTML
+
+**Handling:**
+```javascript
+async function buildContentUsageMap(pageLogEntries, org, repo) {
+ const usageMap = { fragments: new Map(), pdfs: new Map(), svgs: new Map() };
+ const failures = [];
+
+ for (const pageEvent of pageLogEntries) {
+ try {
+ const html = await fetchPageHtml(pageEvent.path, org, repo);
+ // Extract content...
+ } catch (error) {
+ failures.push({ page: pageEvent.path, error: error.message });
+ // Continue with other pages
+ }
+ }
+
+ // Log failures for investigation
+ if (failures.length > 0) {
+ console.warn('Failed to parse pages:', failures);
+ }
+
+ return usageMap;
+}
+```
+
+---
+
+## Performance Considerations
+
+### Scaling Factors
+
+| Factor | Impact | Mitigation |
+|--------|--------|------------|
+| **Number of pages** | O(N) processing time | Batch processing, parallel processing |
+| **Media per page** | O(M) comparison operations | Hash-based lookups instead of linear search |
+| **Event frequency** | Incremental update frequency | Adaptive polling (more frequent when active) |
+| **Index size** | Storage and read time | Compress, paginate, or archive old entries |
+| **Time window size** | False matches | Optimize to 5s, expand only if needed |
+| **Parsing pages** | Network and CPU cost | Cache parsed results, parse only changed pages |
+
+### Optimization Strategies
+
+```javascript
+// 1. Use Map for O(1) lookups instead of Array.filter
+const indexMap = new Map();
+existingIndex.forEach(e => {
+ const key = `${e.hash}|${e.page}`;
+ indexMap.set(key, e);
+});
+
+// 2. Group medialog entries by resourcePath first
+const mediaByPage = groupBy(mediaLog, 'resourcePath');
+
+// 3. Process only changed pages
+const changedPages = new Set(newAuditLog.map(e => e.path));
+
+// 4. Batch writes to DA
+const BATCH_SIZE = 100;
+await saveBatch(updatedEntries, BATCH_SIZE);
+
+// 5. Cache parsed HTML results
+const parseCache = new Map();
+const getCachedHtml = async (page) => {
+ if (!parseCache.has(page)) {
+ parseCache.set(page, await fetchPageHtml(page));
+ }
+ return parseCache.get(page);
+};
+
+// 6. Parallel parsing for multiple pages
+const htmlResults = await Promise.all(
+ pages.map(p => fetchPageHtml(p.path, org, repo))
+);
+```
+
+### Parsing Performance
+
+**Minimize parsing overhead:**
+```javascript
+// Parse only when necessary
+const shouldParse = (pageEvent, oldState) => {
+ // Always parse for initial build
+ if (!oldState) return true;
+
+ // Parse if page has new auditlog event
+ if (pageEvent.timestamp > oldState.lastParsed) return true;
+
+ // Skip if recently parsed
+ const cacheAge = Date.now() - oldState.lastParsed;
+ return cacheAge > (24 * 60 * 60 * 1000); // 24 hours
+};
+
+// Single-pass extraction
+const parsePageContent = (html) => {
+ return {
+ fragments: extractFragmentReferences(html),
+ pdfs: extractLinks(html, /\.pdf$/),
+ svgs: extractLinks(html, /\.svg$/),
+ hasImages: html.includes('media_') && html.includes('<img'),
+ };
+};
+```
+
+---
+
+## Query Patterns
+
+### Reference Counting
+
+```javascript
+function getReferenceCounts(index) {
+ const counts = new Map();
+
+ index.forEach((entry) => {
+ if (entry.status !== 'referenced') return; // Skip unreferenced
+
+ const key = entry.hash || entry.path;
+ if (!counts.has(key)) {
+ counts.set(key, {
+ key,
+ pages: new Set(),
+ lastUsed: 0,
+ type: entry.type
+ });
+ }
+
+ const count = counts.get(key);
+
+ // For Media Bus items (have page field)
+ if (entry.page) {
+ count.pages.add(entry.page);
+ }
+
+ // For linked content (have usedIn field)
+ if (entry.usedIn) {
+ entry.usedIn.forEach(p => count.pages.add(p));
+ }
+
+ if (entry.timestamp > count.lastUsed) {
+ count.lastUsed = entry.timestamp;
+ }
+ });
+
+ // Convert to array with reference counts
+ return Array.from(counts.values()).map(c => ({
+ key: c.key,
+ type: c.type,
+ referenceCount: c.pages.size,
+ pages: Array.from(c.pages),
+ lastUsed: c.lastUsed
+ }));
+}
+```
+
+### Filtering by Content Type
+
+```javascript
+function filterByType(index, type) {
+ return index.filter(e => e.type === type && e.status === 'referenced');
+}
+
+// Examples
+const images = filterByType(index, 'image');
+const pdfs = filterByType(index, 'pdf');
+const fragments = filterByType(index, 'fragment');
+```
+
+### Filtering Unreferenced Media
+
+```javascript
+function getUnreferencedMedia(index) {
+ return index.filter(e =>
+ e.status === 'unreferenced' ||
+ e.status === 'uploaded-unused' ||
+ e.status === 'file-unused'
+ );
+}
+```
+
+### Getting Usage Details for Media Info Panel
+
+```javascript
+function getMediaUsage(index, identifier) {
+ // identifier can be hash (for images) or path (for PDFs/SVGs/fragments)
+ const usageEntries = index.filter(e =>
+ (e.hash === identifier || e.path === identifier) &&
+ e.status === 'referenced'
+ );
+
+ if (usageEntries.length === 0) return [];
+
+ // For Media Bus items (images/videos)
+ if (usageEntries[0].hash) {
+ const byPage = groupBy(usageEntries, 'page');
+
+ return Object.entries(byPage).map(([page, entries]) => ({
+ page,
+ previewCount: entries.length,
+ lastPreview: Math.max(...entries.map(e => e.timestamp)),
+ users: [...new Set(entries.map(e => e.user))]
+ }));
+ }
+
+ // For linked content (PDFs/SVGs/fragments)
+ if (usageEntries[0].usedIn) {
+ return usageEntries[0].usedIn.map(page => ({
+ page,
+ previewCount: 1, // Can't track individual previews for linked content
+ lastPreview: usageEntries[0].timestamp,
+ users: [usageEntries[0].user]
+ }));
+ }
+
+ return [];
+}
+```
+
+---
+
+## Next Steps
+
+### 1. Infrastructure Setup
+
+- [ ] Verify medialog backfill CLI tool is production-ready
+- [ ] Create `/tools/media-indexer` DA app for initial index population
+- [ ] Set up `.da/mediaindex/` directory structure
+- [ ] Document authentication requirements and token management
+- [ ] Test on small pilot repository first
+
+### 2. Implementation Phase
+
+- [ ] Implement initial index build function (in media-indexer app)
+- [ ] Implement incremental update function (in browser)
+- [ ] Add content usage map builder (fragments, PDFs, SVGs)
+- [ ] Implement distributed locking mechanism
+- [ ] Add background auto-refresh with 10-minute interval
+- [ ] Create user-triggered refresh UI
+- [ ] Add error handling and retry logic
+- [ ] Test with production data at scale
+
+### 3. Testing & Validation
+
+- [ ] Unit tests for matching logic
+- [ ] Integration tests with real logs
+- [ ] Performance testing with large datasets (10,000+ pages)
+- [ ] Validate reference counts accuracy
+- [ ] Test parsing extraction functions
+- [ ] Test distributed lock under concurrent load
+- [ ] Test browser crash recovery (lock timeout)
+- [ ] Validate medialog backfill completeness
+
+### 4. Operational Readiness
+
+- [ ] Write operational runbooks for engineers
+- [ ] Create monitoring dashboards for index health
+- [ ] Document troubleshooting procedures
+- [ ] Set up alerts for failures (lock timeouts, parse errors)
+- [ ] Establish maintenance schedule (monthly rebuilds)
+- [ ] Train support team on index operations
+
+### 5. Monitoring & Maintenance
+
+- [ ] Log processing metrics (time, entries, errors)
+- [ ] Alert on anomalies (orphaned entries, large drifts)
+- [ ] Periodic full rebuild (weekly/monthly)
+- [ ] Dashboard for index health
+- [ ] Track parsing failures and success rates
+- [ ] Monitor lock contention and timeout rates
+- [ ] Track refresh performance across site sizes
+
+### 6. Future Enhancements
+
+- [ ] Handle "delete" operations when API confirmed
+- [ ] Add support for video tracking (similar to images)
+- [ ] Implement pagination for large indexes
+- [ ] Add caching layer for frequent queries
+- [ ] Explore real-time updates via webhooks
+- [ ] Optimize parsing performance (parallel processing, caching)
+- [ ] Implement leader election for background refresh
+- [ ] Add incremental medialog backfill for updated pages
+- [ ] Explore worker-based refresh (replace browser-based)
+
+---
+
+## References
+
+- **AEM Media Documentation:** https://www.aem.live/docs/media
+- **Auditlog API:** https://www.aem.live/docs/admin.html#tag/log/operation/getLogs
+- **Medialog API:** (Similar to auditlog, dedicated for Media Bus)
+- **Testing Repository:** `kmurugulla/brightpath`
+- **Test Date:** February 24, 2026
+
+---
+
+## Appendix: Test Data Summary
+
+### All Scenarios Tested
+
+| Scenario | Pages | Media | Key Learning |
+|----------|-------|-------|--------------|
+| A | scenario-a.md | 3 images | First-time ingest via markup |
+| B | scenario-b.md | 0 (text) | Auditlog without medialog |
+| H | Standalone files | 1 image, 1 PDF, 1 SVG | Standalone vs embedded |
+| G | scenario-g.md | 2 images, links | Mixed media behavior, parsing required |
+| Re-preview | All 3 pages | Same media | Duplicate event handling |
+
+### Timestamp Patterns Observed
+
+| Event Type | Typical Delay | Range Observed |
+|------------|--------------|----------------|
+| Auditlog to Medialog | 1.5-2 seconds | 800ms - 3400ms |
+| Multi-page bulk preview | Nearly simultaneous | 4-9ms apart |
+| Media on same page | Identical timestamp | 0ms (exact match) |
+
+### Content Type Tracking Summary
+
+| Content | Logs | Parsing | Usage Link |
+|---------|------|---------|------------|
+| Images/Videos | Medialog | No | resourcePath field |
+| PDFs/SVGs/Fragments | Auditlog only | Yes | Parse HTML |
+
+---
+
+**Last Updated:** February 24, 2026