From aa9b1683fa6421e56670faa826fe1a071b4c3641 Mon Sep 17 00:00:00 2001
From: rcholic
Date: Sun, 21 Dec 2025 18:16:41 -0800
Subject: [PATCH 1/2] handle markdown read

---
 README.md                 |  23 ++++++++
 examples/read-markdown.ts |  67 ++++++++++++++++++++++
 src/index.ts              |   1 +
 src/read.ts               | 135 +++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 226 insertions(+)
 create mode 100644 examples/read-markdown.ts
 create mode 100644 src/read.ts

diff --git a/README.md b/README.md
index 51639242..34e25502 100644
--- a/README.md
+++ b/README.md
@@ -97,6 +97,28 @@ npm run build
 - `.toHaveText(text)`
 - `.toHaveCount(n)`
 
+### Content Reading
+- `read(browser, options)` - Read page content
+  - **Default format: `"raw"`** - Returns HTML suitable for Turndown
+  - `format: "raw"` - Get cleaned HTML
+  - `format: "markdown"` - Get high-quality markdown (uses Turndown internally)
+  - `format: "text"` - Get plain text
+
+  **Examples:**
+  ```typescript
+  import { read } from './src';
+
+  // Get raw HTML (default)
+  const result = await read(browser);
+  const html = result.content;
+
+  // Get high-quality markdown (uses Turndown automatically)
+  const mdResult = await read(browser, { format: 'markdown' });
+  const markdown = mdResult.content;
+  ```
+
+  See `examples/read-markdown.ts` for complete examples.
+
 ## Examples
 
 See `examples/` directory:
@@ -104,6 +126,7 @@ See `examples/` directory:
 - `basic-agent.ts` - Basic snapshot
 - `query-demo.ts` - Query engine
 - `wait-and-click.ts` - Wait and actions
+- `read-markdown.ts` - Reading page content and converting to markdown
 
 ## Testing
 
diff --git a/examples/read-markdown.ts b/examples/read-markdown.ts
new file mode 100644
index 00000000..2bb8107b
--- /dev/null
+++ b/examples/read-markdown.ts
@@ -0,0 +1,67 @@
+/**
+ * Example: Reading page content and converting to markdown
+ *
+ * This example shows how to use the read() function to get page content
+ * and convert it to high-quality markdown using Turndown.
+ */
+
+import { SentienceBrowser, read } from '../src';
+import TurndownService from 'turndown';
+
+async function main() {
+  // Initialize browser
+  const browser = new SentienceBrowser();
+  await browser.start();
+
+  try {
+    // Navigate to a page
+    await browser.getPage().goto('https://example.com');
+    await browser.getPage().waitForLoadState('networkidle');
+
+    // Method 1: Get raw HTML (default) and convert with Turndown
+    console.log('=== Method 1: Raw HTML + Turndown (Recommended) ===');
+    const result = await read(browser); // format="raw" is default
+    const htmlContent = result.content;
+
+    // Convert to markdown using Turndown (better quality)
+    const turndownService = new TurndownService({
+      headingStyle: 'atx', // Use # for headings
+      bulletListMarker: '-', // Use - for lists
+      codeBlockStyle: 'fenced', // Use ``` for code blocks
+    });
+
+    // Add custom rules for better conversion
+    turndownService.addRule('strikethrough', {
+      filter: ['del', 's', 'strike'] as any,
+      replacement: (content: string) => `~~${content}~~`,
+    });
+
+    // Strip unwanted tags
+    turndownService.remove(['script', 'style', 'nav', 'footer', 'header', 'noscript']);
+
+    const markdown = turndownService.turndown(htmlContent);
+    console.log(`Markdown length: ${markdown.length} characters`);
+    console.log(markdown.substring(0, 500)); // Print first 500 chars
+    console.log('\n');
+
+    // Method 2: Get high-quality markdown directly (uses Turndown internally)
+    console.log('=== Method 2: Direct markdown (High-quality via Turndown) ===');
+    const result2 = await read(browser, { format: 'markdown' });
+    const highQualityMarkdown = result2.content;
+    console.log(`Markdown length: ${highQualityMarkdown.length} characters`);
+    console.log(highQualityMarkdown.substring(0, 500)); // Print first 500 chars
+    console.log('\n');
+
+    // Method 3: Get plain text
+    console.log('=== Method 3: Plain text ===');
+    const result3 = await read(browser, { format: 'text' });
+    const textContent = result3.content;
+    console.log(`Text length: ${textContent.length} characters`);
+    console.log(textContent.substring(0, 500)); // Print first 500 chars
+  } finally {
+    await browser.close();
+  }
+}
+
+main().catch(console.error);
+
diff --git a/src/index.ts b/src/index.ts
index 442e0f92..947ef624 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -11,5 +11,6 @@ export { expect, Expectation } from './expect';
 export { Inspector, inspect } from './inspector';
 export { Recorder, Trace, TraceStep, record } from './recorder';
 export { ScriptGenerator, generate } from './generator';
+export { read, ReadOptions, ReadResult } from './read';
 export * from './types';
 
diff --git a/src/read.ts b/src/read.ts
new file mode 100644
index 00000000..4c77e9a3
--- /dev/null
+++ b/src/read.ts
@@ -0,0 +1,135 @@
+/**
+ * Read page content - supports raw HTML, text, and markdown formats
+ */
+
+import { SentienceBrowser } from './browser';
+import TurndownService from 'turndown';
+
+/** Options accepted by {@link read}. */
+export interface ReadOptions {
+  /** Output format; `read` falls back to `'raw'` when omitted. */
+  format?: 'raw' | 'text' | 'markdown';
+}
+
+/** Result envelope returned by {@link read}. */
+export interface ReadResult {
+  status: 'success' | 'error';
+  url: string;
+  format: 'raw' | 'text' | 'markdown';
+  content: string;
+  length: number;
+  /** Failure description; `read` sets it when markdown conversion fails. */
+  error?: string;
+}
+
+type ReadFormat = NonNullable<ReadOptions['format']>;
+type BrowserPage = ReturnType<SentienceBrowser['getPage']>;
+
+/**
+ * Call `window.sentience.read` inside the page and return its result.
+ *
+ * The value is cast, not validated - NOTE(review): assumes the injected API
+ * returns a `ReadResult`-shaped object; confirm against the extension.
+ */
+async function readFromExtension(
+  page: BrowserPage,
+  format: ReadFormat
+): Promise<ReadResult> {
+  const result = await page.evaluate(
+    (opts) => {
+      return (window as any).sentience.read(opts);
+    },
+    { format }
+  );
+  return result as ReadResult;
+}
+
+/** Build the Turndown converter used for `format: 'markdown'`. */
+function createTurndownService(): TurndownService {
+  const service = new TurndownService({
+    headingStyle: 'atx', // Use # for headings
+    bulletListMarker: '-', // Use - for lists
+    codeBlockStyle: 'fenced', // Use ``` for code blocks
+  });
+
+  // Render del/s/strike elements as ~~text~~
+  service.addRule('strikethrough', {
+    filter: ['del', 's', 'strike'] as any,
+    replacement: (content: string) => `~~${content}~~`,
+  });
+
+  // Strip unwanted tags
+  service.remove(['script', 'style', 'nav', 'footer', 'header', 'noscript']);
+
+  return service;
+}
+
+/**
+ * Read page content as raw HTML, text, or markdown
+ *
+ * For `'markdown'`, raw HTML is requested from the extension and converted
+ * with Turndown; `'raw'` and `'text'` results are returned exactly as the
+ * extension produced them.
+ *
+ * @param browser - SentienceBrowser instance
+ * @param options - Read options; `format` defaults to `'raw'`
+ * @returns ReadResult with page content, or `status: 'error'` with `error`
+ *   set when markdown conversion fails
+ * @throws Rejects with whatever `page.evaluate` rejects with; that call is
+ *   not wrapped in a try/catch
+ *
+ * @example
+ * // Get raw HTML (default)
+ * const result = await read(browser);
+ * const htmlContent = result.content;
+ *
+ * @example
+ * // Get high-quality markdown (uses Turndown internally)
+ * const result = await read(browser, { format: 'markdown' });
+ * const markdown = result.content;
+ *
+ * @example
+ * // Get plain text
+ * const result = await read(browser, { format: 'text' });
+ * const text = result.content;
+ */
+export async function read(
+  browser: SentienceBrowser,
+  options: ReadOptions = {}
+): Promise<ReadResult> {
+  const page = browser.getPage();
+  const format = options.format ?? 'raw'; // Default to 'raw' for Turndown compatibility
+
+  // "raw" and "text" come straight from the extension
+  if (format !== 'markdown') {
+    return readFromExtension(page, format);
+  }
+
+  // For markdown, get raw HTML first, then convert with Turndown
+  const rawResult = await readFromExtension(page, 'raw');
+  if (rawResult.status !== 'success') {
+    // Report the requested format, matching the conversion-failure result below
+    return { ...rawResult, format: 'markdown' };
+  }
+
+  try {
+    const markdownContent = createTurndownService().turndown(rawResult.content);
+    return {
+      status: 'success',
+      url: rawResult.url,
+      format: 'markdown',
+      content: markdownContent,
+      length: markdownContent.length,
+    };
+  } catch (e) {
+    // If conversion fails, return error
+    return {
+      status: 'error',
+      url: rawResult.url,
+      format: 'markdown',
+      content: '',
+      length: 0,
+      error: `Markdown conversion failed: ${e}`,
+    };
+  }
+}

From 98d6942a978b1b99afca4ec3c793c2241e852714 Mon Sep 17 00:00:00 2001
From: rcholic
Date: Sun, 21 Dec 2025 18:19:06 -0800
Subject: [PATCH 2/2] fix sync extension

---
 .github/workflows/sync-extension.yml | 88 +++++++++++++++++++++++-----
 1 file changed, 74 insertions(+), 14 deletions(-)

diff --git a/.github/workflows/sync-extension.yml b/.github/workflows/sync-extension.yml
index 7fa2ff87..1a2dcefe 100644
--- a/.github/workflows/sync-extension.yml
+++ b/.github/workflows/sync-extension.yml
@@ -62,31 +62,91 @@ jobs:
           mkdir -p extension-temp
           cd extension-temp
 
-          # Download each file from release
+          # Download each matching release asset (.js, .wasm, .json, .d.ts) individually
+          echo "📁 Downloading individual files from release..."
           curl -L -H "Authorization: token ${{ secrets.SENTIENCE_CHROME_TOKEN }}" \
             "https://api.github.com/repos/$REPO/releases/tags/$TAG" | \
-            jq -r '.assets[] | select(.name | endswith(".js") or endswith(".wasm") or endswith(".json") or endswith(".d.ts")) | .browser_download_url' | \
-            while read url; do
-              filename=$(basename "$url")
-              curl -L -H "Authorization: token ${{ secrets.SENTIENCE_CHROME_TOKEN }}" "$url" -o "$filename"
+            jq -r '.assets[] | select(.name | endswith(".js") or endswith(".wasm") or endswith(".json") or endswith(".d.ts")) | "\(.browser_download_url)|\(.name)"' | \
+            while IFS='|' read -r url name; do
+              if [ -n "$url" ] && [ "$url" != "null" ] && [ -n "$name" ]; then
+                # Preserve directory structure from asset name
+                # If name contains '/', create its parent directories before writing the file
+                dir=$(dirname "$name")
+                if [ "$dir" != "." ]; then
+                  mkdir -p "$dir"
+                fi
+                echo " Downloading $name..."
+                curl -L -H "Authorization: token ${{ secrets.SENTIENCE_CHROME_TOKEN }}" "$url" -o "$name"
+              fi
             done
 
+          # List downloaded files in the job log (listing only; nothing is validated)
+          echo "📋 Downloaded files:"
+          ls -la
+
       - name: Copy extension files
         if: steps.release.outputs.skip != 'true'
         run: |
           # Create extension directory structure
           mkdir -p src/extension/pkg
 
-          # Copy extension files
-          cp extension-temp/manifest.json src/extension/ 2>/dev/null || echo "manifest.json not found in release"
-          cp extension-temp/content.js src/extension/ 2>/dev/null || echo "content.js not found in release"
-          cp extension-temp/background.js src/extension/ 2>/dev/null || echo "background.js not found in release"
-          cp extension-temp/injected_api.js src/extension/ 2>/dev/null || echo "injected_api.js not found in release"
+          # Copy extension files (handle both root and extension-package/ subdirectory)
+          # Check root first, then extension-package/ subdirectory; a missing file only logs a warning
+          if [ -f "extension-temp/manifest.json" ]; then
+            cp extension-temp/manifest.json src/extension/
+          elif [ -f "extension-temp/extension-package/manifest.json" ]; then
+            cp extension-temp/extension-package/manifest.json src/extension/
+          else
+            echo "⚠️ manifest.json not found"
+          fi
+
+          if [ -f "extension-temp/content.js" ]; then
+            cp extension-temp/content.js src/extension/
+          elif [ -f "extension-temp/extension-package/content.js" ]; then
+            cp extension-temp/extension-package/content.js src/extension/
+          else
+            echo "⚠️ content.js not found"
+          fi
+
+          if [ -f "extension-temp/background.js" ]; then
+            cp extension-temp/background.js src/extension/
+          elif [ -f "extension-temp/extension-package/background.js" ]; then
+            cp extension-temp/extension-package/background.js src/extension/
+          else
+            echo "⚠️ background.js not found"
+          fi
 
-          # Copy WASM files
-          cp extension-temp/pkg/sentience_core.js src/extension/pkg/ 2>/dev/null || echo "sentience_core.js not found"
-          cp extension-temp/pkg/sentience_core_bg.wasm src/extension/pkg/ 2>/dev/null || echo "sentience_core_bg.wasm not found"
-          cp extension-temp/pkg/*.d.ts src/extension/pkg/ 2>/dev/null || echo "Type definitions not found"
+          if [ -f "extension-temp/injected_api.js" ]; then
+            cp extension-temp/injected_api.js src/extension/
+          elif [ -f "extension-temp/extension-package/injected_api.js" ]; then
+            cp extension-temp/extension-package/injected_api.js src/extension/
+          else
+            echo "⚠️ injected_api.js not found"
+          fi
+
+          # Copy WASM files (check pkg/ first, then extension-package/pkg/)
+          if [ -f "extension-temp/pkg/sentience_core.js" ]; then
+            cp extension-temp/pkg/sentience_core.js src/extension/pkg/
+          elif [ -f "extension-temp/extension-package/pkg/sentience_core.js" ]; then
+            cp extension-temp/extension-package/pkg/sentience_core.js src/extension/pkg/
+          else
+            echo "⚠️ sentience_core.js not found"
+          fi
+
+          if [ -f "extension-temp/pkg/sentience_core_bg.wasm" ]; then
+            cp extension-temp/pkg/sentience_core_bg.wasm src/extension/pkg/
+          elif [ -f "extension-temp/extension-package/pkg/sentience_core_bg.wasm" ]; then
+            cp extension-temp/extension-package/pkg/sentience_core_bg.wasm src/extension/pkg/
+          else
+            echo "⚠️ sentience_core_bg.wasm not found"
+          fi
+
+          # Copy TypeScript definitions from the first pkg/ directory that exists (no warning if neither exists)
+          if [ -d "extension-temp/pkg" ]; then
+            cp extension-temp/pkg/*.d.ts src/extension/pkg/ 2>/dev/null || echo "⚠️ Type definitions not found"
+          elif [ -d "extension-temp/extension-package/pkg" ]; then
+            cp extension-temp/extension-package/pkg/*.d.ts src/extension/pkg/ 2>/dev/null || echo "⚠️ Type definitions not found"
+          fi
 
       - name: Check for changes
         if: steps.release.outputs.skip != 'true'