Reviewer notes (free text before the first "diff --git" header is ignored by git apply / patch):
* Layout: this patch had been collapsed onto a few long lines. Line breaks are restored and
  indentation is re-created by convention, so applying it may need --ignore-whitespace.
* sync-extension.yml: the download loop writes each asset to -o "$name", so flat asset names land
  directly in extension-temp/. The root-level fallbacks for sentience_core.js,
  sentience_core_bg.wasm and *.d.ts are therefore kept (as context lines) next to the new
  extension-package/pkg checks. Hunk headers are recounted for the extra lines.
* sync-extension.yml: the token comment now matches the expression (no GITHUB_TOKEN fallback).
* README.md: the second example result is renamed so the snippet has no duplicate const.
diff --git a/.github/workflows/sync-extension.yml b/.github/workflows/sync-extension.yml
index 201a45ce..6d195e21 100644
--- a/.github/workflows/sync-extension.yml
+++ b/.github/workflows/sync-extension.yml
@@ -63,36 +63,25 @@ jobs:
           mkdir -p extension-temp
           cd extension-temp
 
-          # First, try to download the zip archive if available
-          ZIP_URL=$(curl -s -H "Authorization: token ${{ secrets.SENTIENCE_CHROME_TOKEN }}" \
+          # Download individual files from release (reliable method - no zip)
+          echo "📁 Downloading individual files from release..."
+          curl -L -H "Authorization: token ${{ secrets.SENTIENCE_CHROME_TOKEN }}" \
             "https://api.github.com/repos/$REPO/releases/tags/$TAG" | \
-            jq -r '.assets[] | select(.name == "extension-package.zip") | .browser_download_url')
-
-          if [ -n "$ZIP_URL" ] && [ "$ZIP_URL" != "null" ]; then
-            echo "📦 Downloading extension-package.zip..."
-            curl -L -H "Authorization: token ${{ secrets.SENTIENCE_CHROME_TOKEN }}" "$ZIP_URL" -o extension-package.zip
-            unzip -q extension-package.zip -d .
-            # Files should now be in extension-temp/extension-package/ or extension-temp/
-            if [ -d "extension-package" ]; then
-              mv extension-package/* . 2>/dev/null || true
-              rmdir extension-package 2>/dev/null || true
-            fi
-          else
-            echo "📁 Downloading individual files from release..."
-            # Download each file from release
-            curl -s -H "Authorization: token ${{ secrets.SENTIENCE_CHROME_TOKEN }}" \
-              "https://api.github.com/repos/$REPO/releases/tags/$TAG" | \
-              jq -r '.assets[] | select(.name | endswith(".js") or endswith(".wasm") or endswith(".json") or endswith(".d.ts")) | .browser_download_url' | \
-              while read url; do
-                if [ -n "$url" ] && [ "$url" != "null" ]; then
-                  filename=$(basename "$url")
-                  echo " Downloading $filename..."
-                  curl -L -H "Authorization: token ${{ secrets.SENTIENCE_CHROME_TOKEN }}" "$url" -o "$filename"
+            jq -r '.assets[] | select(.name | endswith(".js") or endswith(".wasm") or endswith(".json") or endswith(".d.ts")) | "\(.browser_download_url)|\(.name)"' | \
+            while IFS='|' read -r url name; do
+              if [ -n "$url" ] && [ "$url" != "null" ] && [ -n "$name" ]; then
+                # Preserve directory structure from asset name
+                # If name contains '/', create directories
+                dir=$(dirname "$name")
+                if [ "$dir" != "." ]; then
+                  mkdir -p "$dir"
                 fi
-              done
-          fi
+                echo " Downloading $name..."
+                curl -L -H "Authorization: token ${{ secrets.SENTIENCE_CHROME_TOKEN }}" "$url" -o "$name"
+              fi
+            done
 
-          # Verify files were downloaded
+          # Verify downloaded files
           echo "📋 Downloaded files:"
           ls -la
 
@@ -102,25 +91,57 @@ jobs:
           # Create extension directory structure
           mkdir -p src/extension/pkg
 
-          # Copy extension files (check both root and pkg subdirectory)
-          cp extension-temp/manifest.json src/extension/ 2>/dev/null || echo "⚠️ manifest.json not found in release"
-          cp extension-temp/content.js src/extension/ 2>/dev/null || echo "⚠️ content.js not found in release"
-          cp extension-temp/background.js src/extension/ 2>/dev/null || echo "⚠️ background.js not found in release"
-          cp extension-temp/injected_api.js src/extension/ 2>/dev/null || echo "⚠️ injected_api.js not found in release"
+          # Copy extension files (handle both root and extension-package/ subdirectory)
+          # Check root first, then extension-package/ subdirectory
+          if [ -f "extension-temp/manifest.json" ]; then
+            cp extension-temp/manifest.json src/extension/
+          elif [ -f "extension-temp/extension-package/manifest.json" ]; then
+            cp extension-temp/extension-package/manifest.json src/extension/
+          else
+            echo "⚠️ manifest.json not found"
+          fi
+
+          if [ -f "extension-temp/content.js" ]; then
+            cp extension-temp/content.js src/extension/
+          elif [ -f "extension-temp/extension-package/content.js" ]; then
+            cp extension-temp/extension-package/content.js src/extension/
+          else
+            echo "⚠️ content.js not found"
+          fi
+
+          if [ -f "extension-temp/background.js" ]; then
+            cp extension-temp/background.js src/extension/
+          elif [ -f "extension-temp/extension-package/background.js" ]; then
+            cp extension-temp/extension-package/background.js src/extension/
+          else
+            echo "⚠️ background.js not found"
+          fi
+
+          if [ -f "extension-temp/injected_api.js" ]; then
+            cp extension-temp/injected_api.js src/extension/
+          elif [ -f "extension-temp/extension-package/injected_api.js" ]; then
+            cp extension-temp/extension-package/injected_api.js src/extension/
+          else
+            echo "⚠️ injected_api.js not found"
+          fi
 
-          # Copy WASM files (check both root and pkg subdirectory)
+          # Copy WASM files (check pkg/, extension-package/pkg/, then the flat download root)
           if [ -f "extension-temp/pkg/sentience_core.js" ]; then
             cp extension-temp/pkg/sentience_core.js src/extension/pkg/
+          elif [ -f "extension-temp/extension-package/pkg/sentience_core.js" ]; then
+            cp extension-temp/extension-package/pkg/sentience_core.js src/extension/pkg/
           elif [ -f "extension-temp/sentience_core.js" ]; then
             cp extension-temp/sentience_core.js src/extension/pkg/
           else
             echo "⚠️ sentience_core.js not found"
           fi
 
           if [ -f "extension-temp/pkg/sentience_core_bg.wasm" ]; then
             cp extension-temp/pkg/sentience_core_bg.wasm src/extension/pkg/
+          elif [ -f "extension-temp/extension-package/pkg/sentience_core_bg.wasm" ]; then
+            cp extension-temp/extension-package/pkg/sentience_core_bg.wasm src/extension/pkg/
           elif [ -f "extension-temp/sentience_core_bg.wasm" ]; then
             cp extension-temp/sentience_core_bg.wasm src/extension/pkg/
           else
             echo "⚠️ sentience_core_bg.wasm not found"
           fi
@@ -128,8 +149,10 @@ jobs:
           # Copy TypeScript definitions
           if [ -d "extension-temp/pkg" ]; then
             cp extension-temp/pkg/*.d.ts src/extension/pkg/ 2>/dev/null || echo "⚠️ Type definitions not found"
+          elif [ -d "extension-temp/extension-package/pkg" ]; then
+            cp extension-temp/extension-package/pkg/*.d.ts src/extension/pkg/ 2>/dev/null || echo "⚠️ Type definitions not found"
           elif [ -d "extension-temp" ]; then
             cp extension-temp/*.d.ts src/extension/pkg/ 2>/dev/null || echo "⚠️ Type definitions not found"
           fi
 
           # Verify copied files
@@ -156,9 +179,9 @@ jobs:
         if: steps.release.outputs.skip != 'true' && steps.changes.outputs.changed == 'true'
         uses: peter-evans/create-pull-request@v5
         with:
-          # Use GITHUB_TOKEN (built-in) if repository allows PR creation, otherwise use PR_TOKEN (PAT)
+          # Use PR_TOKEN (a PAT, for repos with org restrictions); this expression has no GITHUB_TOKEN fallback
           # To use PAT: create secret named PR_TOKEN with a Personal Access Token that has 'repo' scope
-          token: ${{ secrets.PR_TOKEN || secrets.GITHUB_TOKEN }}
+          token: ${{ secrets.PR_TOKEN }}
           commit-message: "chore: sync extension files from sentience-chrome ${{ steps.release.outputs.tag }}"
           title: "Sync Extension: ${{ steps.release.outputs.tag }}"
           body: |
diff --git a/README.md b/README.md
index 95e8ad81..445a86cf 100644
--- a/README.md
+++ b/README.md
@@ -105,6 +105,28 @@ npm run build
 - `.toHaveText(text)`
 - `.toHaveCount(n)`
 
+### Content Reading
+- `read(browser, options)` - Read page content
+  - **Default format: `"raw"`** - Returns HTML suitable for Turndown
+  - `format: "raw"` - Get cleaned HTML
+  - `format: "markdown"` - Get high-quality markdown (uses Turndown internally)
+  - `format: "text"` - Get plain text
+
+  **Examples:**
+  ```typescript
+  import { read } from './src';
+
+  // Get raw HTML (default)
+  const result = await read(browser);
+  const html = result.content;
+
+  // Get high-quality markdown (uses Turndown automatically)
+  const markdownResult = await read(browser, { format: 'markdown' });
+  const markdown = markdownResult.content;
+  ```
+
+  See `examples/read-markdown.ts` for complete examples.
+
 ## Examples
 
 See `examples/` directory:
@@ -112,6 +134,7 @@ See `examples/` directory:
 - `basic-agent.ts` - Basic snapshot
 - `query-demo.ts` - Query engine
 - `wait-and-click.ts` - Wait and actions
+- `read-markdown.ts` - Reading page content and converting to markdown
 
 ### Content Reading Example
 
diff --git a/examples/read-markdown.ts b/examples/read-markdown.ts
new file mode 100644
index 00000000..2bb8107b
--- /dev/null
+++ b/examples/read-markdown.ts
@@ -0,0 +1,67 @@
+/**
+ * Example: Reading page content and converting to markdown
+ *
+ * This example shows how to use the read() function to get page content
+ * and convert it to high-quality markdown using Turndown.
+ */
+
+import { SentienceBrowser, read } from '../src';
+import TurndownService from 'turndown';
+
+async function main() {
+  // Initialize browser
+  const browser = new SentienceBrowser();
+  await browser.start();
+
+  try {
+    // Navigate to a page
+    await browser.getPage().goto('https://example.com');
+    await browser.getPage().waitForLoadState('networkidle');
+
+    // Method 1: Get raw HTML (default) and convert with Turndown
+    console.log('=== Method 1: Raw HTML + Turndown (Recommended) ===');
+    const result = await read(browser); // format="raw" is default
+    const htmlContent = result.content;
+
+    // Convert to markdown using Turndown (better quality)
+    const turndownService = new TurndownService({
+      headingStyle: 'atx', // Use # for headings
+      bulletListMarker: '-', // Use - for lists
+      codeBlockStyle: 'fenced', // Use ``` for code blocks
+    });
+
+    // Add custom rules for better conversion
+    turndownService.addRule('strikethrough', {
+      filter: (node) => ['s', 'del', 'strike'].includes(node.nodeName.toLowerCase()),
+      replacement: (content: string) => `~~${content}~~`,
+    });
+
+    // Strip unwanted tags
+    turndownService.remove(['script', 'style', 'nav', 'footer', 'header', 'noscript']);
+
+    const markdown = turndownService.turndown(htmlContent);
+    console.log(`Markdown length: ${markdown.length} characters`);
+    console.log(markdown.substring(0, 500)); // Print first 500 chars
+    console.log('\n');
+
+    // Method 2: Get high-quality markdown directly (uses Turndown internally)
+    console.log('=== Method 2: Direct markdown (High-quality via Turndown) ===');
+    const result2 = await read(browser, { format: 'markdown' });
+    const highQualityMarkdown = result2.content;
+    console.log(`Markdown length: ${highQualityMarkdown.length} characters`);
+    console.log(highQualityMarkdown.substring(0, 500)); // Print first 500 chars
+    console.log('\n');
+
+    // Method 3: Get plain text
+    console.log('=== Method 3: Plain text ===');
+    const result3 = await read(browser, { format: 'text' });
+    const textContent = result3.content;
+    console.log(`Text length: ${textContent.length} characters`);
+    console.log(textContent.substring(0, 500)); // Print first 500 chars
+  } finally {
+    await browser.close();
+  }
+}
+
+main().catch(console.error);
+
diff --git a/src/read.ts b/src/read.ts
index ba6b5c84..a91732b8 100644
--- a/src/read.ts
+++ b/src/read.ts
@@ -1,80 +1,110 @@
 /**
- * Read page content - enhanced markdown conversion
+ * Read page content - supports raw HTML, text, and markdown formats
  */
 
 import { SentienceBrowser } from './browser';
 import TurndownService from 'turndown';
 
 export interface ReadOptions {
-  format?: 'text' | 'markdown';
-  enhance_markdown?: boolean;
+  format?: 'raw' | 'text' | 'markdown';
+  enhanceMarkdown?: boolean;
 }
 
 export interface ReadResult {
   status: 'success' | 'error';
   url: string;
-  format: 'text' | 'markdown';
+  format: 'raw' | 'text' | 'markdown';
   content: string;
   length: number;
   error?: string;
 }
 
 /**
- * Read page content as text or markdown
+ * Read page content as raw HTML, text, or markdown
  *
  * @param browser - SentienceBrowser instance
  * @param options - Read options
  * @returns ReadResult with page content
+ *
+ * @example
+ * // Get raw HTML (default)
+ * const result = await read(browser);
+ * const htmlContent = result.content;
+ *
+ * @example
+ * // Get high-quality markdown (uses Turndown internally)
+ * const result = await read(browser, { format: 'markdown' });
+ * const markdown = result.content;
+ *
+ * @example
+ * // Get plain text
+ * const result = await read(browser, { format: 'text' });
+ * const text = result.content;
  */
 export async function read(
   browser: SentienceBrowser,
   options: ReadOptions = {}
 ): Promise<ReadResult> {
   const page = browser.getPage();
-  const format = options.format || 'text';
-  const enhanceMarkdown = options.enhance_markdown !== false; // Default to true
-
-  // Get basic content from extension
-  const result = (await page.evaluate(
-    (opts) => {
-      return (window as any).sentience.read(opts);
-    },
-    { format }
-  )) as ReadResult;
+  const format = options.format || 'raw'; // Default to 'raw' for Turndown compatibility
+  const enhanceMarkdown = options.enhanceMarkdown !== false; // Default to true
 
-  // Enhance markdown if requested and format is markdown
-  if (format === 'markdown' && enhanceMarkdown && result.status === 'success') {
-    try {
-      // Get full HTML from page
-      const htmlContent = await page.evaluate(
-        () => document.documentElement.outerHTML
-      );
+  if (format === 'markdown' && enhanceMarkdown) {
+    // Get raw HTML from the extension first
+    const rawHtmlResult = (await page.evaluate(
+      (opts) => {
+        return (window as any).sentience.read(opts);
+      },
+      { format: 'raw' }
+    )) as ReadResult;
 
-      // Use turndown for better conversion
-      const turndownService = new TurndownService({
-        headingStyle: 'atx', // Use # for headings
-        bulletListMarker: '-', // Use - for lists
-        codeBlockStyle: 'fenced', // Use ``` for code blocks
-      });
+    if (rawHtmlResult.status === 'success') {
+      const htmlContent = rawHtmlResult.content;
+      try {
+        const turndownService = new TurndownService({
+          headingStyle: 'atx',
+          hr: '---',
+          bulletListMarker: '-',
+          codeBlockStyle: 'fenced',
+          emDelimiter: '*',
+        });
 
-      // Add custom rules for better conversion
-      turndownService.addRule('strikethrough', {
-        filter: ['del', 's', 'strike'] as any,
-        replacement: (content: string) => `~~${content}~~`,
-      });
+        // Add custom rules for better markdown
+        turndownService.addRule('strikethrough', {
+          filter: (node) => ['s', 'del', 'strike'].includes(node.nodeName.toLowerCase()),
+          replacement: function (content) {
+            return '~~' + content + '~~';
+          },
+        });
 
-      // Strip unwanted tags
-      turndownService.remove(['script', 'style', 'nav', 'footer', 'header', 'noscript']);
+        // Optionally strip certain tags entirely
+        turndownService.remove(['script', 'style', 'noscript', 'iframe']);
 
-      const enhancedMarkdown = turndownService.turndown(htmlContent);
-      result.content = enhancedMarkdown;
-      result.length = enhancedMarkdown.length;
-    } catch (e) {
-      // If enhancement fails, use extension's result
-      result.error = `Markdown enhancement failed: ${e}`;
+        const markdownContent = turndownService.turndown(htmlContent);
+        return {
+          status: 'success',
+          url: rawHtmlResult.url,
+          format: 'markdown',
+          content: markdownContent,
+          length: markdownContent.length,
+        };
+      } catch (e: unknown) {
+        console.warn(`Turndown conversion failed: ${e instanceof Error ? e.message : String(e)}, falling back to extension's markdown.`);
+        // Fallback to extension's markdown if Turndown fails
+      }
+    } else {
+      console.warn(`Failed to get raw HTML from extension: ${rawHtmlResult.error}, falling back to extension's markdown.`);
+      // Fallback to extension's markdown if getting raw HTML fails
     }
   }
 
+  // If not enhanced markdown, or fallback, call extension with requested format
+  const result = (await page.evaluate(
+    (opts) => {
+      return (window as any).sentience.read(opts);
+    },
+    { format }
+  )) as ReadResult;
+
   return result;
 }
-