Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
99 changes: 58 additions & 41 deletions .github/workflows/sync-extension.yml
Original file line number Diff line number Diff line change
Expand Up @@ -63,36 +63,25 @@ jobs:
mkdir -p extension-temp
cd extension-temp

# First, try to download the zip archive if available
ZIP_URL=$(curl -s -H "Authorization: token ${{ secrets.SENTIENCE_CHROME_TOKEN }}" \
# Download individual files from release (reliable method - no zip)
echo "📁 Downloading individual files from release..."
curl -L -H "Authorization: token ${{ secrets.SENTIENCE_CHROME_TOKEN }}" \
"https://api.github.com/repos/$REPO/releases/tags/$TAG" | \
jq -r '.assets[] | select(.name == "extension-package.zip") | .browser_download_url')

if [ -n "$ZIP_URL" ] && [ "$ZIP_URL" != "null" ]; then
echo "📦 Downloading extension-package.zip..."
curl -L -H "Authorization: token ${{ secrets.SENTIENCE_CHROME_TOKEN }}" "$ZIP_URL" -o extension-package.zip
unzip -q extension-package.zip -d .
# Files should now be in extension-temp/extension-package/ or extension-temp/
if [ -d "extension-package" ]; then
mv extension-package/* . 2>/dev/null || true
rmdir extension-package 2>/dev/null || true
fi
else
echo "📁 Downloading individual files from release..."
# Download each file from release
curl -s -H "Authorization: token ${{ secrets.SENTIENCE_CHROME_TOKEN }}" \
"https://api.github.com/repos/$REPO/releases/tags/$TAG" | \
jq -r '.assets[] | select(.name | endswith(".js") or endswith(".wasm") or endswith(".json") or endswith(".d.ts")) | .browser_download_url' | \
while read url; do
if [ -n "$url" ] && [ "$url" != "null" ]; then
filename=$(basename "$url")
echo " Downloading $filename..."
curl -L -H "Authorization: token ${{ secrets.SENTIENCE_CHROME_TOKEN }}" "$url" -o "$filename"
jq -r '.assets[] | select(.name | endswith(".js") or endswith(".wasm") or endswith(".json") or endswith(".d.ts")) | "\(.browser_download_url)|\(.name)"' | \
while IFS='|' read -r url name; do
if [ -n "$url" ] && [ "$url" != "null" ] && [ -n "$name" ]; then
# Preserve directory structure from asset name
# If name contains '/', create directories
dir=$(dirname "$name")
if [ "$dir" != "." ]; then
mkdir -p "$dir"
fi
done
fi
echo " Downloading $name..."
curl -L -H "Authorization: token ${{ secrets.SENTIENCE_CHROME_TOKEN }}" "$url" -o "$name"
fi
done

# Verify files were downloaded
# Verify downloaded files
echo "📋 Downloaded files:"
ls -la

Expand All @@ -102,34 +91,62 @@ jobs:
# Create extension directory structure
mkdir -p src/extension/pkg

# Copy extension files (check both root and pkg subdirectory)
cp extension-temp/manifest.json src/extension/ 2>/dev/null || echo "⚠️ manifest.json not found in release"
cp extension-temp/content.js src/extension/ 2>/dev/null || echo "⚠️ content.js not found in release"
cp extension-temp/background.js src/extension/ 2>/dev/null || echo "⚠️ background.js not found in release"
cp extension-temp/injected_api.js src/extension/ 2>/dev/null || echo "⚠️ injected_api.js not found in release"
# Copy extension files (handle both root and extension-package/ subdirectory)
# Check root first, then extension-package/ subdirectory
# Copy each top-level extension file into src/extension/.
# A release may ship a file at the root of extension-temp/ or inside an
# extension-package/ subdirectory, so the root is probed first and the
# subdirectory second. A missing file only produces a warning.
for ext_file in manifest.json content.js background.js injected_api.js; do
  if [ -f "extension-temp/$ext_file" ]; then
    cp "extension-temp/$ext_file" src/extension/
  elif [ -f "extension-temp/extension-package/$ext_file" ]; then
    cp "extension-temp/extension-package/$ext_file" src/extension/
  else
    echo "⚠️ $ext_file not found"
  fi
done

# Copy WASM files (check both root and pkg subdirectory)
# Copy WASM files (check both locations)
# Copy the WASM glue script and the WASM binary into src/extension/pkg/.
# Locations are probed in priority order: extension-temp/pkg/, the root of
# extension-temp/, then extension-temp/extension-package/pkg/.
# A missing file only produces a warning.
for wasm_file in sentience_core.js sentience_core_bg.wasm; do
  if [ -f "extension-temp/pkg/$wasm_file" ]; then
    cp "extension-temp/pkg/$wasm_file" src/extension/pkg/
  elif [ -f "extension-temp/$wasm_file" ]; then
    cp "extension-temp/$wasm_file" src/extension/pkg/
  elif [ -f "extension-temp/extension-package/pkg/$wasm_file" ]; then
    cp "extension-temp/extension-package/pkg/$wasm_file" src/extension/pkg/
  else
    echo "⚠️ $wasm_file not found"
  fi
done

# Copy TypeScript definitions.
# Probe each candidate directory for actual *.d.ts files rather than for the
# directory itself: extension-temp/ always exists at this point, so a
# directory test on it would make the extension-package/pkg location
# unreachable, and an existing-but-empty pkg/ would stop the search early.
dts_copied=false
for dts_dir in extension-temp/pkg extension-temp extension-temp/extension-package/pkg; do
  # ls fails when the glob matches nothing; inside `if` that does not abort the step.
  if ls "$dts_dir"/*.d.ts >/dev/null 2>&1; then
    cp "$dts_dir"/*.d.ts src/extension/pkg/
    dts_copied=true
    break
  fi
done
if [ "$dts_copied" != "true" ]; then
  echo "⚠️ Type definitions not found"
fi

# Verify copied files
Expand All @@ -156,9 +173,9 @@ jobs:
if: steps.release.outputs.skip != 'true' && steps.changes.outputs.changed == 'true'
uses: peter-evans/create-pull-request@v5
with:
# Use GITHUB_TOKEN (built-in) if repository allows PR creation, otherwise use PR_TOKEN (PAT)
# Use PR_TOKEN to create the PR (for repos with org restrictions); the token expression does not fall back to GITHUB_TOKEN
# To use PAT: create secret named PR_TOKEN with a Personal Access Token that has 'repo' scope
token: ${{ secrets.PR_TOKEN || secrets.GITHUB_TOKEN }}
token: ${{ secrets.PR_TOKEN }}
commit-message: "chore: sync extension files from sentience-chrome ${{ steps.release.outputs.tag }}"
title: "Sync Extension: ${{ steps.release.outputs.tag }}"
body: |
Expand Down
23 changes: 23 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -105,13 +105,36 @@ npm run build
- `.toHaveText(text)`
- `.toHaveCount(n)`

### Content Reading
- `read(browser, options)` - Read page content
- **Default format: `"raw"`** - Returns HTML suitable for Turndown
- `format: "raw"` - Get cleaned HTML
- `format: "markdown"` - Get high-quality markdown (uses Turndown internally)
- `format: "text"` - Get plain text

**Examples:**
```typescript
import { read } from './src';

// Get raw HTML (default)
const result = await read(browser);
const html = result.content;

// Get high-quality markdown (uses Turndown automatically)
const markdownResult = await read(browser, { format: 'markdown' });
const markdown = markdownResult.content;
```

See `examples/read-markdown.ts` for complete examples.

## Examples

See `examples/` directory:
- `hello.ts` - Extension bridge verification
- `basic-agent.ts` - Basic snapshot
- `query-demo.ts` - Query engine
- `wait-and-click.ts` - Wait and actions
- `read-markdown.ts` - Reading page content and converting to markdown

### Content Reading Example

Expand Down
67 changes: 67 additions & 0 deletions examples/read-markdown.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
/**
* Example: Reading page content and converting to markdown
*
* This example shows how to use the read() function to get page content
* and convert it to high-quality markdown using Turndown.
*/

import { SentienceBrowser, read } from '../src';
import TurndownService from 'turndown';

/**
 * Walks through the three ways of reading page content with `read()`:
 *
 * 1. raw HTML (the default format) converted to markdown by a locally
 *    configured Turndown instance,
 * 2. markdown requested directly via `format: 'markdown'`,
 * 3. plain text via `format: 'text'`.
 *
 * For each method the content length and the first 500 characters are
 * printed. The browser is closed in `finally`, so it is released even when
 * navigation or reading throws.
 */
async function main(): Promise<void> {
  // Initialize browser
  const browser = new SentienceBrowser();
  await browser.start();

  try {
    // Navigate to a page and wait until network activity settles
    await browser.getPage().goto('https://example.com');
    await browser.getPage().waitForLoadState('networkidle');

    // Method 1: Get raw HTML (default) and convert with Turndown
    console.log('=== Method 1: Raw HTML + Turndown (Recommended) ===');
    const result = await read(browser); // format="raw" is default
    const htmlContent = result.content;

    // Convert to markdown using Turndown (better quality)
    const turndownService = new TurndownService({
      headingStyle: 'atx', // Use # for headings
      bulletListMarker: '-', // Use - for lists
      codeBlockStyle: 'fenced', // Use ``` for code blocks
    });

    // Render <del>, <s> and <strike> as ~~strikethrough~~.
    // A predicate is used instead of a tag-name array so that no `as any`
    // cast is needed on the filter.
    turndownService.addRule('strikethrough', {
      filter: (node: { nodeName: string }): boolean =>
        ['del', 's', 'strike'].includes(node.nodeName.toLowerCase()),
      replacement: (content: string) => `~~${content}~~`,
    });

    // Strip unwanted tags
    turndownService.remove(['script', 'style', 'nav', 'footer', 'header', 'noscript']);

    const markdown = turndownService.turndown(htmlContent);
    console.log(`Markdown length: ${markdown.length} characters`);
    console.log(markdown.substring(0, 500)); // Print first 500 chars
    console.log('\n');

    // Method 2: Get high-quality markdown directly (uses Turndown internally)
    console.log('=== Method 2: Direct markdown (High-quality via Turndown) ===');
    const result2 = await read(browser, { format: 'markdown' });
    const highQualityMarkdown = result2.content;
    console.log(`Markdown length: ${highQualityMarkdown.length} characters`);
    console.log(highQualityMarkdown.substring(0, 500)); // Print first 500 chars
    console.log('\n');

    // Method 3: Get plain text
    console.log('=== Method 3: Plain text ===');
    const result3 = await read(browser, { format: 'text' });
    const textContent = result3.content;
    console.log(`Text length: ${textContent.length} characters`);
    console.log(textContent.substring(0, 500)); // Print first 500 chars
  } finally {
    await browser.close();
  }
}

// Run the example. On failure, log the error and mark the process as failed
// so that shells and CI see a non-zero exit status instead of a silent success.
main().catch((err: unknown) => {
  console.error(err);
  process.exitCode = 1;
});

114 changes: 72 additions & 42 deletions src/read.ts
Original file line number Diff line number Diff line change
@@ -1,80 +1,110 @@
/**
* Read page content - enhanced markdown conversion
* Read page content - supports raw HTML, text, and markdown formats
*/

import { SentienceBrowser } from './browser';
import TurndownService from 'turndown';

/**
 * Options accepted by `read()`.
 *
 * NOTE(review): this span is a rendered diff without +/- markers, so each
 * field is declared twice. Presumably the two-value `format` and the
 * snake_case `enhance_markdown` are the pre-change declarations — confirm.
 */
export interface ReadOptions {
// Requested output format (two-value variant).
format?: 'text' | 'markdown';
// snake_case spelling of the enhancement flag.
enhance_markdown?: boolean;
// Requested output format, including 'raw'; read() applies a default when omitted.
format?: 'raw' | 'text' | 'markdown';
// read() treats every value except an explicit `false` as enabled.
enhanceMarkdown?: boolean;
}

/**
 * Result object returned by `read()`.
 *
 * NOTE(review): this span is a rendered diff without +/- markers, so `format`
 * is declared twice; presumably the two-value union is the pre-change
 * declaration — confirm.
 */
export interface ReadResult {
// Outcome of the read; read() only converts to markdown when this is 'success'.
status: 'success' | 'error';
// URL associated with the content (presumably the page that was read — confirm).
url: string;
format: 'text' | 'markdown';
// Format of `content`.
format: 'raw' | 'text' | 'markdown';
// The page content in the given format.
content: string;
// read() sets this to `content.length` for Turndown output; presumably the same for other formats — confirm.
length: number;
// Optional error description; read() includes it in its warning when the raw-HTML request fails.
error?: string;
}

/**
* Read page content as text or markdown
* Read page content as raw HTML, text, or markdown
*
* @param browser - SentienceBrowser instance
* @param options - Read options
* @returns ReadResult with page content
*
* @example
* // Get raw HTML (default)
* const result = await read(browser);
* const htmlContent = result.content;
*
* @example
* // Get high-quality markdown (uses Turndown internally)
* const result = await read(browser, { format: 'markdown' });
* const markdown = result.content;
*
* @example
* // Get plain text
* const result = await read(browser, { format: 'text' });
* const text = result.content;
*/
// NOTE(review): this span is a rendered diff without +/- markers. Statements of
// the previous implementation are interleaved with their replacements, so some
// declarations (`format`, `enhanceMarkdown`, `result`) appear twice below.
export async function read(
browser: SentienceBrowser,
options: ReadOptions = {}
): Promise<ReadResult> {
const page = browser.getPage();
const format = options.format || 'text';
const enhanceMarkdown = options.enhance_markdown !== false; // Default to true

// Get basic content from extension
const result = (await page.evaluate(
(opts) => {
return (window as any).sentience.read(opts);
},
{ format }
)) as ReadResult;
const format = options.format || 'raw'; // Default to 'raw' for Turndown compatibility
const enhanceMarkdown = options.enhanceMarkdown !== false; // Default to true

// Enhance markdown if requested and format is markdown
if (format === 'markdown' && enhanceMarkdown && result.status === 'success') {
try {
// Get full HTML from page
const htmlContent = await page.evaluate(
() => document.documentElement.outerHTML
);
// Enhanced path: request raw HTML from the extension and convert it locally with Turndown.
if (format === 'markdown' && enhanceMarkdown) {
// Get raw HTML from the extension first
const rawHtmlResult = (await page.evaluate(
(opts) => {
return (window as any).sentience.read(opts);
},
{ format: 'raw' }
)) as ReadResult;

// Use turndown for better conversion
const turndownService = new TurndownService({
headingStyle: 'atx', // Use # for headings
bulletListMarker: '-', // Use - for lists
codeBlockStyle: 'fenced', // Use ``` for code blocks
});
// Convert only when the extension reported success; otherwise warn and fall through.
if (rawHtmlResult.status === 'success') {
const htmlContent = rawHtmlResult.content;
try {
const turndownService = new TurndownService({
headingStyle: 'atx',
hr: '---',
bulletListMarker: '-',
codeBlockStyle: 'fenced',
emDelimiter: '*',
});

// Add custom rules for better conversion
turndownService.addRule('strikethrough', {
filter: ['del', 's', 'strike'] as any,
replacement: (content: string) => `~~${content}~~`,
});
// Add custom rules for better markdown
turndownService.addRule('strikethrough', {
filter: (node) => ['s', 'del', 'strike'].includes(node.nodeName.toLowerCase()),
replacement: function (content) {
return '~~' + content + '~~';
},
});

// Strip unwanted tags
turndownService.remove(['script', 'style', 'nav', 'footer', 'header', 'noscript']);
// Optionally strip certain tags entirely
turndownService.remove(['script', 'style', 'noscript', 'iframe'] as any);

const enhancedMarkdown = turndownService.turndown(htmlContent);
result.content = enhancedMarkdown;
result.length = enhancedMarkdown.length;
} catch (e) {
// If enhancement fails, use extension's result
result.error = `Markdown enhancement failed: ${e}`;
const markdownContent = turndownService.turndown(htmlContent);
// Early return: the Turndown output becomes the content; url is carried over from the raw-HTML result.
return {
status: 'success',
url: rawHtmlResult.url,
format: 'markdown',
content: markdownContent,
length: markdownContent.length,
};
// NOTE(review): `e.message` assumes the thrown value is Error-like — confirm.
} catch (e: any) {
console.warn(`Turndown conversion failed: ${e.message}, falling back to extension's markdown.`);
// Fallback to extension's markdown if Turndown fails
}
} else {
console.warn(`Failed to get raw HTML from extension: ${rawHtmlResult.error}, falling back to extension's markdown.`);
// Fallback to extension's markdown if getting raw HTML fails
}
}

// If not enhanced markdown, or fallback, call extension with requested format
// Reached for 'raw' and 'text', for markdown with enhancement disabled, and after either warning above.
const result = (await page.evaluate(
(opts) => {
return (window as any).sentience.read(opts);
},
{ format }
)) as ReadResult;

return result;
}

Loading