From c7ea21dfe04eb07fc702d89a93d3c0bf99830c42 Mon Sep 17 00:00:00 2001 From: aspectrr Date: Wed, 19 Nov 2025 10:17:06 -0500 Subject: [PATCH 1/4] fix: add pplx guide --- content/docs/overview/guides/aeo.mdx | 6 + content/docs/overview/guides/meta.json | 3 +- content/docs/overview/guides/perplexity.mdx | 574 ++++++++++++++++++ .../docs/overview/guides/playwright-node.mdx | 2 +- 4 files changed, 583 insertions(+), 2 deletions(-) create mode 100644 content/docs/overview/guides/aeo.mdx create mode 100644 content/docs/overview/guides/perplexity.mdx diff --git a/content/docs/overview/guides/aeo.mdx b/content/docs/overview/guides/aeo.mdx new file mode 100644 index 00000000..32976d71 --- /dev/null +++ b/content/docs/overview/guides/aeo.mdx @@ -0,0 +1,6 @@ +--- +title: Build an AEO Scraper (Node) +description: Scrape LLM providers with Steel and synthesize answers with OpenAI +sidebarTitle: AEO Scraper (Node) +llm: true +--- diff --git a/content/docs/overview/guides/meta.json b/content/docs/overview/guides/meta.json index 9e2294ad..a84d93ad 100644 --- a/content/docs/overview/guides/meta.json +++ b/content/docs/overview/guides/meta.json @@ -7,6 +7,7 @@ "playwright-node", "playwright-python", "puppeteer", - "selenium" + "selenium", + "perplexity" ] } diff --git a/content/docs/overview/guides/perplexity.mdx b/content/docs/overview/guides/perplexity.mdx new file mode 100644 index 00000000..c9cfe660 --- /dev/null +++ b/content/docs/overview/guides/perplexity.mdx @@ -0,0 +1,574 @@ +--- +title: Build a Perplexity‑style Search Engine +description: Search with Brave, scrape with Steel, and synthesize with OpenAI using a TypeScript CLI +sidebarTitle: Perplexity Clone (Node) +llm: true +--- + +This guide shows you how to build a Perplexity-like research workflow in Node.js/TypeScript that: +- Generates targeted search queries with OpenAI +- Finds relevant links with the Brave Search API +- Scrapes those links to Markdown via Steel’s /v1/scrape endpoint +- Synthesizes a well-cited answer with inline citations + +Looking for a ready-made starter? Skip to the example project section. + +Quick Start +----------- + +Clone the example and run it locally: + +```bash +git clone https://github.com/steel-dev/steel-cookbook +cd steel-cookbook/examples/steel-perplexity-clone +npm install + +# Create a .env file in this directory with your credentials +# See "Configuration" below for required variables. + +# Option A: put QUERY in .env +npm start + +# Option B: pass QUERY on the fly +QUERY="What are the latest improvements in WebAssembly?" npm start +``` + +- Node.js: Requires Node 18+ +- Credentials: You’ll need API keys for Steel.dev, OpenAI, and Brave Search. + +Project Structure +----------------- + +```bash +examples/steel-perplexity-clone + ├─ src/ + │ ├─ config.ts # Env parsing, defaults, feature flags + │ ├─ clients.ts # Brave search, Steel scrape, OpenAI synthesis + │ └─ index.ts # Main pipeline orchestration + ├─ package.json + ├─ tsconfig.json + └─ README.md +``` + +Configuration +------------- + +Create a `.env` file in `examples/steel-perplexity-clone`: + +```env +NODE_ENV=development + +# OpenAI +OPENAI_API_KEY=sk-... +OPENAI_ORG_ID= +OPENAI_MODEL=gpt-5-nano +OPENAI_ENABLE_WEB_SEARCH=true + +# Steel.dev +STEEL_API_KEY=steel_... +STEEL_SCRAPE_ENDPOINT=https://api.steel.dev/v1/scrape +# Optional pacing between scrape requests (ms), useful for rate limits +STEEL_TIMEOUT=3000 + +# Brave Search +BRAVE_API_KEY=brv_... 
BRAVE_SEARCH_ENDPOINT=https://api.search.brave.com/res/v1/web/search
BRAVE_SEARCH_COUNTRY=US
BRAVE_SEARCH_LANG=en
BRAVE_SAFESEARCH=moderate

# Search behavior
SEARCH_TOP_K=3
REQUEST_TIMEOUT_MS=30000
CONCURRENCY=2

# Your question to research
QUERY="What are the latest improvements in WebAssembly and their benefits?"
```

What this example does
----------------------

At a high level:

1) Generate multiple targeted queries for better coverage
- Uses OpenAI to turn the user query into 3 high‑signal search queries

2) Search and rank URLs with Brave
- Calls Brave’s Web Search API for each generated query
- Aggregates and ranks URLs using a reciprocal-rank strategy

3) Scrape sources to Markdown with Steel
- Sends each URL to Steel’s `/v1/scrape` to obtain clean Markdown

4) Synthesize a well‑cited answer with OpenAI
- Builds a context block from scraped Markdown
- Instructs the model to produce inline [n] citations, matching the material order

The core orchestration happens here:

```typescript
import { config } from "./config";
import {
  scrapeUrlsToMarkdown,
  synthesizeWithCitations,
  multiQueryBraveSearch,
} from "./clients";

type SearchResponse = {
  query: string;
  answer: string;
  citations: Array<{ index: number; url: string }>;
  model: string;
  meta: {
    tookMs: number;
  };
};

async function main() {
  const started = Date.now();

  const query = config.query;
  const topK = config.search.topK;
  const concurrency = config.concurrency;

  console.info("Search request received", {
    query,
    topK,
  });

  // 1) Use Brave to get top relevant URLs (do double to get more relevant results to search)
  const { urls } = await multiQueryBraveSearch(query, topK * 2);
  // const searchRes = await searchTopRelevantUrls(query, requestedTopK * 2);
  // const urls = (searchRes.urls || []).slice(0, requestedTopK * 2);

  // console.log(urls, urls.length);

  if (urls.length === 0) {
    return console.error("No URLs found for the given query.");
  }

  // 2) Scrape each URL into markdown using Steel.dev
  const materials = await scrapeUrlsToMarkdown(urls, concurrency, topK);

  if (materials.length === 0) {
    console.error("Failed to scrape any URLs. Try again or refine your query.");
    return;
  }

  // 3) Use OpenAI to synthesize an answer with inline citations
  const synthesis = await synthesizeWithCitations({
    query,
    materials,
  });

  const tookMs = Date.now() - started;

  const response: SearchResponse = {
    query,
    answer: synthesis.answer,
    citations: synthesis.sources,
    model: config.openai.model,
    meta: { tookMs },
  };

  console.log(response);
}

// Execute the demo
main()
  .then(() => {
    process.exit(0);
  })
  .catch((error) => {
    console.error("Task execution failed:", error);
    process.exit(1);
  });
```

Step 1: Generate richer search coverage
---------------------------------------

- The example asks OpenAI to produce 3 specific queries that maximize recall and signal.
- It then calls Brave Search for each query, pausing briefly between calls.
- Results are aggregated and ranked by frequency and reciprocal rank.

```typescript
export async function multiQueryBraveSearch(
  userQuery: string,
  topKPerQuery = config.search.topK,
): Promise {
  // 1) Ask OpenAI to produce exactly 3 queries as strict JSON.
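  // (On models that support it, response_format: { type: "json_object" } can be
  // passed to chat.completions.create to enforce JSON output, rather than
  // relying on the prompt alone; this example keeps it prompt-only.)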
+ const prompt = [ + "You are a search strategist.", + "Given the user's query, generate exactly 3 search queries that maximize the likelihood of finding relevant, recent, and factual information.", + "Avoid generic questions; use specific keywords.", + "", + "Return strict JSON with this shape:", + '{ "queries": ["...", "...", "..."] }', + "", + `User query: ${userQuery}`, + ].join("\n"); + + const completion = await openai.chat.completions.create({ + model: config.openai.model, + messages: [ + { role: "system", content: "You produce JSON only. No prose." }, + { role: "user", content: prompt }, + ], + }); + + const rawContent = + completion.choices?.[0]?.message?.content?.trim() ?? '{"queries": []}'; + + let queries: string[] = []; + try { + const parsed = JSON.parse(rawContent); + if (Array.isArray(parsed?.queries)) { + queries = parsed.queries.map((q: unknown) => + typeof q === "string" ? q.trim() : "", + ); + } + } catch { + // Fallback: split lines + queries = rawContent + .split("\n") + .map((l) => l.replace(/^[-*\d.)\s]+/, "").trim()) + .filter(Boolean) + .slice(0, 3); + } + + // Ensure exactly 3 queries, fall back to the original user query variations if needed + queries = Array.from( + new Set( + queries + .filter(Boolean) + .map((q) => q.replace(/\s+/g, " ").trim()) + .slice(0, 3), + ), + ); + while (queries.length < 3) { + if (queries.length === 0) queries.push(userQuery); + else queries.push(`${userQuery} ${queries.length + 1}`); + } + queries = queries.slice(0, 3); + + console.info("Generated queries", { queries }); + + // 2) For each query, call Brave Search with a 1s delay between calls. + const perQueryUrls: string[][] = []; + for (let i = 0; i < queries.length; i++) { + const q = queries[i]; + if (q == null) { + perQueryUrls.push([]); + continue; + } + if (i > 0) { + await new Promise((r) => setTimeout(r, 1000)); + } + try { + const { urls } = await searchTopRelevantUrls( + q, + topKPerQuery ?? config.search.topK, + ); + perQueryUrls.push(urls); + } catch (err) { + console.warn("Brave search failed for generated query", { + query: q, + err: (err as Error)?.message, + }); + perQueryUrls.push([]); + } + } + + // 3) Rank aggregation: reciprocal rank sum + frequency and best rank tiebreakers + type Acc = { + score: number; + occurrences: number; + ranks: number[]; + }; + const scores = new Map(); + + perQueryUrls.forEach((urls) => { + urls.forEach((u, idx) => { + const url = u.trim(); + if (!url) return; + const rank = idx + 1; // 1-based + const inc = 1 / rank; // reciprocal rank + const prev = scores.get(url) ?? { score: 0, occurrences: 0, ranks: [] }; + prev.score += inc; + prev.occurrences += 1; + prev.ranks.push(rank); + scores.set(url, prev); + }); + }); + + // 4) Deduplicate and sort + const ranked: RankedUrl[] = Array.from(scores.entries()) + .map(([url, acc]) => ({ + url, + score: acc.score, + occurrences: acc.occurrences, + ranks: acc.ranks.sort((a, b) => a - b), + })) + .sort((a, b) => { + if (b.score !== a.score) return b.score - a.score; // primary: score + if (b.occurrences !== a.occurrences) return b.occurrences - a.occurrences; // secondary: frequency + // tertiary: best (lowest) rank + const aBest = a.ranks[0] ?? Number.POSITIVE_INFINITY; + const bBest = b.ranks[0] ?? 
Number.POSITIVE_INFINITY; + return aBest - bBest; + }); + + console.info("Ranked URLs across multi-query search", { + unique: ranked.length, + }); + + return { + queries, + urls: ranked.map((url) => url.url), + _raw: { openai: completion, perQueryUrls }, + }; +} +``` + +Under the hood, the Brave call itself looks like this: + +```typescript +export async function searchTopRelevantUrls( + query: string, + topK = config.search.topK, +): Promise { + // Build Brave Search request URL with query params + const endpoint = new URL(config.brave.endpoint); + endpoint.searchParams.set("q", query); + endpoint.searchParams.set("country", config.brave.country); + endpoint.searchParams.set("search_lang", config.brave.lang); + endpoint.searchParams.set("safesearch", config.brave.safesearch); + endpoint.searchParams.set( + "count", + String(Math.min(topK, config.search.topK)), + ); + + const res = await fetchWithTimeout(endpoint.toString(), { + headers: { + Accept: "application/json", + "X-Subscription-Token": config.brave.apiKey, + }, + }); + + if (!res.ok) { + const text = await res.text().catch(() => ""); + console.error("Brave search failed", { + status: res.status, + statusText: res.statusText, + response: text?.slice(0, 1000), + }); + throw new Error(`Brave search failed: ${res.status} ${res.statusText}`); + } + + const data = (await res.json()) as any; + + // Extract URLs from Brave response + const urls: string[] = []; + if (data?.web?.results && Array.isArray(data.web.results)) { + for (const r of data.web.results) { + if (typeof r?.url === "string") urls.push(r.url); + } + } else if (Array.isArray(data?.results)) { + for (const r of data.results) { + if (typeof r?.url === "string") urls.push(r.url); + } + } + + if (urls.length === 0) { + console.warn("No URLs returned from Brave, attempting salvage from raw", { + raw: JSON.stringify(data).slice(0, 1000), + }); + const rawText = JSON.stringify(data); + const regex = /\bhttps?:\/\/[^\s"'<>]+/gi; + const salvaged = (rawText.match(regex) ?? []) as string[]; + urls.push(...salvaged); + } + + // Normalize and dedupe + const normalized = Array.from(new Set(urls.map((u) => u.trim()))) + .filter(Boolean) + .slice(0, topK); + + console.info("Collected URLs from Brave", { count: normalized.length }); + + return { + urls: normalized, + _raw: data, + }; +} +``` + +Step 2: Scrape each URL to Markdown with Steel +---------------------------------------------- + +- For each URL, POST to Steel’s scrape endpoint. +- Request Markdown by setting `format: ["markdown"]`. +- The response contains `content.markdown`, `links`, and metadata. 
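
The request shape is easy to verify outside the app first; here is an equivalent call with curl, using the same endpoint, header, and body the helper below sends (substitute your own target URL):

```bash
# POST a page to Steel's scrape endpoint and get Markdown back
curl -X POST https://api.steel.dev/v1/scrape \
  -H "Steel-Api-Key: $STEEL_API_KEY" \
  -H "Content-Type: application/json" \
  -d '{"url": "https://example.com", "format": ["markdown"]}'
```

The TypeScript helper wraps the same call: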
+ +```typescript +export async function scrapeUrlToMarkdown(url: string): Promise { + const endpoint = config.steel.scrapeEndpoint; + + const body: SteelScrapeRequest = { + url, + format: ["markdown"], + }; + + const res = await fetchWithTimeout(endpoint, { + method: "POST", + headers: { + "Steel-Api-Key": config.steel.apiKey, + "Content-Type": "application/json", + }, + body: JSON.stringify(body), + }); + + if (!res.ok) { + const text = await res.text().catch(() => ""); + console.error("Steel.dev scrape failed", { + status: res.status, + statusText: res.statusText, + url, + response: text?.slice(0, 1000), + }); + throw new Error( + `Steel.dev scrape failed for ${url}: ${res.status} ${res.statusText}`, + ); + } + + const payload = (await res.json()) as SteelScrapeResponse; + const markdown = payload?.content?.markdown; + const links = payload?.links; + + if (!markdown) { + console.warn("Steel.dev response did not include recognizable markdown", { + url, + payload: JSON.stringify(payload).slice(0, 1000), + }); + throw new Error(`Steel.dev response missing markdown content for ${url}`); + } + + return { url, markdown, links }; +} +``` + +Step 3: Synthesize an answer with inline citations +-------------------------------------------------- + +- Build a context that enumerates materials like `[1] URL`, then the Markdown. +- Prompt the model to cite with `[n]` as it writes. +- Return an answer plus a `sources` array mapping `[n] -> url`. + +```typescript +export async function synthesizeWithCitations( + input: SynthesisInput, +): Promise { + // Build context block + const contextHeader = + "Context materials (each item shows [index] and URL, followed by markdown content)"; + const contextLines: string[] = [contextHeader]; + input.materials.forEach((m, i) => { + const idx = i + 1; + contextLines.push(`\n[${idx}] ${m.url}\n---\n${m.markdown}\n`); + }); + + const system = ` You are Perplexity, ... `; + + const user = [`User query: ${input.query}`, "", contextLines.join("\n")].join( + "\n", + ); + + const completion = await openai.chat.completions.create({ + model: config.openai.model, + messages: [ + { role: "system", content: system }, + { role: "user", content: user }, + ], + }); + + const answer = completion.choices?.[0]?.message?.content?.trim() ?? ""; + + // Collect sources in index order for convenience + const sources = input.materials.map((m, i) => ({ index: i + 1, url: m.url })); + + console.info("Synthesis complete", { + answerPreview: answer.slice(0, 160), + }); + + return { + answer, + sources, + _raw: completion, + }; +} +``` + +Run and interpret the output +---------------------------- + +After `npm start`, the script logs a JSON result like: + +```json +{ + "query": "What are the latest improvements in WebAssembly and their benefits?", + "answer": "Recent WebAssembly updates improved component model support and tooling, enabling easier interop and faster iterations.[1][2] These changes reduce bundle size, improve portability, and speed up non‑JS language performance across platforms.[2][3]", + "citations": [ + { "index": 1, "url": "https://example.com/article-1" }, + { "index": 2, "url": "https://example.com/article-2" }, + { "index": 3, "url": "https://example.com/article-3" } + ], + "model": "gpt-5-nano", + "meta": { "tookMs": 12345 } +} +``` + +Tuning and tips +--------------- + +- Expand coverage + - Increase `SEARCH_TOP_K` to retrieve and scrape more URLs. + - `CONCURRENCY` controls how many pages you scrape at once. 
+ +- Respect rate limits + - Steel Hobby plan allows ~20 requests/min. To add an actual delay between scrapes, replace the no-op `setTimeout(() => {}, config.steel.timeout)` with an awaited delay: +```typescript +// Replace this in scrapeUrlsToMarkdown's worker loop: +await new Promise((r) => setTimeout(r, config.steel.timeout)); +``` + +- Timeouts + - `REQUEST_TIMEOUT_MS` applies to both Brave and Steel requests. + +- Models + - Use `OPENAI_MODEL` to choose a cost-effective model for both query generation and synthesis. + +- Debugging + - The code returns `_raw` payloads in some helpers to aid troubleshooting. + - Log the ranked URL list before scraping if you need to inspect relevance. + +Example project +--------------- + +- GitHub: https://github.com/steel-dev/steel-cookbook/tree/main/examples/steel-perplexity-clone + +What to customize next +---------------------- + +- Swap Brave for another Search API if you prefer +- Add caching for search and scrapes +- Stream synthesis tokens for a live UI +- Persist answers and materials to a database +- Filter sources by domain whitelist/blacklist + +Support +------- + +- Steel Documentation: https://docs.steel.dev +- API Reference: https://docs.steel.dev/api-reference +- Discord Community: https://discord.gg/steel-dev diff --git a/content/docs/overview/guides/playwright-node.mdx b/content/docs/overview/guides/playwright-node.mdx index e764acf9..9e8252c8 100644 --- a/content/docs/overview/guides/playwright-node.mdx +++ b/content/docs/overview/guides/playwright-node.mdx @@ -11,7 +11,7 @@ Steel sessions are designed to be easily driven by Playwright. There are two mai -**Quick Start:** Want to jump right in? [Skip to example project](https://docs.steel.dev/overview/guides/connect-with-playwright-node#example-project-scraping-hacker-news). +**Quick Start:** Want to jump right in? [Skip to example project](https://docs.steel.dev/overview/guides/playwright-node#example-project-scraping-hacker-news). Method #1: One-line change (_easiest)_ -------------------------------------- From 343dad07777481af3e84cc413d8d9c29a2033463 Mon Sep 17 00:00:00 2001 From: aspectrr Date: Wed, 19 Nov 2025 10:19:47 -0500 Subject: [PATCH 2/4] fix: update .env --- content/docs/overview/guides/perplexity.mdx | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/content/docs/overview/guides/perplexity.mdx b/content/docs/overview/guides/perplexity.mdx index c9cfe660..138b1a82 100644 --- a/content/docs/overview/guides/perplexity.mdx +++ b/content/docs/overview/guides/perplexity.mdx @@ -62,7 +62,6 @@ NODE_ENV=development OPENAI_API_KEY=sk-... OPENAI_ORG_ID= OPENAI_MODEL=gpt-5-nano -OPENAI_ENABLE_WEB_SEARCH=true # Steel.dev STEEL_API_KEY=steel_... 
@@ -78,7 +77,7 @@ BRAVE_SEARCH_LANG=en BRAVE_SAFESEARCH=moderate # Search behavior -SEARCH_TOP_K=3 +SEARCH_TOP_K=10 REQUEST_TIMEOUT_MS=30000 CONCURRENCY=2 From 5d867b75e71e6726306d49b3e112e78ce30bb0d6 Mon Sep 17 00:00:00 2001 From: aspectrr Date: Tue, 2 Dec 2025 08:41:23 -0500 Subject: [PATCH 3/4] feat: updated perplexity docs --- content/docs/overview/guides/perplexity.mdx | 304 +++++++------------- 1 file changed, 107 insertions(+), 197 deletions(-) diff --git a/content/docs/overview/guides/perplexity.mdx b/content/docs/overview/guides/perplexity.mdx index 138b1a82..e4f71617 100644 --- a/content/docs/overview/guides/perplexity.mdx +++ b/content/docs/overview/guides/perplexity.mdx @@ -6,7 +6,6 @@ llm: true --- This guide shows you how to build a Perplexity-like research workflow in Node.js/TypeScript that: -- Generates targeted search queries with OpenAI - Finds relevant links with the Brave Search API - Scrapes those links to Markdown via Steel’s /v1/scrape endpoint - Synthesizes a well-cited answer with inline citations @@ -66,8 +65,6 @@ OPENAI_MODEL=gpt-5-nano # Steel.dev STEEL_API_KEY=steel_... STEEL_SCRAPE_ENDPOINT=https://api.steel.dev/v1/scrape -# Optional pacing between scrape requests (ms), useful for rate limits -STEEL_TIMEOUT=3000 # Brave Search BRAVE_API_KEY=brv_... @@ -78,8 +75,8 @@ BRAVE_SAFESEARCH=moderate # Search behavior SEARCH_TOP_K=10 -REQUEST_TIMEOUT_MS=30000 -CONCURRENCY=2 +REQUEST_TIMEOUT_MS=5000 +CONCURRENCY=5 # Your question to research QUERY="What are the latest improvements in WebAssembly and their benefits?" @@ -137,11 +134,7 @@ async function main() { }); // 1) Use Brave to get top relevant URLs (do double to get more relevant results to search) - const { urls } = await multiQueryBraveSearch(query, topK * 2); - // const searchRes = await searchTopRelevantUrls(query, requestedTopK * 2); - // const urls = (searchRes.urls || []).slice(0, requestedTopK * 2); - - // console.log(urls, urls.length); + const { urls } = await singleQueryBraveSearch(query, topK * 2); if (urls.length === 0) { return console.error("No URLs found for the given query."); @@ -192,141 +185,40 @@ Step 1: Generate richer search coverage - Results are aggregated and ranked by frequency and reciprocal rank. ```typescript -export async function multiQueryBraveSearch( +export async function singleQueryBraveSearch( userQuery: string, topKPerQuery = config.search.topK, ): Promise { - // 1) Ask OpenAI to produce exactly 3 queries as strict JSON. - const prompt = [ - "You are a search strategist.", - "Given the user's query, generate exactly 3 search queries that maximize the likelihood of finding relevant, recent, and factual information.", - "Avoid generic questions; use specific keywords.", - "", - "Return strict JSON with this shape:", - '{ "queries": ["...", "...", "..."] }', - "", - `User query: ${userQuery}`, - ].join("\n"); + const spinner = ora("Searching...").start(); + const normalizedQuery = userQuery.trim() || userQuery; + const queries = [normalizedQuery]; - const completion = await openai.chat.completions.create({ - model: config.openai.model, - messages: [ - { role: "system", content: "You produce JSON only. No prose." }, - { role: "user", content: prompt }, - ], - }); - - const rawContent = - completion.choices?.[0]?.message?.content?.trim() ?? '{"queries": []}'; - - let queries: string[] = []; try { - const parsed = JSON.parse(rawContent); - if (Array.isArray(parsed?.queries)) { - queries = parsed.queries.map((q: unknown) => - typeof q === "string" ? 
q.trim() : "", - ); - } - } catch { - // Fallback: split lines - queries = rawContent - .split("\n") - .map((l) => l.replace(/^[-*\d.)\s]+/, "").trim()) - .filter(Boolean) - .slice(0, 3); - } - - // Ensure exactly 3 queries, fall back to the original user query variations if needed - queries = Array.from( - new Set( - queries - .filter(Boolean) - .map((q) => q.replace(/\s+/g, " ").trim()) - .slice(0, 3), - ), - ); - while (queries.length < 3) { - if (queries.length === 0) queries.push(userQuery); - else queries.push(`${userQuery} ${queries.length + 1}`); - } - queries = queries.slice(0, 3); - - console.info("Generated queries", { queries }); - - // 2) For each query, call Brave Search with a 1s delay between calls. - const perQueryUrls: string[][] = []; - for (let i = 0; i < queries.length; i++) { - const q = queries[i]; - if (q == null) { - perQueryUrls.push([]); - continue; - } - if (i > 0) { - await new Promise((r) => setTimeout(r, 1000)); - } - try { - const { urls } = await searchTopRelevantUrls( - q, - topKPerQuery ?? config.search.topK, - ); - perQueryUrls.push(urls); - } catch (err) { - console.warn("Brave search failed for generated query", { - query: q, - err: (err as Error)?.message, - }); - perQueryUrls.push([]); - } - } - - // 3) Rank aggregation: reciprocal rank sum + frequency and best rank tiebreakers - type Acc = { - score: number; - occurrences: number; - ranks: number[]; - }; - const scores = new Map(); - - perQueryUrls.forEach((urls) => { - urls.forEach((u, idx) => { - const url = u.trim(); - if (!url) return; - const rank = idx + 1; // 1-based - const inc = 1 / rank; // reciprocal rank - const prev = scores.get(url) ?? { score: 0, occurrences: 0, ranks: [] }; - prev.score += inc; - prev.occurrences += 1; - prev.ranks.push(rank); - scores.set(url, prev); - }); - }); + const { urls } = await searchTopRelevantUrls( + normalizedQuery, + topKPerQuery ?? config.search.topK, + ); - // 4) Deduplicate and sort - const ranked: RankedUrl[] = Array.from(scores.entries()) - .map(([url, acc]) => ({ - url, - score: acc.score, - occurrences: acc.occurrences, - ranks: acc.ranks.sort((a, b) => a - b), - })) - .sort((a, b) => { - if (b.score !== a.score) return b.score - a.score; // primary: score - if (b.occurrences !== a.occurrences) return b.occurrences - a.occurrences; // secondary: frequency - // tertiary: best (lowest) rank - const aBest = a.ranks[0] ?? Number.POSITIVE_INFINITY; - const bBest = b.ranks[0] ?? Number.POSITIVE_INFINITY; - return aBest - bBest; + spinner.succeed("Search complete"); + + return { + queries, + urls, + _raw: { perQueryUrls: [urls] }, + }; + } catch (err) { + spinner.fail("Search failed"); + console.warn("Brave search failed for query", { + query: normalizedQuery, + err: (err as Error)?.message, }); - console.info("Ranked URLs across multi-query search", { - unique: ranked.length, - }); - - return { - queries, - urls: ranked.map((url) => url.url), - _raw: { openai: completion, perQueryUrls }, - }; + return { + queries, + urls: [], + _raw: { error: err }, + }; + } } ``` @@ -411,49 +303,31 @@ Step 2: Scrape each URL to Markdown with Steel - The response contains `content.markdown`, `links`, and metadata. 
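
The `scrapeUrlsToMarkdown` helper that `main()` calls is not shown in the diff; below is a minimal sketch of one way to implement it, assuming a hand-rolled worker pool over the per-URL `scrapeUrlToMarkdown` (the signature and `Material` shape are illustrative, not the cookbook's actual code):

```typescript
// Illustrative sketch only; the real helper in clients.ts may differ.
type Material = { url: string; markdown: string; links?: unknown };

export async function scrapeUrlsToMarkdown(
  urls: string[],
  concurrency: number,
  limit: number,
): Promise<Material[]> {
  const results: Material[] = [];
  let next = 0;

  // Each worker repeatedly claims the next URL until the list is exhausted
  // or enough pages have been scraped successfully.
  async function worker(): Promise<void> {
    while (next < urls.length && results.length < limit) {
      const url = urls[next++];
      if (!url) continue;
      const material = await scrapeUrlToMarkdown(url); // null on failure (see below)
      if (material) results.push(material);
    }
  }

  await Promise.all(
    Array.from({ length: Math.min(concurrency, urls.length) }, () => worker()),
  );
  return results.slice(0, limit);
}
```

The rewritten per-URL scraper it depends on: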
```typescript -export async function scrapeUrlToMarkdown(url: string): Promise { - const endpoint = config.steel.scrapeEndpoint; - - const body: SteelScrapeRequest = { - url, - format: ["markdown"], - }; - - const res = await fetchWithTimeout(endpoint, { - method: "POST", - headers: { - "Steel-Api-Key": config.steel.apiKey, - "Content-Type": "application/json", - }, - body: JSON.stringify(body), - }); +export async function scrapeUrlToMarkdown( + url: string, +): Promise { + try { + const client = new Steel({ + steelAPIKey: config.steel.apiKey, + timeout: config.requestTimeoutMs, + }); - if (!res.ok) { - const text = await res.text().catch(() => ""); - console.error("Steel.dev scrape failed", { - status: res.status, - statusText: res.statusText, + const res = await client.scrape({ url, - response: text?.slice(0, 1000), + format: ["markdown"], }); - throw new Error( - `Steel.dev scrape failed for ${url}: ${res.status} ${res.statusText}`, - ); - } - const payload = (await res.json()) as SteelScrapeResponse; - const markdown = payload?.content?.markdown; - const links = payload?.links; + const markdown = res?.content?.markdown; + const links = res?.links; - if (!markdown) { - console.warn("Steel.dev response did not include recognizable markdown", { - url, - payload: JSON.stringify(payload).slice(0, 1000), - }); - throw new Error(`Steel.dev response missing markdown content for ${url}`); - } + if (!markdown) { + throw new Error(`Steel.dev response missing markdown content for ${url}`); + } - return { url, markdown, links }; + return { url, markdown, links }; + } catch { + return null; + } } ``` @@ -468,6 +342,7 @@ Step 3: Synthesize an answer with inline citations export async function synthesizeWithCitations( input: SynthesisInput, ): Promise { + const spinner = ora("Synthesizing answer...").start(); // Build context block const contextHeader = "Context materials (each item shows [index] and URL, followed by markdown content)"; @@ -477,11 +352,39 @@ export async function synthesizeWithCitations( contextLines.push(`\n[${idx}] ${m.url}\n---\n${m.markdown}\n`); }); - const system = ` You are Perplexity, ... `; + const now = new Date(); + + // Day of week, month, day, year + const dateFormatter = new Intl.DateTimeFormat("en-NZ", { + weekday: "long", + month: "long", + day: "2-digit", + year: "numeric", + timeZone: "Pacific/Auckland", + }); + + // Time with hour + timezone abbreviation + const timeFormatter = new Intl.DateTimeFormat("en-NZ", { + hour: "numeric", + minute: "2-digit", + hour12: true, + timeZone: "Pacific/Auckland", + timeZoneName: "short", // gives "NZDT" + }); + + const dateStr = dateFormatter.format(now); + const timeStr = timeFormatter.format(now); + + // Combine + remove the minutes (":00") if you want "7 PM" instead of "7:00 PM" + const final = `${dateStr}, ${timeStr.replace(/:00/, "")}`; + + const system = ` You are ...` const user = [`User query: ${input.query}`, "", contextLines.join("\n")].join( "\n", ); + let answer = ""; + let started = false; const completion = await openai.chat.completions.create({ model: config.openai.model, @@ -489,15 +392,28 @@ export async function synthesizeWithCitations( { role: "system", content: system }, { role: "user", content: user }, ], + stream: true, }); - const answer = completion.choices?.[0]?.message?.content?.trim() ?? 
""; + for await (const chunk of completion) { + const content = chunk.choices[0]?.delta?.content; + if (content) { + if (!started) { + started = true; + spinner.succeed("Answer synthesized"); + process.stdout.write("\n"); + } + answer += content; + process.stdout.write(content); + } + } // Collect sources in index order for convenience const sources = input.materials.map((m, i) => ({ index: i + 1, url: m.url })); - console.info("Synthesis complete", { - answerPreview: answer.slice(0, 160), + console.log("\n\nSources:"); + sources.forEach((source) => { + console.log(`[${source.index}] ${source.url}`); }); return { @@ -511,20 +427,17 @@ export async function synthesizeWithCitations( Run and interpret the output ---------------------------- -After `npm start`, the script logs a JSON result like: - -```json -{ - "query": "What are the latest improvements in WebAssembly and their benefits?", - "answer": "Recent WebAssembly updates improved component model support and tooling, enabling easier interop and faster iterations.[1][2] These changes reduce bundle size, improve portability, and speed up non‑JS language performance across platforms.[2][3]", - "citations": [ - { "index": 1, "url": "https://example.com/article-1" }, - { "index": 2, "url": "https://example.com/article-2" }, - { "index": 3, "url": "https://example.com/article-3" } - ], - "model": "gpt-5-nano", - "meta": { "tookMs": 12345 } -} +After `npm start`, the script logs the result step-by-step: + +```bash +✔ Search complete +✔ Scraping complete +✔ Answer synthesized + +Prediction markets offer a practical way to hedge specific risks and to add liquidity to broader market positions by turning uncertain outcomes into tradable, cash-settled contracts. Their price signals aggregate diverse information in real time, creating hedging tools and a more liquid trading environment than many traditional markets. [1] +... + +## How prediction markets create hedging opportunities ``` Tuning and tips @@ -535,11 +448,8 @@ Tuning and tips - `CONCURRENCY` controls how many pages you scrape at once. - Respect rate limits - - Steel Hobby plan allows ~20 requests/min. To add an actual delay between scrapes, replace the no-op `setTimeout(() => {}, config.steel.timeout)` with an awaited delay: -```typescript -// Replace this in scrapeUrlsToMarkdown's worker loop: -await new Promise((r) => setTimeout(r, config.steel.timeout)); -``` +- Steel Hobby plan allows ~20 requests/min. To add an actual delay between scrapes, replace the no-op `setTimeout(() => {}, config.steel.timeout)` with an awaited delay: + - Timeouts - `REQUEST_TIMEOUT_MS` applies to both Brave and Steel requests. From f46e86660c64143297fdfccf0847d409d76ea406 Mon Sep 17 00:00:00 2001 From: aspectrr Date: Wed, 3 Dec 2025 07:13:57 -0500 Subject: [PATCH 4/4] fix: update perplexity docs --- content/docs/overview/guides/perplexity.mdx | 62 ++++++++------------- 1 file changed, 24 insertions(+), 38 deletions(-) diff --git a/content/docs/overview/guides/perplexity.mdx b/content/docs/overview/guides/perplexity.mdx index e4f71617..876c4e68 100644 --- a/content/docs/overview/guides/perplexity.mdx +++ b/content/docs/overview/guides/perplexity.mdx @@ -64,7 +64,6 @@ OPENAI_MODEL=gpt-5-nano # Steel.dev STEEL_API_KEY=steel_... -STEEL_SCRAPE_ENDPOINT=https://api.steel.dev/v1/scrape # Brave Search BRAVE_API_KEY=brv_... @@ -87,23 +86,18 @@ What this example does At a high level: -1) Generate multiple targeted queries for better coverage -- Uses OpenAI to turn the user query into 3 high‑signal search queries +1. 
Search Brave for relevant URLs
 
-2) Search and rank URLs with Brave
-- Calls Brave’s Web Search API for each generated query
-- Aggregates and ranks URLs using a reciprocal-rank strategy
+2. Scrape sources to Markdown with Steel
+   - Sends each URL to Steel’s `/v1/scrape` to obtain clean Markdown
 
-3) Scrape sources to Markdown with Steel
-- Sends each URL to Steel’s `/v1/scrape` to obtain clean Markdown
+3. Synthesize a well‑cited answer with OpenAI
+   - Builds a context block from scraped Markdown
+   - Instructs the model to produce inline [n] citations, matching the material order
 
-4) Synthesize a well‑cited answer with OpenAI
-- Builds a context block from scraped Markdown
-- Instructs the model to produce inline [n] citations, matching the material order
 
 The core orchestration happens here:
 
-```typescript
+```typescript Typescript -wcn -f index.ts
 import { config } from "./config";
 import {
   scrapeUrlsToMarkdown,
   synthesizeWithCitations,
-  multiQueryBraveSearch,
+  singleQueryBraveSearch,
 } from "./clients";
@@ -177,14 +171,12 @@ main()
   });
 ```
 
-Step 1: Generate richer search coverage
+Step 1: Get relevant URLs
 ---------------------------------------
 
-- The example asks OpenAI to produce 3 specific queries that maximize recall and signal.
-- It then calls Brave Search for each query, pausing briefly between calls.
-- Results are aggregated and ranked by frequency and reciprocal rank.
+- The example calls the Brave API to retrieve relevant URLs for the user query.
 
-```typescript
+```typescript Typescript -wcn
 export async function singleQueryBraveSearch(
   userQuery: string,
   topKPerQuery = config.search.topK,
 
 Under the hood, the Brave call itself looks like this:
 
-```typescript
+```typescript Typescript -wcn
 export async function searchTopRelevantUrls(
   query: string,
   topK = config.search.topK,
 
 Step 2: Scrape each URL to Markdown with Steel
 ----------------------------------------------
 
-- For each URL, POST to Steel’s scrape endpoint.
+- For each URL, make a request to Steel's `/v1/scrape` endpoint.
 - Request Markdown by setting `format: ["markdown"]`.
-- The response contains `content.markdown`, `links`, and metadata.
+- The response contains `content.markdown` and metadata.
 
-```typescript
+```typescript Typescript -wcn
 export async function scrapeUrlToMarkdown(
   url: string,
 ): Promise {
 
 Step 3: Synthesize an answer with inline citations
 --------------------------------------------------
 
 - Build a context that enumerates materials like `[1] URL`, then the Markdown.
 - Prompt the model to cite with `[n]` as it writes.
 - Return an answer plus a `sources` array mapping `[n] -> url`.
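
Because the `[n]` markers come back as plain text, it is worth checking them against the source list after synthesis; here is a small sketch of such a post-check (a hypothetical helper, not part of the example code):

```typescript Typescript -wcn
// Collect the distinct [n] indices cited in an answer and flag any that
// don't correspond to a scraped material.
function auditCitations(answer: string, sourceCount: number) {
  const used = new Set<number>();
  for (const match of answer.matchAll(/\[(\d+)\]/g)) {
    used.add(Number(match[1]));
  }
  const unknown = [...used].filter((n) => n < 1 || n > sourceCount);
  return { used: [...used].sort((a, b) => a - b), unknown };
}
```

Anything in `unknown` points at a citation the model invented. The synthesis function itself: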
-```typescript +```typescript Typescript -wcn export async function synthesizeWithCitations( input: SynthesisInput, ): Promise { @@ -419,7 +405,6 @@ export async function synthesizeWithCitations( return { answer, sources, - _raw: completion, }; } ``` @@ -429,15 +414,18 @@ Run and interpret the output After `npm start`, the script logs the result step-by-step: -```bash +``` ✔ Search complete ✔ Scraping complete ✔ Answer synthesized -Prediction markets offer a practical way to hedge specific risks and to add liquidity to broader market positions by turning uncertain outcomes into tradable, cash-settled contracts. Their price signals aggregate diverse information in real time, creating hedging tools and a more liquid trading environment than many traditional markets. [1] -... - -## How prediction markets create hedging opportunities +## Prediction Markets +Prediction markets offer a practical way to +hedge specific risks and to add liquidity to broader +market positions by turning uncertain outcomes into tradable, +cash-settled contracts. Their price signals aggregate diverse +information in real time, creating hedging tools and a more +liquid trading environment than many traditional markets. [1] ... ``` Tuning and tips @@ -448,7 +436,7 @@ Tuning and tips - `CONCURRENCY` controls how many pages you scrape at once. - Respect rate limits -- Steel Hobby plan allows ~20 requests/min. To add an actual delay between scrapes, replace the no-op `setTimeout(() => {}, config.steel.timeout)` with an awaited delay: +- Steel Hobby plan allows ~20 requests/min. - Timeouts @@ -458,7 +446,6 @@ Tuning and tips - Use `OPENAI_MODEL` to choose a cost-effective model for both query generation and synthesis. - Debugging - - The code returns `_raw` payloads in some helpers to aid troubleshooting. - Log the ranked URL list before scraping if you need to inspect relevance. Example project @@ -471,7 +458,6 @@ What to customize next - Swap Brave for another Search API if you prefer - Add caching for search and scrapes -- Stream synthesis tokens for a live UI - Persist answers and materials to a database - Filter sources by domain whitelist/blacklist