diff --git a/src/providers/firecrawl.test.ts b/src/providers/firecrawl.test.ts index b74ad50..4470a94 100644 --- a/src/providers/firecrawl.test.ts +++ b/src/providers/firecrawl.test.ts @@ -168,3 +168,46 @@ test("createFirecrawlProvider fetches each URL via /scrape and preserves per-url }, ]); }); + +test("createFirecrawlProvider limits concurrent scrape requests", async () => { + let active = 0; + let maxActive = 0; + + const provider = createFirecrawlProvider(cloudConfig, async (_url, init) => { + active += 1; + maxActive = Math.max(maxActive, active); + + const body = JSON.parse(String(init?.body)); + await new Promise((resolve) => setTimeout(resolve, 10)); + + active -= 1; + return new Response( + JSON.stringify({ + success: true, + data: { + metadata: { + title: body.url, + sourceURL: body.url, + }, + markdown: `Fetched ${body.url}`, + }, + }), + { status: 200 }, + ); + }); + + const urls = [ + "https://a.example", + "https://b.example", + "https://c.example", + "https://d.example", + "https://e.example", + "https://f.example", + "https://g.example", + ]; + + const result = await provider.fetch({ urls }); + + assert.equal(result.results.length, urls.length); + assert.ok(maxActive <= 4, `expected max concurrency <= 4, got ${maxActive}`); +}); diff --git a/src/providers/firecrawl.ts b/src/providers/firecrawl.ts index 216ef92..52d6d59 100644 --- a/src/providers/firecrawl.ts +++ b/src/providers/firecrawl.ts @@ -9,6 +9,7 @@ import type { } from "./types.ts"; const DEFAULT_FIRECRAWL_BASE_URL = "https://api.firecrawl.dev/v2"; +const DEFAULT_FIRECRAWL_FETCH_CONCURRENCY = 4; type FirecrawlSearchPayload = { id?: string; @@ -124,6 +125,26 @@ function resolveFetchFormats(request: NormalizedFetchRequest) { return uniqueFormats(formats.length > 0 ? formats : ["markdown"]); } +async function mapWithConcurrency( + items: TItem[], + concurrency: number, + iteratee: (item: TItem) => Promise, +): Promise { + const results = new Array(items.length); + let nextIndex = 0; + + const workers = Array.from({ length: Math.max(1, Math.min(concurrency, items.length)) }, async () => { + while (nextIndex < items.length) { + const currentIndex = nextIndex; + nextIndex += 1; + results[currentIndex] = await iteratee(items[currentIndex]!); + } + }); + + await Promise.all(workers); + return results; +} + function pickRequestId(payload: { id?: string; request_id?: string }) { return typeof payload.id === "string" ? payload.id @@ -180,37 +201,35 @@ export function createFirecrawlProvider( validateFirecrawlFetchRequest(config.name, request); const formats = resolveFetchFormats(request); - const results = await Promise.all( - request.urls.map(async (url) => { - try { - const payload = await postJson({ - providerName: config.name, - baseUrl, - path: "/scrape", - apiKey: config.apiKey, - fetchImpl, - body: { - url, - formats, - }, - }); - - return { - url: payload.data?.metadata?.sourceURL ?? url, - title: payload.data?.metadata?.title ?? payload.data?.title ?? null, - text: typeof payload.data?.markdown === "string" ? payload.data.markdown : undefined, - summary: typeof payload.data?.summary === "string" ? payload.data.summary : undefined, - images: Array.isArray(payload.data?.images) ? payload.data.images : undefined, - }; - } catch (error) { - return { + const results = await mapWithConcurrency(request.urls, DEFAULT_FIRECRAWL_FETCH_CONCURRENCY, async (url) => { + try { + const payload = await postJson({ + providerName: config.name, + baseUrl, + path: "/scrape", + apiKey: config.apiKey, + fetchImpl, + body: { url, - title: null, - error: (error as Error).message, - }; - } - }), - ); + formats, + }, + }); + + return { + url: payload.data?.metadata?.sourceURL ?? url, + title: payload.data?.metadata?.title ?? payload.data?.title ?? null, + text: typeof payload.data?.markdown === "string" ? payload.data.markdown : undefined, + summary: typeof payload.data?.summary === "string" ? payload.data.summary : undefined, + images: Array.isArray(payload.data?.images) ? payload.data.images : undefined, + }; + } catch (error) { + return { + url, + title: null, + error: error instanceof Error ? error.message : String(error), + }; + } + }); return { providerName: config.name, diff --git a/src/providers/http.test.ts b/src/providers/http.test.ts new file mode 100644 index 0000000..caa20a5 --- /dev/null +++ b/src/providers/http.test.ts @@ -0,0 +1,24 @@ +import test from "node:test"; +import assert from "node:assert/strict"; +import { postJson } from "./http.ts"; + +test("postJson surfaces invalid JSON responses with HTTP context", async () => { + await assert.rejects( + () => + postJson({ + providerName: "firecrawl-main", + baseUrl: "https://api.firecrawl.dev/v2", + path: "/search", + body: { query: "pi docs" }, + fetchImpl: async () => + new Response("not json", { + status: 200, + statusText: "OK", + headers: { + "content-type": "text/html", + }, + }), + }), + /Provider "firecrawl-main" HTTP 200 OK: invalid JSON response: not json<\/html>/, + ); +}); diff --git a/src/providers/http.ts b/src/providers/http.ts index 93f6cea..02177f5 100644 --- a/src/providers/http.ts +++ b/src/providers/http.ts @@ -15,11 +15,13 @@ export function joinApiUrl(baseUrl: string, path: string) { return `${normalizedBaseUrl}${normalizedPath}`; } +function formatHttpErrorMessage(providerName: string, response: Response, text: string) { + return `Provider "${providerName}" HTTP ${response.status} ${response.statusText}: ${text.slice(0, 300)}`; +} + export async function readHttpError(providerName: string, response: Response): Promise { const text = await response.text(); - throw new Error( - `Provider "${providerName}" HTTP ${response.status} ${response.statusText}: ${text.slice(0, 300)}`, - ); + throw new Error(formatHttpErrorMessage(providerName, response, text)); } export async function postJson({ @@ -44,9 +46,14 @@ export async function postJson({ body: JSON.stringify(body), }); + const text = await response.text(); if (!response.ok) { - await readHttpError(providerName, response); + throw new Error(formatHttpErrorMessage(providerName, response, text)); } - return (await response.json()) as T; + try { + return JSON.parse(text) as T; + } catch { + throw new Error(formatHttpErrorMessage(providerName, response, `invalid JSON response: ${text}`)); + } } diff --git a/src/providers/registry.ts b/src/providers/registry.ts index 5a34c5c..d2fbd5d 100644 --- a/src/providers/registry.ts +++ b/src/providers/registry.ts @@ -107,8 +107,16 @@ export function getProviderDescriptor(provider: Pick