From c687250bc3c67d151a56cf65ed67a8739877a9f7 Mon Sep 17 00:00:00 2001 From: anil-bd Date: Mon, 18 May 2026 22:09:15 +0200 Subject: [PATCH 1/2] feat(scraper-create): write {collector_id, ...} envelope to -o MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Today `-o create.json` receives only the final AI-progress payload — no `collector_id`, no name, no view_url. The documented recipe in references/recipes.md depends on jq reading the collector_id out of that file: COLLECTOR_ID=$(jq -r '.collector_id // .id' create.json) bdata scraper run "$COLLECTOR_ID" ... Today that returns the string "null" because the field doesn't exist in the file. Every script that follows the docs to chain create → run is silently broken. This change wraps every termination path (success, AI-trigger failure, status=failed, polling exception) in one envelope: { "collector_id": "c_...", "name": "audit-r4-...", "status": "done" | "failed" | "ai_trigger_failed" | "poll_failed", "completed_steps": [...], "view_url": "https://brightdata.com/cp/scrapers/c_...", "created_at": "2026-05-18T07:28:30Z", "error": "..." // failure paths only } Notable design choices: * Every termination path writes the same shape, including failure paths that previously wrote nothing. So a script using `jq -r '.collector_id'` always recovers an id when one exists — even from a stub collector that hit the AI-Flow parallel-job cap. This makes good on SKILL.md's promise that every failure path surfaces the collector_id. * `view_url` is included on every envelope so the user has a one-click recovery path to inspect / finish / delete the scraper in the dashboard, without needing to know the URL pattern. * `created_at` is taken from the template-creation response when the API provides it (`Create_template_response.created`), omitted otherwise — never invented. 
* New `--legacy-output` flag preserves today's bare-progress shape for one minor version so any existing scripts that depended on the old shape have a migration window. Slated for removal in the next major release. * Stdout (the success summary printed to TTY) is unchanged. Only the machine-readable `-o` / `--json` / `--pretty` payload is reshaped. * Scoped strictly to `src/commands/scraper.ts` and the new envelope type. The shared HTTP client and other commands (scrape, search, discover, pipelines, browser) are untouched. Tests: 4 new `build_create_envelope` unit cases covering success, omitted-created_at, failure-with-error, and view_url-on-every-path. 6 new `handle_create_scraper` integration cases covering success envelope, the documented jq recipe, --legacy-output preserving the bare shape, AI-trigger failure envelope (the stub-collector recovery path), poll-status-failed envelope, and poll-exception envelope. Two existing tests updated from strict payload matches to objectContaining-style (the contract is now the envelope shape, not the bare payload). 55 / 55 scraper tests pass. The 9 pre-existing failures in unrelated suites (daemon, add-mcp, browser, discover, scrape) on main are unchanged by this PR. Spec: brightdata/skills repo, proposal at skills/scraper-studio/proposals/PR-2-create-envelope.md (to be filed alongside this PR). 
--- README.md | 38 +++- src/__tests__/commands/scraper.test.ts | 229 ++++++++++++++++++++++++- src/commands/scraper.ts | 105 +++++++++++- src/types/scraper.ts | 18 ++ 4 files changed, 379 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 61333d2..26fb1b9 100644 --- a/README.md +++ b/README.md @@ -319,13 +319,42 @@ brightdata scraper create [options] | `--name ` | Scraper template name (default: `cli-scraper-`) | | `--deliver-webhook ` | Webhook URL for the deliver stub (default: `https://example.com/webhook`) | | `--timeout ` | Polling timeout in seconds (default: `600`) | -| `-o, --output ` | Write output to file | +| `-o, --output ` | Write the JSON envelope to a file (see below) | | `--json` / `--pretty` | JSON output (raw / indented) | +| `--legacy-output` | Write the pre-v0.3 bare AI-progress payload to `-o` instead of the envelope. Migration only. | | `--timing` | Show request timing | | `-k, --api-key ` | Override API key | > **Note:** The scraper is created with a placeholder webhook delivery target (`https://example.com/webhook`). You can reconfigure the actual delivery endpoint in the [Bright Data web UI](https://brightdata.com/cp/scrapers) after creation. +#### Output envelope (`-o create.json`) + +Every termination path — success or failure — writes the same JSON envelope shape: + +```json +{ + "collector_id": "c_mp7x8a9b2c0d1e2f", + "name": "my-product-scraper", + "status": "done", + "completed_steps": ["prepare_intent_analyzer", "planner", "..."], + "view_url": "https://brightdata.com/cp/scrapers/c_mp7x8a9b2c0d1e2f", + "created_at": "2026-05-18T07:28:30Z" +} +``` + +On failure paths the envelope adds an `error` field and the `status` reflects the failure category (`ai_trigger_failed`, `failed`, `poll_failed`). The `collector_id` and `view_url` are still present so you can recover or inspect the half-built scraper. 
+ +This makes the documented chain in [recipes.md](https://github.com/brightdata/skills/blob/main/skills/scraper-studio/references/recipes.md) work as written: + +```bash +brightdata scraper create https://example.com/product/1 "..." \ + --pretty -o create.json +COLLECTOR_ID=$(jq -r '.collector_id' create.json) +brightdata scraper run "$COLLECTOR_ID" https://example.com/product/2 +``` + +Use `--legacy-output` if you have an existing script that depended on the pre-v0.3 bare-progress shape; the flag is supported for one minor version while you migrate. + **Examples** ```bash @@ -333,10 +362,13 @@ brightdata scraper create [options] brightdata scraper create https://example.com/product/1 \ "Extract title, price, and image URL from this product page" -# Name the scraper and save the full AI output to a file +# Name the scraper and save the envelope to a file brightdata scraper create https://example.com/product/1 \ "Extract title, price, and image URL from this product page" \ - --name my-product-scraper --pretty -o scraper-output.json + --name my-product-scraper --pretty -o create.json + +# Capture the collector_id for chaining +COLLECTOR_ID=$(jq -r '.collector_id' create.json) # Use a custom webhook delivery URL brightdata scraper create https://example.com/product/1 \ diff --git a/src/__tests__/commands/scraper.test.ts b/src/__tests__/commands/scraper.test.ts index 39e7e0c..868cbdd 100644 --- a/src/__tests__/commands/scraper.test.ts +++ b/src/__tests__/commands/scraper.test.ts @@ -50,6 +50,7 @@ import { extract_progress_status, format_create_summary, handle_create_scraper, + build_create_envelope, handle_run_scraper, build_run_request, build_run_query, @@ -149,6 +150,218 @@ describe('commands/scraper', ()=>{ }); }); + // PR-2: the envelope contract is the whole point of the PR. + // Lock the shape, the failure-path semantics, and the legacy + // escape hatch in one place. 
+ describe('build_create_envelope (PR-2)', ()=>{ + it('returns the documented success shape', ()=>{ + const env = build_create_envelope({ + collector_id: 'c_xyz', + name: 'product-v1', + status: 'done', + progress: {status: 'done', + completed_steps: ['a', 'b', 'c']}, + created_at: '2026-05-18T07:28:30Z', + }); + expect(env).toEqual({ + collector_id: 'c_xyz', + name: 'product-v1', + status: 'done', + completed_steps: ['a', 'b', 'c'], + view_url: 'https://brightdata.com/cp/scrapers/c_xyz', + created_at: '2026-05-18T07:28:30Z', + }); + }); + + it('omits created_at when not known', ()=>{ + const env = build_create_envelope({ + collector_id: 'c_xyz', + name: 'n', + status: 'done', + progress: {status: 'done', completed_steps: []}, + }); + expect(env).not.toHaveProperty('created_at'); + }); + + it('records the error message and partial steps on failure', + ()=>{ + const env = build_create_envelope({ + collector_id: 'c_xyz', + name: 'n', + status: 'ai_trigger_failed', + error: 'Cannot run more than 3 jobs in parallel', + }); + expect(env.collector_id).toBe('c_xyz'); + expect(env.status).toBe('ai_trigger_failed'); + expect(env.error).toMatch(/parallel/); + expect(env.completed_steps).toEqual([]); + // view_url remains useful even on failure so the user + // can inspect the stub collector in the dashboard. 
+ expect(env.view_url) + .toBe('https://brightdata.com/cp/scrapers/c_xyz'); + }); + + it('still includes view_url on every termination path', ()=>{ + for (const status of ['done', 'failed', 'ai_trigger_failed', + 'poll_failed']) + { + const env = build_create_envelope({ + collector_id: 'c_xyz', name: 'n', status, + }); + expect(env.view_url) + .toBe('https://brightdata.com/cp/scrapers/c_xyz'); + } + }); + }); + + describe('handle_create_scraper envelope output (PR-2)', ()=>{ + const setup_success = ()=>{ + mocks.post + .mockResolvedValueOnce({ + id: 'c_xyz', name: 'product-v1', + created: '2026-05-18T07:28:30Z', + }) + .mockResolvedValueOnce({id: 'ia_xyz', queued: false}); + mocks.poll_until.mockResolvedValue({ + result: {status: 'done', + completed_steps: ['a', 'b', 'c']}, + attempts: 4, + }); + }; + + it('writes the new envelope to -o on success', async()=>{ + setup_success(); + await handle_create_scraper( + 'https://x.com/p', 'd', + {output: 'create.json', pretty: true} + ); + expect(mocks.print).toHaveBeenCalledWith( + expect.objectContaining({ + collector_id: 'c_xyz', + name: 'product-v1', + status: 'done', + completed_steps: ['a', 'b', 'c'], + view_url: 'https://brightdata.com/cp/scrapers/c_xyz', + created_at: '2026-05-18T07:28:30Z', + }), + expect.objectContaining({output: 'create.json'}) + ); + }); + + it('the documented `jq -r .collector_id` recipe works on the ' + +'envelope', async()=>{ + // The bug PR-2 is fixing — yesterday this returned `null`. 
+ setup_success(); + await handle_create_scraper('https://x.com/p', 'd', + {output: 'create.json'}); + const written = mocks.print.mock.calls[0][0] as { + collector_id?: string}; + expect(written.collector_id).toBe('c_xyz'); + }); + + it('--legacy-output preserves the bare progress payload', + async()=>{ + setup_success(); + await handle_create_scraper( + 'https://x.com/p', 'd', + {output: 'create.json', legacyOutput: true} + ); + const written = mocks.print.mock.calls[0][0] as { + collector_id?: unknown; status?: string}; + // Bare progress shape today: status + completed_steps, + // NO collector_id, NO view_url. + expect(written.collector_id).toBeUndefined(); + expect(written).not.toHaveProperty('view_url'); + expect(written.status).toBe('done'); + }); + + it('writes the envelope when AI trigger fails (stub-collector ' + +'recovery path)', async()=>{ + mocks.post + .mockResolvedValueOnce({id: 'c_stub', name: 'n'}) + .mockRejectedValueOnce( + new Error('Cannot run more than 3 jobs in parallel')); + const exit = vi.spyOn(process, 'exit') + .mockImplementation(()=>undefined as never); + const error = vi.spyOn(console, 'error') + .mockImplementation(()=>{}); + await handle_create_scraper( + 'https://x.com/p', 'd', + {output: 'create.json'} + ); + expect(mocks.print).toHaveBeenCalledWith( + expect.objectContaining({ + collector_id: 'c_stub', + status: 'ai_trigger_failed', + error: expect.stringMatching(/parallel/), + view_url: 'https://brightdata.com/cp/scrapers/c_stub', + }), + expect.objectContaining({output: 'create.json'}) + ); + exit.mockRestore(); + error.mockRestore(); + }); + + it('writes the envelope when poll returns status != done', + async()=>{ + mocks.post + .mockResolvedValueOnce({id: 'c_abc', name: 'n'}) + .mockResolvedValueOnce({id: 'ia_xyz', queued: false}); + mocks.poll_until.mockResolvedValue({ + result: {status: 'failed', + completed_steps: ['planner']}, + attempts: 2, + }); + const exit = vi.spyOn(process, 'exit') + 
.mockImplementation(()=>undefined as never); + const error = vi.spyOn(console, 'error') + .mockImplementation(()=>{}); + await handle_create_scraper( + 'https://x.com/p', 'd', + {output: 'create.json'} + ); + expect(mocks.print).toHaveBeenCalledWith( + expect.objectContaining({ + collector_id: 'c_abc', + status: 'failed', + completed_steps: ['planner'], + error: expect.stringMatching(/finished with status/), + }), + expect.objectContaining({output: 'create.json'}) + ); + exit.mockRestore(); + error.mockRestore(); + }); + + it('writes the envelope when polling itself throws (timeout ' + +'or network)', async()=>{ + mocks.post + .mockResolvedValueOnce({id: 'c_abc', name: 'n'}) + .mockResolvedValueOnce({id: 'ia_xyz', queued: false}); + mocks.poll_until.mockRejectedValue( + new Error( + 'Timeout after 600 seconds waiting for AI generation')); + const exit = vi.spyOn(process, 'exit') + .mockImplementation(()=>undefined as never); + const error = vi.spyOn(console, 'error') + .mockImplementation(()=>{}); + await handle_create_scraper( + 'https://x.com/p', 'd', + {output: 'create.json'} + ); + expect(mocks.print).toHaveBeenCalledWith( + expect.objectContaining({ + collector_id: 'c_abc', + status: 'poll_failed', + error: expect.stringMatching(/Timeout/), + }), + expect.objectContaining({output: 'create.json'}) + ); + exit.mockRestore(); + error.mockRestore(); + }); + }); + describe('handle_create_scraper', ()=>{ it('chains create → trigger → poll and prints JSON in non-TTY', async()=>{ @@ -194,8 +407,17 @@ describe('commands/scraper', ()=>{ timeout_label: expect.stringContaining('c_abc'), }) ); + // PR-2: -o now writes an envelope with collector_id, + // not the raw progress payload. The documented + // `jq -r '.collector_id'` recipe depends on this. 
expect(mocks.print).toHaveBeenCalledWith( - progress, + expect.objectContaining({ + collector_id: 'c_abc', + name: 'cli-scraper-1', + status: 'done', + completed_steps: ['a', 'b'], + view_url: 'https://brightdata.com/cp/scrapers/c_abc', + }), {json: undefined, pretty: undefined, output: undefined} ); }); @@ -209,7 +431,10 @@ describe('commands/scraper', ()=>{ result: progress, attempts: 1}); await handle_create_scraper('https://x.com', 'd', {json: true}); expect(mocks.print).toHaveBeenCalledWith( - progress, + expect.objectContaining({ + collector_id: 'c_abc', + status: 'done', + }), {json: true, pretty: undefined, output: undefined} ); }); diff --git a/src/commands/scraper.ts b/src/commands/scraper.ts index 9ab4400..73e4963 100644 --- a/src/commands/scraper.ts +++ b/src/commands/scraper.ts @@ -12,6 +12,7 @@ import type { Trigger_ai_response, Ai_progress_response, Scraper_create_opts, + Create_envelope, Run_request, Trigger_immediate_response, Scraper_run_opts, @@ -83,6 +84,43 @@ const format_create_summary = ( return lines.join('\n'); }; +// PR-2: every termination path of `scraper create` writes this same +// envelope shape to -o. Solves the broken `jq -r '.collector_id'` +// recipe in references/recipes.md (today's -o file contains only the +// final progress payload, with no id field). +const build_create_envelope = (params: { + collector_id: string; + name: string; + status: string; + progress?: Ai_progress_response; + created_at?: string; + error?: string; +}): Create_envelope=>({ + collector_id: params.collector_id, + name: params.name, + status: params.status, + completed_steps: params.progress?.completed_steps ?? [], + view_url: `https://brightdata.com/cp/scrapers/${params.collector_id}`, + ...(params.created_at ? {created_at: params.created_at} : {}), + ...(params.error ? {error: params.error} : {}), +}); + +// Write the envelope (or, in --legacy-output mode, the bare progress +// payload) to wherever the user asked. 
Centralised so success and +// every failure path share one I/O code path. +const emit_create_output = ( + envelope: Create_envelope, + progress: Ai_progress_response|null, + opts: Scraper_create_opts +): void=>{ + const print_opts = {json: opts.json, pretty: opts.pretty, + output: opts.output}; + const payload = opts.legacyOutput && progress + ? (progress as unknown) : envelope; + if (opts.json || opts.pretty || opts.output || !is_tty) + print(payload, print_opts); +}; + const handle_create_scraper = async( url: string, description: string, @@ -100,6 +138,7 @@ const handle_create_scraper = async( const create_spinner = start_spinner('Creating scraper template...'); let collector_id = ''; let scraper_name = template_body.name; + let created_at: string|undefined; try { const template = await post( api_key, @@ -110,11 +149,14 @@ const handle_create_scraper = async( create_spinner.stop(); if (!template.id) { + // Template POST didn't return an id — no collector_id to + // envelope, so no -o file to write. Same as today. fail('Failed to create scraper template (missing id).'); return; } collector_id = template.id; scraper_name = template.name ?? template_body.name; + created_at = template.created; console.error(dim(`Template created: ${collector_id}`)); } catch(e) { create_spinner.stop(); @@ -134,9 +176,23 @@ const handle_create_scraper = async( trigger_spinner.stop(); } catch(e) { trigger_spinner.stop(); + const msg = (e as Error).message; console.error( `Failed to start AI generation for collector ` - +`${collector_id}: ${(e as Error).message}` + +`${collector_id}: ${msg}` + ); + // PR-2: write the envelope even on failure so the user's + // automation can read collector_id + status from the file. 
+ emit_create_output( + build_create_envelope({ + collector_id, + name: scraper_name, + status: 'ai_trigger_failed', + created_at, + error: msg, + }), + null, + opts ); process.exit(1); return; @@ -171,16 +227,36 @@ const handle_create_scraper = async( `AI generation failed (collector ${collector_id}, ` +`status: ${progress.status}).` ); + emit_create_output( + build_create_envelope({ + collector_id, + name: scraper_name, + status: progress.status, + progress, + created_at, + error: `AI generation finished with status ` + +`"${progress.status}".`, + }), + progress, + opts + ); process.exit(1); return; } - const print_opts = {json: opts.json, pretty: opts.pretty, - output: opts.output}; + // Success path. + emit_create_output( + build_create_envelope({ + collector_id, + name: scraper_name, + status: progress.status, + progress, + created_at, + }), + progress, + opts + ); if (opts.json || opts.pretty || opts.output || !is_tty) - { - print(progress, print_opts); return; - } success(format_create_summary( collector_id, scraper_name, progress)); } catch(e) { @@ -189,6 +265,17 @@ const handle_create_scraper = async( const suffix = msg.includes(collector_id) ? '' : ` (collector ${collector_id})`; console.error(`${msg}${suffix}`); + emit_create_output( + build_create_envelope({ + collector_id, + name: scraper_name, + status: 'poll_failed', + created_at, + error: msg, + }), + null, + opts + ); process.exit(1); return; } @@ -547,6 +634,10 @@ const create_subcommand = new Command('create') .option('-o, --output ', 'Write output to file') .option('--json', 'Force JSON output') .option('--pretty', 'Pretty-print JSON output') + .option('--legacy-output', + 'Emit the bare AI-progress payload (pre-v0.3 shape) instead ' + +'of the new {collector_id, name, status, ...} envelope. 
' + +'For one-version migration only.') .option('--timing', 'Show request timing') .option('-k, --api-key ', 'Override API key') .action(handle_create_scraper); @@ -584,6 +675,8 @@ export { build_ai_request, extract_progress_status, format_create_summary, + build_create_envelope, + emit_create_output, handle_run_scraper, build_run_request, build_run_query, diff --git a/src/types/scraper.ts b/src/types/scraper.ts index 74dbbcf..8ddabe9 100644 --- a/src/types/scraper.ts +++ b/src/types/scraper.ts @@ -45,6 +45,23 @@ type Scraper_create_opts = { pretty?: boolean; timing?: boolean; apiKey?: string; + // PR-2: when true, write the bare AI-progress payload to -o + // (today's shape) instead of the new envelope. One-version + // migration flag. + legacyOutput?: boolean; +}; + +// PR-2: machine-readable envelope written to -o on every termination +// path of `scraper create`. Replaces the previous bare-progress +// payload so the documented `jq -r '.collector_id'` recipe works. +type Create_envelope = { + collector_id: string; + name: string; + status: string; + completed_steps: string[]; + view_url: string; + created_at?: string; + error?: string; }; type Run_request = { @@ -92,6 +109,7 @@ export type { Trigger_ai_response, Ai_progress_response, Scraper_create_opts, + Create_envelope, Run_request, Trigger_immediate_response, Sync_timeout_response, From b7a9552c181c7b064a9d657b3e6a70a96453fd68 Mon Sep 17 00:00:00 2001 From: meirk-brd Date: Tue, 26 May 2026 12:59:36 +0300 Subject: [PATCH 2/2] refactor(scraper-create): make failed status terminal, dedupe emit, clean envelope errors - extract_progress_status treats failed/error/cancelled as terminal so a failed AI generation surfaces immediately instead of polling until timeout; the status:'failed' envelope branch is now reachable. - emit_create_output returns whether it printed; the success path branches on that via wants_machine_output, so the json/pretty/output/!is_tty decision lives in one place. 
- envelope error fields carry a clean single line, not the multi-line console text. - README: drop --pretty from the -o examples and note extension-wins. - strip redundant comments and PR-tracking tags. --- README.md | 6 ++-- src/__tests__/commands/scraper.test.ts | 37 +++++++++++----------- src/commands/scraper.ts | 43 +++++++++++++------------- src/types/scraper.ts | 8 +---- 4 files changed, 46 insertions(+), 48 deletions(-) diff --git a/README.md b/README.md index 26fb1b9..31dd9f6 100644 --- a/README.md +++ b/README.md @@ -348,11 +348,13 @@ This makes the documented chain in [recipes.md](https://github.com/brightdata/sk ```bash brightdata scraper create https://example.com/product/1 "..." \ - --pretty -o create.json + -o create.json COLLECTOR_ID=$(jq -r '.collector_id' create.json) brightdata scraper run "$COLLECTOR_ID" https://example.com/product/2 ``` +> The file format follows the `-o` extension, so `.json` is written compact (ideal for `jq`). Use `--pretty` for indented JSON on stdout when you omit `-o`. + Use `--legacy-output` if you have an existing script that depended on the pre-v0.3 bare-progress shape; the flag is supported for one minor version while you migrate. 
**Examples** @@ -365,7 +367,7 @@ brightdata scraper create https://example.com/product/1 \ # Name the scraper and save the envelope to a file brightdata scraper create https://example.com/product/1 \ "Extract title, price, and image URL from this product page" \ - --name my-product-scraper --pretty -o create.json + --name my-product-scraper -o create.json # Capture the collector_id for chaining COLLECTOR_ID=$(jq -r '.collector_id' create.json) diff --git a/src/__tests__/commands/scraper.test.ts b/src/__tests__/commands/scraper.test.ts index 868cbdd..1cb0878 100644 --- a/src/__tests__/commands/scraper.test.ts +++ b/src/__tests__/commands/scraper.test.ts @@ -116,7 +116,7 @@ describe('commands/scraper', ()=>{ expect(extract_progress_status({status: 'done'})).toBe('done'); }); - it('returns sentinel running token for any non-done status', ()=>{ + it('returns sentinel running token for in-progress statuses', ()=>{ expect(extract_progress_status({status: 'running'})) .toBe('__running__'); expect(extract_progress_status({status: 'queued'})) @@ -125,6 +125,16 @@ describe('commands/scraper', ()=>{ .toBe('__running__'); }); + it('returns terminal failure statuses verbatim so polling stops', + ()=>{ + expect(extract_progress_status({status: 'failed'})) + .toBe('failed'); + expect(extract_progress_status({status: 'error'})) + .toBe('error'); + expect(extract_progress_status({status: 'cancelled'})) + .toBe('cancelled'); + }); + it('returns undefined for missing/invalid input', ()=>{ expect(extract_progress_status(null as never)).toBeUndefined(); expect(extract_progress_status({} as never)).toBeUndefined(); @@ -150,10 +160,7 @@ describe('commands/scraper', ()=>{ }); }); - // PR-2: the envelope contract is the whole point of the PR. - // Lock the shape, the failure-path semantics, and the legacy - // escape hatch in one place. 
- describe('build_create_envelope (PR-2)', ()=>{ + describe('build_create_envelope', ()=>{ it('returns the documented success shape', ()=>{ const env = build_create_envelope({ collector_id: 'c_xyz', @@ -195,8 +202,6 @@ describe('commands/scraper', ()=>{ expect(env.status).toBe('ai_trigger_failed'); expect(env.error).toMatch(/parallel/); expect(env.completed_steps).toEqual([]); - // view_url remains useful even on failure so the user - // can inspect the stub collector in the dashboard. expect(env.view_url) .toBe('https://brightdata.com/cp/scrapers/c_xyz'); }); @@ -214,7 +219,7 @@ describe('commands/scraper', ()=>{ }); }); - describe('handle_create_scraper envelope output (PR-2)', ()=>{ + describe('handle_create_scraper envelope output', ()=>{ const setup_success = ()=>{ mocks.post .mockResolvedValueOnce({ @@ -250,7 +255,6 @@ describe('commands/scraper', ()=>{ it('the documented `jq -r .collector_id` recipe works on the ' +'envelope', async()=>{ - // The bug PR-2 is fixing — yesterday this returned `null`. setup_success(); await handle_create_scraper('https://x.com/p', 'd', {output: 'create.json'}); @@ -268,19 +272,19 @@ describe('commands/scraper', ()=>{ ); const written = mocks.print.mock.calls[0][0] as { collector_id?: unknown; status?: string}; - // Bare progress shape today: status + completed_steps, - // NO collector_id, NO view_url. expect(written.collector_id).toBeUndefined(); expect(written).not.toHaveProperty('view_url'); expect(written.status).toBe('done'); }); it('writes the envelope when AI trigger fails (stub-collector ' - +'recovery path)', async()=>{ + +'recovery path), with a single-line error', async()=>{ + // multi-line client error -> envelope keeps the first line. 
mocks.post .mockResolvedValueOnce({id: 'c_stub', name: 'n'}) - .mockRejectedValueOnce( - new Error('Cannot run more than 3 jobs in parallel')); + .mockRejectedValueOnce(new Error( + 'Error: Cannot run more than 3 jobs in parallel\n' + +' Status: 429\n Hint: serialise your launches.')); const exit = vi.spyOn(process, 'exit') .mockImplementation(()=>undefined as never); const error = vi.spyOn(console, 'error') @@ -293,7 +297,7 @@ describe('commands/scraper', ()=>{ expect.objectContaining({ collector_id: 'c_stub', status: 'ai_trigger_failed', - error: expect.stringMatching(/parallel/), + error: 'Cannot run more than 3 jobs in parallel', view_url: 'https://brightdata.com/cp/scrapers/c_stub', }), expect.objectContaining({output: 'create.json'}) @@ -407,9 +411,6 @@ describe('commands/scraper', ()=>{ timeout_label: expect.stringContaining('c_abc'), }) ); - // PR-2: -o now writes an envelope with collector_id, - // not the raw progress payload. The documented - // `jq -r '.collector_id'` recipe depends on this. expect(mocks.print).toHaveBeenCalledWith( expect.objectContaining({ collector_id: 'c_abc', diff --git a/src/commands/scraper.ts b/src/commands/scraper.ts index 73e4963..30237ee 100644 --- a/src/commands/scraper.ts +++ b/src/commands/scraper.ts @@ -24,6 +24,7 @@ const AI_TRIGGER_PATH = 'automate_template'; const AI_PROGRESS_PATH = 'automate_template/progress'; const RUNNING_SENTINEL = '__running__'; const DONE_STATUS = 'done'; +const TERMINAL_FAIL_STATUSES = ['failed', 'error', 'cancelled']; const TRIGGER_IMMEDIATE_ENDPOINT = '/dca/trigger_immediate'; const GET_RESULT_ENDPOINT = '/dca/get_result'; const SYNC_CRAWL_ENDPOINT = '/dca/crawl'; @@ -64,8 +65,12 @@ const extract_progress_status = ( return undefined; if (typeof result.status != 'string') return undefined; - if (result.status == DONE_STATUS) - return DONE_STATUS; + // terminal statuses stop polling; non-done ones route to failure. 
+ if (result.status == DONE_STATUS + || TERMINAL_FAIL_STATUSES.includes(result.status)) + { + return result.status; + } return RUNNING_SENTINEL; }; @@ -84,10 +89,9 @@ const format_create_summary = ( return lines.join('\n'); }; -// PR-2: every termination path of `scraper create` writes this same -// envelope shape to -o. Solves the broken `jq -r '.collector_id'` -// recipe in references/recipes.md (today's -o file contains only the -// final progress payload, with no id field). +const clean_error_message = (msg: string): string=> + msg.split('\n')[0].replace(/^Error:\s*/, '').trim(); + const build_create_envelope = (params: { collector_id: string; name: string; @@ -105,20 +109,22 @@ const build_create_envelope = (params: { ...(params.error ? {error: params.error} : {}), }); -// Write the envelope (or, in --legacy-output mode, the bare progress -// payload) to wherever the user asked. Centralised so success and -// every failure path share one I/O code path. +const wants_machine_output = (opts: Scraper_create_opts): boolean=> + !!(opts.json || opts.pretty || opts.output) || !is_tty; + const emit_create_output = ( envelope: Create_envelope, progress: Ai_progress_response|null, opts: Scraper_create_opts -): void=>{ +): boolean=>{ + if (!wants_machine_output(opts)) + return false; const print_opts = {json: opts.json, pretty: opts.pretty, output: opts.output}; const payload = opts.legacyOutput && progress ? (progress as unknown) : envelope; - if (opts.json || opts.pretty || opts.output || !is_tty) - print(payload, print_opts); + print(payload, print_opts); + return true; }; const handle_create_scraper = async( @@ -149,8 +155,6 @@ const handle_create_scraper = async( create_spinner.stop(); if (!template.id) { - // Template POST didn't return an id — no collector_id to - // envelope, so no -o file to write. Same as today. 
fail('Failed to create scraper template (missing id).'); return; } @@ -181,15 +185,13 @@ const handle_create_scraper = async( `Failed to start AI generation for collector ` +`${collector_id}: ${msg}` ); - // PR-2: write the envelope even on failure so the user's - // automation can read collector_id + status from the file. emit_create_output( build_create_envelope({ collector_id, name: scraper_name, status: 'ai_trigger_failed', created_at, - error: msg, + error: clean_error_message(msg), }), null, opts @@ -243,8 +245,7 @@ const handle_create_scraper = async( process.exit(1); return; } - // Success path. - emit_create_output( + const emitted = emit_create_output( build_create_envelope({ collector_id, name: scraper_name, @@ -255,7 +256,7 @@ const handle_create_scraper = async( progress, opts ); - if (opts.json || opts.pretty || opts.output || !is_tty) + if (emitted) return; success(format_create_summary( collector_id, scraper_name, progress)); @@ -271,7 +272,7 @@ const handle_create_scraper = async( name: scraper_name, status: 'poll_failed', created_at, - error: msg, + error: clean_error_message(msg), }), null, opts diff --git a/src/types/scraper.ts b/src/types/scraper.ts index 8ddabe9..3d255ac 100644 --- a/src/types/scraper.ts +++ b/src/types/scraper.ts @@ -45,15 +45,9 @@ type Scraper_create_opts = { pretty?: boolean; timing?: boolean; apiKey?: string; - // PR-2: when true, write the bare AI-progress payload to -o - // (today's shape) instead of the new envelope. One-version - // migration flag. - legacyOutput?: boolean; + legacyOutput?: boolean; // emit the pre-v0.3 bare payload to -o }; -// PR-2: machine-readable envelope written to -o on every termination -// path of `scraper create`. Replaces the previous bare-progress -// payload so the documented `jq -r '.collector_id'` recipe works. type Create_envelope = { collector_id: string; name: string;