diff --git a/README.md b/README.md index 61333d2..31dd9f6 100644 --- a/README.md +++ b/README.md @@ -319,13 +319,44 @@ brightdata scraper create [options] | `--name ` | Scraper template name (default: `cli-scraper-`) | | `--deliver-webhook ` | Webhook URL for the deliver stub (default: `https://example.com/webhook`) | | `--timeout ` | Polling timeout in seconds (default: `600`) | -| `-o, --output ` | Write output to file | +| `-o, --output ` | Write the JSON envelope to a file (see below) | | `--json` / `--pretty` | JSON output (raw / indented) | +| `--legacy-output` | Write the pre-v0.3 bare AI-progress payload to `-o` instead of the envelope. Migration only. | | `--timing` | Show request timing | | `-k, --api-key ` | Override API key | > **Note:** The scraper is created with a placeholder webhook delivery target (`https://example.com/webhook`). You can reconfigure the actual delivery endpoint in the [Bright Data web UI](https://brightdata.com/cp/scrapers) after creation. +#### Output envelope (`-o create.json`) + +Every termination path — success or failure — writes the same JSON envelope shape: + +```json +{ + "collector_id": "c_mp7x8a9b2c0d1e2f", + "name": "my-product-scraper", + "status": "done", + "completed_steps": ["prepare_intent_analyzer", "planner", "..."], + "view_url": "https://brightdata.com/cp/scrapers/c_mp7x8a9b2c0d1e2f", + "created_at": "2026-05-18T07:28:30Z" +} +``` + +On failure paths the envelope adds an `error` field and the `status` reflects the failure category (`ai_trigger_failed`, `failed`, `poll_failed`). The `collector_id` and `view_url` are still present so you can recover or inspect the half-built scraper. + +This makes the documented chain in [recipes.md](https://github.com/brightdata/skills/blob/main/skills/scraper-studio/references/recipes.md) work as written: + +```bash +brightdata scraper create https://example.com/product/1 "..." \ + -o create.json +COLLECTOR_ID=$(jq -r '.collector_id' create.json) +brightdata scraper run "$COLLECTOR_ID" https://example.com/product/2 +``` + +> The file format follows the `-o` extension, so `.json` is written compact (ideal for `jq`). Use `--pretty` for indented JSON on stdout when you omit `-o`. + +Use `--legacy-output` if you have an existing script that depended on the pre-v0.3 bare-progress shape; the flag is supported for one minor version while you migrate. + **Examples** ```bash @@ -333,10 +364,13 @@ brightdata scraper create [options] brightdata scraper create https://example.com/product/1 \ "Extract title, price, and image URL from this product page" -# Name the scraper and save the full AI output to a file +# Name the scraper and save the envelope to a file brightdata scraper create https://example.com/product/1 \ "Extract title, price, and image URL from this product page" \ - --name my-product-scraper --pretty -o scraper-output.json + --name my-product-scraper -o create.json + +# Capture the collector_id for chaining +COLLECTOR_ID=$(jq -r '.collector_id' create.json) # Use a custom webhook delivery URL brightdata scraper create https://example.com/product/1 \ diff --git a/src/__tests__/commands/scraper.test.ts b/src/__tests__/commands/scraper.test.ts index 39e7e0c..1cb0878 100644 --- a/src/__tests__/commands/scraper.test.ts +++ b/src/__tests__/commands/scraper.test.ts @@ -50,6 +50,7 @@ import { extract_progress_status, format_create_summary, handle_create_scraper, + build_create_envelope, handle_run_scraper, build_run_request, build_run_query, @@ -115,7 +116,7 @@ describe('commands/scraper', ()=>{ expect(extract_progress_status({status: 'done'})).toBe('done'); }); - it('returns sentinel running token for any non-done status', ()=>{ + it('returns sentinel running token for in-progress statuses', ()=>{ expect(extract_progress_status({status: 'running'})) .toBe('__running__'); expect(extract_progress_status({status: 'queued'})) @@ -124,6 +125,16 @@ describe('commands/scraper', ()=>{ .toBe('__running__'); }); + it('returns terminal failure statuses verbatim so polling stops', + ()=>{ + expect(extract_progress_status({status: 'failed'})) + .toBe('failed'); + expect(extract_progress_status({status: 'error'})) + .toBe('error'); + expect(extract_progress_status({status: 'cancelled'})) + .toBe('cancelled'); + }); + it('returns undefined for missing/invalid input', ()=>{ expect(extract_progress_status(null as never)).toBeUndefined(); expect(extract_progress_status({} as never)).toBeUndefined(); @@ -149,6 +160,212 @@ describe('commands/scraper', ()=>{ }); }); + describe('build_create_envelope', ()=>{ + it('returns the documented success shape', ()=>{ + const env = build_create_envelope({ + collector_id: 'c_xyz', + name: 'product-v1', + status: 'done', + progress: {status: 'done', + completed_steps: ['a', 'b', 'c']}, + created_at: '2026-05-18T07:28:30Z', + }); + expect(env).toEqual({ + collector_id: 'c_xyz', + name: 'product-v1', + status: 'done', + completed_steps: ['a', 'b', 'c'], + view_url: 'https://brightdata.com/cp/scrapers/c_xyz', + created_at: '2026-05-18T07:28:30Z', + }); + }); + + it('omits created_at when not known', ()=>{ + const env = build_create_envelope({ + collector_id: 'c_xyz', + name: 'n', + status: 'done', + progress: {status: 'done', completed_steps: []}, + }); + expect(env).not.toHaveProperty('created_at'); + }); + + it('records the error message and partial steps on failure', + ()=>{ + const env = build_create_envelope({ + collector_id: 'c_xyz', + name: 'n', + status: 'ai_trigger_failed', + error: 'Cannot run more than 3 jobs in parallel', + }); + expect(env.collector_id).toBe('c_xyz'); + expect(env.status).toBe('ai_trigger_failed'); + expect(env.error).toMatch(/parallel/); + expect(env.completed_steps).toEqual([]); + expect(env.view_url) + .toBe('https://brightdata.com/cp/scrapers/c_xyz'); + }); + + it('still includes view_url on every termination path', ()=>{ + for (const status of ['done', 'failed', 'ai_trigger_failed', + 'poll_failed']) + { + const env = build_create_envelope({ + collector_id: 'c_xyz', name: 'n', status, + }); + expect(env.view_url) + .toBe('https://brightdata.com/cp/scrapers/c_xyz'); + } + }); + }); + + describe('handle_create_scraper envelope output', ()=>{ + const setup_success = ()=>{ + mocks.post + .mockResolvedValueOnce({ + id: 'c_xyz', name: 'product-v1', + created: '2026-05-18T07:28:30Z', + }) + .mockResolvedValueOnce({id: 'ia_xyz', queued: false}); + mocks.poll_until.mockResolvedValue({ + result: {status: 'done', + completed_steps: ['a', 'b', 'c']}, + attempts: 4, + }); + }; + + it('writes the new envelope to -o on success', async()=>{ + setup_success(); + await handle_create_scraper( + 'https://x.com/p', 'd', + {output: 'create.json', pretty: true} + ); + expect(mocks.print).toHaveBeenCalledWith( + expect.objectContaining({ + collector_id: 'c_xyz', + name: 'product-v1', + status: 'done', + completed_steps: ['a', 'b', 'c'], + view_url: 'https://brightdata.com/cp/scrapers/c_xyz', + created_at: '2026-05-18T07:28:30Z', + }), + expect.objectContaining({output: 'create.json'}) + ); + }); + + it('the documented `jq -r .collector_id` recipe works on the ' + +'envelope', async()=>{ + setup_success(); + await handle_create_scraper('https://x.com/p', 'd', + {output: 'create.json'}); + const written = mocks.print.mock.calls[0][0] as { + collector_id?: string}; + expect(written.collector_id).toBe('c_xyz'); + }); + + it('--legacy-output preserves the bare progress payload', + async()=>{ + setup_success(); + await handle_create_scraper( + 'https://x.com/p', 'd', + {output: 'create.json', legacyOutput: true} + ); + const written = mocks.print.mock.calls[0][0] as { + collector_id?: unknown; status?: string}; + expect(written.collector_id).toBeUndefined(); + expect(written).not.toHaveProperty('view_url'); + expect(written.status).toBe('done'); + }); + + it('writes the envelope when AI trigger fails (stub-collector ' + +'recovery path), with a single-line error', async()=>{ + // multi-line client error -> envelope keeps the first line. + mocks.post + .mockResolvedValueOnce({id: 'c_stub', name: 'n'}) + .mockRejectedValueOnce(new Error( + 'Error: Cannot run more than 3 jobs in parallel\n' + +' Status: 429\n Hint: serialise your launches.')); + const exit = vi.spyOn(process, 'exit') + .mockImplementation(()=>undefined as never); + const error = vi.spyOn(console, 'error') + .mockImplementation(()=>{}); + await handle_create_scraper( + 'https://x.com/p', 'd', + {output: 'create.json'} + ); + expect(mocks.print).toHaveBeenCalledWith( + expect.objectContaining({ + collector_id: 'c_stub', + status: 'ai_trigger_failed', + error: 'Cannot run more than 3 jobs in parallel', + view_url: 'https://brightdata.com/cp/scrapers/c_stub', + }), + expect.objectContaining({output: 'create.json'}) + ); + exit.mockRestore(); + error.mockRestore(); + }); + + it('writes the envelope when poll returns status != done', + async()=>{ + mocks.post + .mockResolvedValueOnce({id: 'c_abc', name: 'n'}) + .mockResolvedValueOnce({id: 'ia_xyz', queued: false}); + mocks.poll_until.mockResolvedValue({ + result: {status: 'failed', + completed_steps: ['planner']}, + attempts: 2, + }); + const exit = vi.spyOn(process, 'exit') + .mockImplementation(()=>undefined as never); + const error = vi.spyOn(console, 'error') + .mockImplementation(()=>{}); + await handle_create_scraper( + 'https://x.com/p', 'd', + {output: 'create.json'} + ); + expect(mocks.print).toHaveBeenCalledWith( + expect.objectContaining({ + collector_id: 'c_abc', + status: 'failed', + completed_steps: ['planner'], + error: expect.stringMatching(/finished with status/), + }), + expect.objectContaining({output: 'create.json'}) + ); + exit.mockRestore(); + error.mockRestore(); + }); + + it('writes the envelope when polling itself throws (timeout ' + +'or network)', async()=>{ + mocks.post + .mockResolvedValueOnce({id: 'c_abc', name: 'n'}) + .mockResolvedValueOnce({id: 'ia_xyz', queued: false}); + mocks.poll_until.mockRejectedValue( + new Error( + 'Timeout after 600 seconds waiting for AI generation')); + const exit = vi.spyOn(process, 'exit') + .mockImplementation(()=>undefined as never); + const error = vi.spyOn(console, 'error') + .mockImplementation(()=>{}); + await handle_create_scraper( + 'https://x.com/p', 'd', + {output: 'create.json'} + ); + expect(mocks.print).toHaveBeenCalledWith( + expect.objectContaining({ + collector_id: 'c_abc', + status: 'poll_failed', + error: expect.stringMatching(/Timeout/), + }), + expect.objectContaining({output: 'create.json'}) + ); + exit.mockRestore(); + error.mockRestore(); + }); + }); + describe('handle_create_scraper', ()=>{ it('chains create → trigger → poll and prints JSON in non-TTY', async()=>{ @@ -195,7 +412,13 @@ describe('commands/scraper', ()=>{ }) ); expect(mocks.print).toHaveBeenCalledWith( - progress, + expect.objectContaining({ + collector_id: 'c_abc', + name: 'cli-scraper-1', + status: 'done', + completed_steps: ['a', 'b'], + view_url: 'https://brightdata.com/cp/scrapers/c_abc', + }), {json: undefined, pretty: undefined, output: undefined} ); }); @@ -209,7 +432,10 @@ describe('commands/scraper', ()=>{ result: progress, attempts: 1}); await handle_create_scraper('https://x.com', 'd', {json: true}); expect(mocks.print).toHaveBeenCalledWith( - progress, + expect.objectContaining({ + collector_id: 'c_abc', + status: 'done', + }), {json: true, pretty: undefined, output: undefined} ); }); diff --git a/src/commands/scraper.ts b/src/commands/scraper.ts index 9ab4400..30237ee 100644 --- a/src/commands/scraper.ts +++ b/src/commands/scraper.ts @@ -12,6 +12,7 @@ import type { Trigger_ai_response, Ai_progress_response, Scraper_create_opts, + Create_envelope, Run_request, Trigger_immediate_response, Scraper_run_opts, @@ -23,6 +24,7 @@ const AI_TRIGGER_PATH = 'automate_template'; const AI_PROGRESS_PATH = 'automate_template/progress'; const RUNNING_SENTINEL = '__running__'; const DONE_STATUS = 'done'; +const TERMINAL_FAIL_STATUSES = ['failed', 'error', 'cancelled']; const TRIGGER_IMMEDIATE_ENDPOINT = '/dca/trigger_immediate'; const GET_RESULT_ENDPOINT = '/dca/get_result'; const SYNC_CRAWL_ENDPOINT = '/dca/crawl'; @@ -63,8 +65,12 @@ const extract_progress_status = ( return undefined; if (typeof result.status != 'string') return undefined; - if (result.status == DONE_STATUS) - return DONE_STATUS; + // terminal statuses stop polling; non-done ones route to failure. + if (result.status == DONE_STATUS + || TERMINAL_FAIL_STATUSES.includes(result.status)) + { + return result.status; + } return RUNNING_SENTINEL; }; @@ -83,6 +89,44 @@ const format_create_summary = ( return lines.join('\n'); }; +const clean_error_message = (msg: string): string=> + msg.split('\n')[0].replace(/^Error:\s*/, '').trim(); + +const build_create_envelope = (params: { + collector_id: string; + name: string; + status: string; + progress?: Ai_progress_response; + created_at?: string; + error?: string; +}): Create_envelope=>({ + collector_id: params.collector_id, + name: params.name, + status: params.status, + completed_steps: params.progress?.completed_steps ?? [], + view_url: `https://brightdata.com/cp/scrapers/${params.collector_id}`, + ...(params.created_at ? {created_at: params.created_at} : {}), + ...(params.error ? {error: params.error} : {}), +}); + +const wants_machine_output = (opts: Scraper_create_opts): boolean=> + !!(opts.json || opts.pretty || opts.output) || !is_tty; + +const emit_create_output = ( + envelope: Create_envelope, + progress: Ai_progress_response|null, + opts: Scraper_create_opts +): boolean=>{ + if (!wants_machine_output(opts)) + return false; + const print_opts = {json: opts.json, pretty: opts.pretty, + output: opts.output}; + const payload = opts.legacyOutput && progress + ? (progress as unknown) : envelope; + print(payload, print_opts); + return true; +}; + const handle_create_scraper = async( url: string, description: string, @@ -100,6 +144,7 @@ const handle_create_scraper = async( const create_spinner = start_spinner('Creating scraper template...'); let collector_id = ''; let scraper_name = template_body.name; + let created_at: string|undefined; try { const template = await post( api_key, @@ -115,6 +160,7 @@ const handle_create_scraper = async( } collector_id = template.id; scraper_name = template.name ?? template_body.name; + created_at = template.created; console.error(dim(`Template created: ${collector_id}`)); } catch(e) { create_spinner.stop(); @@ -134,9 +180,21 @@ const handle_create_scraper = async( trigger_spinner.stop(); } catch(e) { trigger_spinner.stop(); + const msg = (e as Error).message; console.error( `Failed to start AI generation for collector ` - +`${collector_id}: ${(e as Error).message}` + +`${collector_id}: ${msg}` + ); + emit_create_output( + build_create_envelope({ + collector_id, + name: scraper_name, + status: 'ai_trigger_failed', + created_at, + error: clean_error_message(msg), + }), + null, + opts ); process.exit(1); return; @@ -171,16 +229,35 @@ const handle_create_scraper = async( `AI generation failed (collector ${collector_id}, ` +`status: ${progress.status}).` ); + emit_create_output( + build_create_envelope({ + collector_id, + name: scraper_name, + status: progress.status, + progress, + created_at, + error: `AI generation finished with status ` + +`"${progress.status}".`, + }), + progress, + opts + ); process.exit(1); return; } - const print_opts = {json: opts.json, pretty: opts.pretty, - output: opts.output}; - if (opts.json || opts.pretty || opts.output || !is_tty) - { - print(progress, print_opts); + const emitted = emit_create_output( + build_create_envelope({ + collector_id, + name: scraper_name, + status: progress.status, + progress, + created_at, + }), + progress, + opts + ); + if (emitted) return; - } success(format_create_summary( collector_id, scraper_name, progress)); } catch(e) { @@ -189,6 +266,17 @@ const handle_create_scraper = async( const suffix = msg.includes(collector_id) ? '' : ` (collector ${collector_id})`; console.error(`${msg}${suffix}`); + emit_create_output( + build_create_envelope({ + collector_id, + name: scraper_name, + status: 'poll_failed', + created_at, + error: clean_error_message(msg), + }), + null, + opts + ); process.exit(1); return; } @@ -547,6 +635,10 @@ const create_subcommand = new Command('create') .option('-o, --output ', 'Write output to file') .option('--json', 'Force JSON output') .option('--pretty', 'Pretty-print JSON output') + .option('--legacy-output', + 'Emit the bare AI-progress payload (pre-v0.3 shape) instead ' + +'of the new {collector_id, name, status, ...} envelope. ' + +'For one-version migration only.') .option('--timing', 'Show request timing') .option('-k, --api-key ', 'Override API key') .action(handle_create_scraper); @@ -584,6 +676,8 @@ export { build_ai_request, extract_progress_status, format_create_summary, + build_create_envelope, + emit_create_output, handle_run_scraper, build_run_request, build_run_query, diff --git a/src/types/scraper.ts b/src/types/scraper.ts index 74dbbcf..3d255ac 100644 --- a/src/types/scraper.ts +++ b/src/types/scraper.ts @@ -45,6 +45,17 @@ type Scraper_create_opts = { pretty?: boolean; timing?: boolean; apiKey?: string; + legacyOutput?: boolean; // emit the pre-v0.3 bare payload to -o +}; + +type Create_envelope = { + collector_id: string; + name: string; + status: string; + completed_steps: string[]; + view_url: string; + created_at?: string; + error?: string; }; type Run_request = { @@ -92,6 +103,7 @@ export type { Trigger_ai_response, Ai_progress_response, Scraper_create_opts, + Create_envelope, Run_request, Trigger_immediate_response, Sync_timeout_response,