diff --git a/apps/docs/components/icons.tsx b/apps/docs/components/icons.tsx index bde3013c6a2..d7ae05105db 100644 --- a/apps/docs/components/icons.tsx +++ b/apps/docs/components/icons.tsx @@ -2087,6 +2087,21 @@ export function BrandfetchIcon(props: SVGProps) { ) } +export function BrightDataIcon(props: SVGProps) { + return ( + + + + + ) +} + export function BrowserUseIcon(props: SVGProps) { return ( = { attio: AttioIcon, box: BoxCompanyIcon, brandfetch: BrandfetchIcon, + brightdata: BrightDataIcon, browser_use: BrowserUseIcon, calcom: CalComIcon, calendly: CalendlyIcon, diff --git a/apps/docs/content/docs/en/tools/brightdata.mdx b/apps/docs/content/docs/en/tools/brightdata.mdx new file mode 100644 index 00000000000..65f8327e862 --- /dev/null +++ b/apps/docs/content/docs/en/tools/brightdata.mdx @@ -0,0 +1,201 @@ +--- +title: Bright Data +description: Scrape websites, search engines, and extract structured data +--- + +import { BlockInfoCard } from "@/components/ui/block-info-card" + + + +## Usage Instructions + +Integrate Bright Data into the workflow. Scrape any URL with Web Unlocker, search Google and other engines with SERP API, discover web content ranked by intent, or trigger pre-built scrapers for structured data extraction. + + + +## Tools + +### `brightdata_scrape_url` + +Fetch content from any URL using Bright Data Web Unlocker. Bypasses anti-bot protections, CAPTCHAs, and IP blocks automatically. + +#### Input + +| Parameter | Type | Required | Description | +| --------- | ---- | -------- | ----------- | +| `apiKey` | string | Yes | Bright Data API token | +| `zone` | string | Yes | Web Unlocker zone name from your Bright Data dashboard \(e.g., "web_unlocker1"\) | +| `url` | string | Yes | The URL to scrape \(e.g., "https://example.com/page"\) | +| `format` | string | No | Response format: "raw" for HTML or "json" for parsed content. 
Defaults to "raw" | +| `country` | string | No | Two-letter country code for geo-targeting \(e.g., "us", "gb"\) | + +#### Output + +| Parameter | Type | Description | +| --------- | ---- | ----------- | +| `content` | string | The scraped page content \(HTML or JSON depending on format\) | +| `url` | string | The URL that was scraped | +| `statusCode` | number | HTTP status code of the response | + +### `brightdata_serp_search` + +Search Google, Bing, DuckDuckGo, or Yandex and get structured search results using Bright Data SERP API. + +#### Input + +| Parameter | Type | Required | Description | +| --------- | ---- | -------- | ----------- | +| `apiKey` | string | Yes | Bright Data API token | +| `zone` | string | Yes | SERP API zone name from your Bright Data dashboard \(e.g., "serp_api1"\) | +| `query` | string | Yes | The search query \(e.g., "best project management tools"\) | +| `searchEngine` | string | No | Search engine to use: "google", "bing", "duckduckgo", or "yandex". Defaults to "google" | +| `country` | string | No | Two-letter country code for localized results \(e.g., "us", "gb"\) | +| `language` | string | No | Two-letter language code \(e.g., "en", "es"\) | +| `numResults` | number | No | Number of results to return \(e.g., 10, 20\). Defaults to 10 | + +#### Output + +| Parameter | Type | Description | +| --------- | ---- | ----------- | +| `results` | array | Array of search results | +| ↳ `title` | string | Title of the search result | +| ↳ `url` | string | URL of the search result | +| ↳ `description` | string | Snippet or description of the result | +| ↳ `rank` | number | Position in search results | +| `query` | string | The search query that was executed | +| `searchEngine` | string | The search engine that was used | + +### `brightdata_discover` + +AI-powered web discovery that finds and ranks results by intent. Returns up to 1,000 results with optional cleaned page content for RAG and verification. 
+ +#### Input + +| Parameter | Type | Required | Description | +| --------- | ---- | -------- | ----------- | +| `apiKey` | string | Yes | Bright Data API token | +| `query` | string | Yes | The search query \(e.g., "competitor pricing changes enterprise plan"\) | +| `numResults` | number | No | Number of results to return, up to 1000. Defaults to 10 | +| `intent` | string | No | Describes what the agent is trying to accomplish, used to rank results by relevance \(e.g., "find official pricing pages and change notes"\) | +| `includeContent` | boolean | No | Whether to include cleaned page content in results | +| `format` | string | No | Response format: "json" or "markdown". Defaults to "json" | +| `language` | string | No | Search language code \(e.g., "en", "es", "fr"\). Defaults to "en" | +| `country` | string | No | Two-letter ISO country code for localized results \(e.g., "us", "gb"\) | + +#### Output + +| Parameter | Type | Description | +| --------- | ---- | ----------- | +| `results` | array | Array of discovered web results ranked by intent relevance | +| ↳ `url` | string | URL of the discovered page | +| ↳ `title` | string | Page title | +| ↳ `description` | string | Page description or snippet | +| ↳ `relevanceScore` | number | AI-calculated relevance score for intent-based ranking | +| ↳ `content` | string | Cleaned page content in the requested format \(when includeContent is true\) | +| `query` | string | The search query that was executed | +| `totalResults` | number | Total number of results returned | + +### `brightdata_sync_scrape` + +Scrape URLs synchronously using a Bright Data pre-built scraper and get structured results directly. Supports up to 20 URLs with a 1-minute timeout. 
+ +#### Input + +| Parameter | Type | Required | Description | +| --------- | ---- | -------- | ----------- | +| `apiKey` | string | Yes | Bright Data API token | +| `datasetId` | string | Yes | Dataset scraper ID from your Bright Data dashboard \(e.g., "gd_l1viktl72bvl7bjuj0"\) | +| `urls` | string | Yes | JSON array of URL objects to scrape, up to 20 \(e.g., \[\{"url": "https://example.com/product"\}\]\) | +| `format` | string | No | Output format: "json", "ndjson", or "csv". Defaults to "json" | +| `includeErrors` | boolean | No | Whether to include error reports in results | + +#### Output + +| Parameter | Type | Description | +| --------- | ---- | ----------- | +| `data` | array | Array of scraped result objects with fields specific to the dataset scraper used | +| `snapshotId` | string | Snapshot ID returned if the request exceeded the 1-minute timeout and switched to async processing | +| `isAsync` | boolean | Whether the request fell back to async mode \(true means use snapshot ID to retrieve results\) | + +### `brightdata_scrape_dataset` + +Trigger a Bright Data pre-built scraper to extract structured data from URLs. Supports 660+ scrapers for platforms like Amazon, LinkedIn, Instagram, and more. + +#### Input + +| Parameter | Type | Required | Description | +| --------- | ---- | -------- | ----------- | +| `apiKey` | string | Yes | Bright Data API token | +| `datasetId` | string | Yes | Dataset scraper ID from your Bright Data dashboard \(e.g., "gd_l1viktl72bvl7bjuj0"\) | +| `urls` | string | Yes | JSON array of URL objects to scrape \(e.g., \[\{"url": "https://example.com/product"\}\]\) | +| `format` | string | No | Output format: "json" or "csv". 
Defaults to "json" | + +#### Output + +| Parameter | Type | Description | +| --------- | ---- | ----------- | +| `snapshotId` | string | The snapshot ID to retrieve results later | +| `status` | string | Status of the scraping job \(e.g., "triggered", "running"\) | + +### `brightdata_snapshot_status` + +Check the progress of an async Bright Data scraping job. Returns status: starting, running, ready, or failed. + +#### Input + +| Parameter | Type | Required | Description | +| --------- | ---- | -------- | ----------- | +| `apiKey` | string | Yes | Bright Data API token | +| `snapshotId` | string | Yes | The snapshot ID returned when the collection was triggered \(e.g., "s_m4x7enmven8djfqak"\) | + +#### Output + +| Parameter | Type | Description | +| --------- | ---- | ----------- | +| `snapshotId` | string | The snapshot ID that was queried | +| `datasetId` | string | The dataset ID associated with this snapshot | +| `status` | string | Current status of the snapshot: "starting", "running", "ready", or "failed" | + +### `brightdata_download_snapshot` + +Download the results of a completed Bright Data scraping job using its snapshot ID. The snapshot must have ready status. + +#### Input + +| Parameter | Type | Required | Description | +| --------- | ---- | -------- | ----------- | +| `apiKey` | string | Yes | Bright Data API token | +| `snapshotId` | string | Yes | The snapshot ID returned when the collection was triggered \(e.g., "s_m4x7enmven8djfqak"\) | +| `format` | string | No | Output format: "json", "ndjson", "jsonl", or "csv". 
Defaults to "json" | +| `compress` | boolean | No | Whether to compress the results | + +#### Output + +| Parameter | Type | Description | +| --------- | ---- | ----------- | +| `data` | array | Array of scraped result records | +| `format` | string | The content type of the downloaded data | +| `snapshotId` | string | The snapshot ID that was downloaded | + +### `brightdata_cancel_snapshot` + +Cancel an active Bright Data scraping job using its snapshot ID. Terminates data collection in progress. + +#### Input + +| Parameter | Type | Required | Description | +| --------- | ---- | -------- | ----------- | +| `apiKey` | string | Yes | Bright Data API token | +| `snapshotId` | string | Yes | The snapshot ID of the collection to cancel \(e.g., "s_m4x7enmven8djfqak"\) | + +#### Output + +| Parameter | Type | Description | +| --------- | ---- | ----------- | +| `snapshotId` | string | The snapshot ID that was cancelled | +| `cancelled` | boolean | Whether the cancellation was successful | + + diff --git a/apps/docs/content/docs/en/tools/meta.json b/apps/docs/content/docs/en/tools/meta.json index d11a49bfd71..6f77170985e 100644 --- a/apps/docs/content/docs/en/tools/meta.json +++ b/apps/docs/content/docs/en/tools/meta.json @@ -18,6 +18,7 @@ "attio", "box", "brandfetch", + "brightdata", "browser_use", "calcom", "calendly", diff --git a/apps/sim/app/(landing)/integrations/data/icon-mapping.ts b/apps/sim/app/(landing)/integrations/data/icon-mapping.ts index 1eb1d47659e..0dec44c268f 100644 --- a/apps/sim/app/(landing)/integrations/data/icon-mapping.ts +++ b/apps/sim/app/(landing)/integrations/data/icon-mapping.ts @@ -23,6 +23,7 @@ import { BoxCompanyIcon, BrainIcon, BrandfetchIcon, + BrightDataIcon, BrowserUseIcon, CalComIcon, CalendlyIcon, @@ -215,6 +216,7 @@ export const blockTypeToIconMap: Record = { attio: AttioIcon, box: BoxCompanyIcon, brandfetch: BrandfetchIcon, + brightdata: BrightDataIcon, browser_use: BrowserUseIcon, calcom: CalComIcon, calendly: CalendlyIcon, diff 
--git a/apps/sim/app/(landing)/integrations/data/integrations.json b/apps/sim/app/(landing)/integrations/data/integrations.json index ec437abe8db..7a2f6cd107f 100644 --- a/apps/sim/app/(landing)/integrations/data/integrations.json +++ b/apps/sim/app/(landing)/integrations/data/integrations.json @@ -214,7 +214,7 @@ "name": "Agiloft", "description": "Manage records in Agiloft CLM", "longDescription": "Integrate with Agiloft contract lifecycle management to create, read, update, delete, and search records. Supports file attachments, SQL-based selection, saved searches, and record locking across any table in your knowledge base.", - "bgColor": "#263A5C", + "bgColor": "#FFFFFF", "iconName": "AgiloftIcon", "docsUrl": "https://docs.sim.ai/tools/agiloft", "operations": [ @@ -1743,6 +1743,57 @@ "integrationTypes": ["sales", "analytics"], "tags": ["enrichment", "marketing"] }, + { + "type": "brightdata", + "slug": "bright-data", + "name": "Bright Data", + "description": "Scrape websites, search engines, and extract structured data", + "longDescription": "Integrate Bright Data into the workflow. Scrape any URL with Web Unlocker, search Google and other engines with SERP API, discover web content ranked by intent, or trigger pre-built scrapers for structured data extraction.", + "bgColor": "#FFFFFF", + "iconName": "BrightDataIcon", + "docsUrl": "https://docs.sim.ai/tools/brightdata", + "operations": [ + { + "name": "Scrape URL", + "description": "Fetch content from any URL using Bright Data Web Unlocker. Bypasses anti-bot protections, CAPTCHAs, and IP blocks automatically." + }, + { + "name": "SERP Search", + "description": "Search Google, Bing, DuckDuckGo, or Yandex and get structured search results using Bright Data SERP API." + }, + { + "name": "Discover", + "description": "AI-powered web discovery that finds and ranks results by intent. Returns up to 1,000 results with optional cleaned page content for RAG and verification." 
+ }, + { + "name": "Sync Scrape", + "description": "Scrape URLs synchronously using a Bright Data pre-built scraper and get structured results directly. Supports up to 20 URLs with a 1-minute timeout." + }, + { + "name": "Scrape Dataset", + "description": "Trigger a Bright Data pre-built scraper to extract structured data from URLs. Supports 660+ scrapers for platforms like Amazon, LinkedIn, Instagram, and more." + }, + { + "name": "Snapshot Status", + "description": "Check the progress of an async Bright Data scraping job. Returns status: starting, running, ready, or failed." + }, + { + "name": "Download Snapshot", + "description": "Download the results of a completed Bright Data scraping job using its snapshot ID. The snapshot must have ready status." + }, + { + "name": "Cancel Snapshot", + "description": "Cancel an active Bright Data scraping job using its snapshot ID. Terminates data collection in progress." + } + ], + "operationCount": 8, + "triggers": [], + "triggerCount": 0, + "authType": "api-key", + "category": "tools", + "integrationTypes": ["search", "developer-tools"], + "tags": ["web-scraping", "automation"] + }, { "type": "browser_use", "slug": "browser-use", diff --git a/apps/sim/blocks/blocks/agiloft.ts b/apps/sim/blocks/blocks/agiloft.ts index 35af080fb97..36e571dad99 100644 --- a/apps/sim/blocks/blocks/agiloft.ts +++ b/apps/sim/blocks/blocks/agiloft.ts @@ -13,7 +13,7 @@ export const AgiloftBlock: BlockConfig = { category: 'tools', integrationType: IntegrationType.Productivity, tags: ['automation'], - bgColor: '#263A5C', + bgColor: '#FFFFFF', icon: AgiloftIcon, authMode: AuthMode.ApiKey, diff --git a/apps/sim/blocks/blocks/brightdata.ts b/apps/sim/blocks/blocks/brightdata.ts new file mode 100644 index 00000000000..ffc0dc1c385 --- /dev/null +++ b/apps/sim/blocks/blocks/brightdata.ts @@ -0,0 +1,346 @@ +import { BrightDataIcon } from '@/components/icons' +import type { BlockConfig } from '@/blocks/types' +import { AuthMode, IntegrationType } from 
'@/blocks/types' +import type { BrightDataResponse } from '@/tools/brightdata/types' + +export const BrightDataBlock: BlockConfig = { + type: 'brightdata', + name: 'Bright Data', + description: 'Scrape websites, search engines, and extract structured data', + authMode: AuthMode.ApiKey, + longDescription: + 'Integrate Bright Data into the workflow. Scrape any URL with Web Unlocker, search Google and other engines with SERP API, discover web content ranked by intent, or trigger pre-built scrapers for structured data extraction.', + docsLink: 'https://docs.sim.ai/tools/brightdata', + category: 'tools', + integrationType: IntegrationType.Search, + tags: ['web-scraping', 'automation'], + bgColor: '#FFFFFF', + icon: BrightDataIcon, + subBlocks: [ + { + id: 'operation', + title: 'Operation', + type: 'dropdown', + options: [ + { label: 'Scrape URL', id: 'scrape_url' }, + { label: 'SERP Search', id: 'serp_search' }, + { label: 'Discover', id: 'discover' }, + { label: 'Sync Scrape', id: 'sync_scrape' }, + { label: 'Scrape Dataset', id: 'scrape_dataset' }, + { label: 'Snapshot Status', id: 'snapshot_status' }, + { label: 'Download Snapshot', id: 'download_snapshot' }, + { label: 'Cancel Snapshot', id: 'cancel_snapshot' }, + ], + value: () => 'scrape_url', + }, + { + id: 'zone', + title: 'Zone', + type: 'short-input', + placeholder: 'e.g., web_unlocker1', + condition: { field: 'operation', value: ['scrape_url', 'serp_search'] }, + required: { field: 'operation', value: ['scrape_url', 'serp_search'] }, + }, + { + id: 'url', + title: 'URL', + type: 'short-input', + placeholder: 'https://example.com/page', + condition: { field: 'operation', value: 'scrape_url' }, + required: { field: 'operation', value: 'scrape_url' }, + }, + { + id: 'format', + title: 'Format', + type: 'dropdown', + options: [ + { label: 'Raw HTML', id: 'raw' }, + { label: 'JSON', id: 'json' }, + ], + value: () => 'raw', + condition: { field: 'operation', value: 'scrape_url' }, + }, + { + id: 'country', + title: 
'Country', + type: 'short-input', + placeholder: 'e.g., us, gb', + mode: 'advanced', + condition: { field: 'operation', value: ['scrape_url', 'serp_search', 'discover'] }, + }, + { + id: 'query', + title: 'Search Query', + type: 'short-input', + placeholder: 'e.g., best project management tools', + condition: { field: 'operation', value: 'serp_search' }, + required: { field: 'operation', value: 'serp_search' }, + }, + { + id: 'searchEngine', + title: 'Search Engine', + type: 'dropdown', + options: [ + { label: 'Google', id: 'google' }, + { label: 'Bing', id: 'bing' }, + { label: 'DuckDuckGo', id: 'duckduckgo' }, + { label: 'Yandex', id: 'yandex' }, + ], + value: () => 'google', + condition: { field: 'operation', value: 'serp_search' }, + }, + { + id: 'language', + title: 'Language', + type: 'short-input', + placeholder: 'e.g., en, es', + mode: 'advanced', + condition: { field: 'operation', value: ['serp_search', 'discover'] }, + }, + { + id: 'numResults', + title: 'Number of Results', + type: 'short-input', + placeholder: '10', + mode: 'advanced', + condition: { field: 'operation', value: ['serp_search', 'discover'] }, + }, + { + id: 'discoverQuery', + title: 'Search Query', + type: 'short-input', + placeholder: 'e.g., competitor pricing changes', + condition: { field: 'operation', value: 'discover' }, + required: { field: 'operation', value: 'discover' }, + }, + { + id: 'intent', + title: 'Intent', + type: 'long-input', + placeholder: + 'Describe what you are looking for (e.g., "find official pricing pages and change notes")', + condition: { field: 'operation', value: 'discover' }, + }, + { + id: 'includeContent', + title: 'Include Page Content', + type: 'switch', + mode: 'advanced', + condition: { field: 'operation', value: 'discover' }, + }, + { + id: 'contentFormat', + title: 'Response Format', + type: 'dropdown', + options: [ + { label: 'JSON', id: 'json' }, + { label: 'Markdown', id: 'markdown' }, + ], + value: () => 'json', + mode: 'advanced', + condition: { 
field: 'operation', value: 'discover' }, + }, + { + id: 'syncDatasetId', + title: 'Dataset ID', + type: 'short-input', + placeholder: 'e.g., gd_l1viktl72bvl7bjuj0', + condition: { field: 'operation', value: 'sync_scrape' }, + required: { field: 'operation', value: 'sync_scrape' }, + }, + { + id: 'syncUrls', + title: 'URLs (max 20)', + type: 'long-input', + placeholder: '[{"url": "https://example.com/product"}]', + condition: { field: 'operation', value: 'sync_scrape' }, + required: { field: 'operation', value: 'sync_scrape' }, + }, + { + id: 'syncFormat', + title: 'Output Format', + type: 'dropdown', + options: [ + { label: 'JSON', id: 'json' }, + { label: 'NDJSON', id: 'ndjson' }, + { label: 'CSV', id: 'csv' }, + ], + value: () => 'json', + condition: { field: 'operation', value: 'sync_scrape' }, + }, + { + id: 'datasetId', + title: 'Dataset ID', + type: 'short-input', + placeholder: 'e.g., gd_l1viktl72bvl7bjuj0', + condition: { field: 'operation', value: 'scrape_dataset' }, + required: { field: 'operation', value: 'scrape_dataset' }, + }, + { + id: 'urls', + title: 'URLs', + type: 'long-input', + placeholder: '[{"url": "https://example.com/product"}]', + condition: { field: 'operation', value: 'scrape_dataset' }, + required: { field: 'operation', value: 'scrape_dataset' }, + }, + { + id: 'datasetFormat', + title: 'Output Format', + type: 'dropdown', + options: [ + { label: 'JSON', id: 'json' }, + { label: 'CSV', id: 'csv' }, + ], + value: () => 'json', + condition: { field: 'operation', value: 'scrape_dataset' }, + }, + { + id: 'snapshotId', + title: 'Snapshot ID', + type: 'short-input', + placeholder: 'e.g., s_m4x7enmven8djfqak', + condition: { + field: 'operation', + value: ['snapshot_status', 'download_snapshot', 'cancel_snapshot'], + }, + required: { + field: 'operation', + value: ['snapshot_status', 'download_snapshot', 'cancel_snapshot'], + }, + }, + { + id: 'downloadFormat', + title: 'Download Format', + type: 'dropdown', + options: [ + { label: 'JSON', 
id: 'json' }, + { label: 'NDJSON', id: 'ndjson' }, + { label: 'CSV', id: 'csv' }, + ], + value: () => 'json', + condition: { field: 'operation', value: 'download_snapshot' }, + }, + { + id: 'apiKey', + title: 'API Key', + type: 'short-input', + placeholder: 'Enter your Bright Data API token', + password: true, + required: true, + }, + ], + tools: { + access: [ + 'brightdata_scrape_url', + 'brightdata_serp_search', + 'brightdata_discover', + 'brightdata_sync_scrape', + 'brightdata_scrape_dataset', + 'brightdata_snapshot_status', + 'brightdata_download_snapshot', + 'brightdata_cancel_snapshot', + ], + config: { + tool: (params) => `brightdata_${params.operation}`, + params: (params) => { + const result: Record = { apiKey: params.apiKey } + + switch (params.operation) { + case 'scrape_url': + result.zone = params.zone + result.url = params.url + if (params.format) result.format = params.format + if (params.country) result.country = params.country + break + + case 'serp_search': + result.zone = params.zone + result.query = params.query + if (params.searchEngine) result.searchEngine = params.searchEngine + if (params.country) result.country = params.country + if (params.language) result.language = params.language + if (params.numResults) result.numResults = Number(params.numResults) + break + + case 'discover': + result.query = params.discoverQuery + if (params.numResults) result.numResults = Number(params.numResults) + if (params.intent) result.intent = params.intent + if (params.includeContent != null) result.includeContent = params.includeContent + if (params.contentFormat) result.format = params.contentFormat + if (params.language) result.language = params.language + if (params.country) result.country = params.country + break + + case 'sync_scrape': + result.datasetId = params.syncDatasetId + result.urls = params.syncUrls + if (params.syncFormat) result.format = params.syncFormat + break + + case 'scrape_dataset': + result.datasetId = params.datasetId + result.urls 
= params.urls + if (params.datasetFormat) result.format = params.datasetFormat + break + + case 'snapshot_status': + result.snapshotId = params.snapshotId + break + + case 'download_snapshot': + result.snapshotId = params.snapshotId + if (params.downloadFormat) result.format = params.downloadFormat + break + + case 'cancel_snapshot': + result.snapshotId = params.snapshotId + break + } + + return result + }, + }, + }, + inputs: { + operation: { type: 'string', description: 'Operation to perform' }, + apiKey: { type: 'string', description: 'Bright Data API token' }, + zone: { type: 'string', description: 'Bright Data zone name' }, + url: { type: 'string', description: 'URL to scrape' }, + format: { type: 'string', description: 'Response format' }, + country: { type: 'string', description: 'Country code for geo-targeting' }, + query: { type: 'string', description: 'Search query' }, + searchEngine: { type: 'string', description: 'Search engine to use' }, + language: { type: 'string', description: 'Language code' }, + numResults: { type: 'number', description: 'Number of results' }, + discoverQuery: { type: 'string', description: 'Discover search query' }, + intent: { type: 'string', description: 'Intent for ranking results' }, + includeContent: { type: 'boolean', description: 'Include page content in discover results' }, + contentFormat: { type: 'string', description: 'Content format for discover results' }, + syncDatasetId: { type: 'string', description: 'Dataset scraper ID for sync scrape' }, + syncUrls: { type: 'string', description: 'JSON array of URL objects for sync scrape' }, + syncFormat: { type: 'string', description: 'Output format for sync scrape' }, + datasetId: { type: 'string', description: 'Dataset scraper ID' }, + urls: { type: 'string', description: 'JSON array of URL objects to scrape' }, + datasetFormat: { type: 'string', description: 'Dataset output format' }, + snapshotId: { type: 'string', description: 'Snapshot ID for status/download/cancel' }, + 
downloadFormat: { type: 'string', description: 'Download output format' }, + }, + outputs: { + content: { type: 'string', description: 'Scraped page content' }, + url: { type: 'string', description: 'URL that was scraped' }, + statusCode: { type: 'number', description: 'HTTP status code' }, + results: { type: 'json', description: 'Search or discover results array' }, + query: { type: 'string', description: 'Search query executed' }, + searchEngine: { type: 'string', description: 'Search engine used' }, + totalResults: { type: 'number', description: 'Total number of discover results' }, + data: { type: 'json', description: 'Scraped data records' }, + snapshotId: { type: 'string', description: 'Snapshot ID' }, + isAsync: { type: 'boolean', description: 'Whether sync scrape fell back to async' }, + status: { type: 'string', description: 'Job status' }, + datasetId: { type: 'string', description: 'Dataset ID of the snapshot' }, + format: { type: 'string', description: 'Content type of downloaded data' }, + cancelled: { type: 'boolean', description: 'Whether cancellation was successful' }, + }, +} diff --git a/apps/sim/blocks/registry.ts b/apps/sim/blocks/registry.ts index 4ab0d88a16b..270bea945c7 100644 --- a/apps/sim/blocks/registry.ts +++ b/apps/sim/blocks/registry.ts @@ -18,6 +18,7 @@ import { AthenaBlock } from '@/blocks/blocks/athena' import { AttioBlock } from '@/blocks/blocks/attio' import { BoxBlock } from '@/blocks/blocks/box' import { BrandfetchBlock } from '@/blocks/blocks/brandfetch' +import { BrightDataBlock } from '@/blocks/blocks/brightdata' import { BrowserUseBlock } from '@/blocks/blocks/browser_use' import { CalComBlock } from '@/blocks/blocks/calcom' import { CalendlyBlock } from '@/blocks/blocks/calendly' @@ -245,6 +246,7 @@ export const registry: Record = { athena: AthenaBlock, attio: AttioBlock, brandfetch: BrandfetchBlock, + brightdata: BrightDataBlock, box: BoxBlock, browser_use: BrowserUseBlock, calcom: CalComBlock, diff --git 
a/apps/sim/components/icons.tsx b/apps/sim/components/icons.tsx index bde3013c6a2..d7ae05105db 100644 --- a/apps/sim/components/icons.tsx +++ b/apps/sim/components/icons.tsx @@ -2087,6 +2087,21 @@ export function BrandfetchIcon(props: SVGProps) { ) } +export function BrightDataIcon(props: SVGProps) { + return ( + + + + + ) +} + export function BrowserUseIcon(props: SVGProps) { return ( = { + id: 'brightdata_cancel_snapshot', + name: 'Bright Data Cancel Snapshot', + description: + 'Cancel an active Bright Data scraping job using its snapshot ID. Terminates data collection in progress.', + version: '1.0.0', + + params: { + apiKey: { + type: 'string', + required: true, + visibility: 'user-only', + description: 'Bright Data API token', + }, + snapshotId: { + type: 'string', + required: true, + visibility: 'user-or-llm', + description: 'The snapshot ID of the collection to cancel (e.g., "s_m4x7enmven8djfqak")', + }, + }, + + request: { + method: 'POST', + url: (params) => + `https://api.brightdata.com/datasets/v3/snapshot/${params.snapshotId?.trim()}/cancel`, + headers: (params) => ({ + Authorization: `Bearer ${params.apiKey}`, + }), + }, + + transformResponse: async (response: Response) => { + if (!response.ok) { + const errorText = await response.text() + throw new Error(errorText || `Cancel snapshot failed with status ${response.status}`) + } + + const data = (await response.json().catch(() => null)) as Record | null + return { + success: true, + output: { + snapshotId: (data?.snapshot_id as string) ?? 
null, + cancelled: true, + }, + } + }, + + outputs: { + snapshotId: { + type: 'string', + description: 'The snapshot ID that was cancelled', + optional: true, + }, + cancelled: { + type: 'boolean', + description: 'Whether the cancellation was successful', + }, + }, +} diff --git a/apps/sim/tools/brightdata/discover.ts b/apps/sim/tools/brightdata/discover.ts new file mode 100644 index 00000000000..8f0f8ced6b8 --- /dev/null +++ b/apps/sim/tools/brightdata/discover.ts @@ -0,0 +1,158 @@ +import type { BrightDataDiscoverParams, BrightDataDiscoverResponse } from '@/tools/brightdata/types' +import type { ToolConfig } from '@/tools/types' + +export const brightDataDiscoverTool: ToolConfig< + BrightDataDiscoverParams, + BrightDataDiscoverResponse +> = { + id: 'brightdata_discover', + name: 'Bright Data Discover', + description: + 'AI-powered web discovery that finds and ranks results by intent. Returns up to 1,000 results with optional cleaned page content for RAG and verification.', + version: '1.0.0', + + params: { + apiKey: { + type: 'string', + required: true, + visibility: 'user-only', + description: 'Bright Data API token', + }, + query: { + type: 'string', + required: true, + visibility: 'user-or-llm', + description: 'The search query (e.g., "competitor pricing changes enterprise plan")', + }, + numResults: { + type: 'number', + required: false, + visibility: 'user-or-llm', + description: 'Number of results to return, up to 1000. 
Defaults to 10', + }, + intent: { + type: 'string', + required: false, + visibility: 'user-or-llm', + description: + 'Describes what the agent is trying to accomplish, used to rank results by relevance (e.g., "find official pricing pages and change notes")', + }, + includeContent: { + type: 'boolean', + required: false, + visibility: 'user-or-llm', + description: 'Whether to include cleaned page content in results', + }, + format: { + type: 'string', + required: false, + visibility: 'user-or-llm', + description: 'Response format: "json" or "markdown". Defaults to "json"', + }, + language: { + type: 'string', + required: false, + visibility: 'user-or-llm', + description: 'Search language code (e.g., "en", "es", "fr"). Defaults to "en"', + }, + country: { + type: 'string', + required: false, + visibility: 'user-or-llm', + description: 'Two-letter ISO country code for localized results (e.g., "us", "gb")', + }, + }, + + request: { + method: 'POST', + url: 'https://api.brightdata.com/discover', + headers: (params) => ({ + 'Content-Type': 'application/json', + Authorization: `Bearer ${params.apiKey}`, + }), + body: (params) => { + const body: Record = { + query: params.query, + } + if (params.numResults) body.num_results = params.numResults + if (params.intent) body.intent = params.intent + if (params.includeContent != null) body.include_content = params.includeContent + if (params.format) body.format = params.format + if (params.language) body.language = params.language + if (params.country) body.country = params.country + return body + }, + }, + + transformResponse: async (response: Response) => { + if (!response.ok) { + const errorText = await response.text() + throw new Error(errorText || `Discover request failed with status ${response.status}`) + } + + const data = await response.json() + + let results: Array<{ + url: string | null + title: string | null + description: string | null + relevanceScore: number | null + content: string | null + }> = [] + + const items 
= Array.isArray(data) ? data : (data?.results ?? data?.data ?? []) + + if (Array.isArray(items)) { + results = items.map((item: Record) => ({ + url: (item.link as string) ?? (item.url as string) ?? null, + title: (item.title as string) ?? null, + description: (item.description as string) ?? (item.snippet as string) ?? null, + relevanceScore: (item.relevance_score as number) ?? null, + content: + (item.content as string) ?? (item.text as string) ?? (item.markdown as string) ?? null, + })) + } + + return { + success: true, + output: { + results, + query: null, + totalResults: results.length, + }, + } + }, + + outputs: { + results: { + type: 'array', + description: 'Array of discovered web results ranked by intent relevance', + items: { + type: 'object', + description: 'A discovered result', + properties: { + url: { type: 'string', description: 'URL of the discovered page', optional: true }, + title: { type: 'string', description: 'Page title', optional: true }, + description: { + type: 'string', + description: 'Page description or snippet', + optional: true, + }, + relevanceScore: { + type: 'number', + description: 'AI-calculated relevance score for intent-based ranking', + optional: true, + }, + content: { + type: 'string', + description: + 'Cleaned page content in the requested format (when includeContent is true)', + optional: true, + }, + }, + }, + }, + query: { type: 'string', description: 'The search query that was executed', optional: true }, + totalResults: { type: 'number', description: 'Total number of results returned' }, + }, +} diff --git a/apps/sim/tools/brightdata/download_snapshot.ts b/apps/sim/tools/brightdata/download_snapshot.ts new file mode 100644 index 00000000000..c62cfc4c684 --- /dev/null +++ b/apps/sim/tools/brightdata/download_snapshot.ts @@ -0,0 +1,116 @@ +import type { + BrightDataDownloadSnapshotParams, + BrightDataDownloadSnapshotResponse, +} from '@/tools/brightdata/types' +import type { ToolConfig } from '@/tools/types' + +export const 
brightDataDownloadSnapshotTool: ToolConfig< + BrightDataDownloadSnapshotParams, + BrightDataDownloadSnapshotResponse +> = { + id: 'brightdata_download_snapshot', + name: 'Bright Data Download Snapshot', + description: + 'Download the results of a completed Bright Data scraping job using its snapshot ID. The snapshot must have ready status.', + version: '1.0.0', + + params: { + apiKey: { + type: 'string', + required: true, + visibility: 'user-only', + description: 'Bright Data API token', + }, + snapshotId: { + type: 'string', + required: true, + visibility: 'user-or-llm', + description: + 'The snapshot ID returned when the collection was triggered (e.g., "s_m4x7enmven8djfqak")', + }, + format: { + type: 'string', + required: false, + visibility: 'user-or-llm', + description: 'Output format: "json", "ndjson", "jsonl", or "csv". Defaults to "json"', + }, + compress: { + type: 'boolean', + required: false, + visibility: 'user-or-llm', + description: 'Whether to compress the results', + }, + }, + + request: { + method: 'GET', + url: (params) => { + const queryParams = new URLSearchParams() + if (params.format) queryParams.set('format', params.format) + if (params.compress) queryParams.set('compress', 'true') + const qs = queryParams.toString() + return `https://api.brightdata.com/datasets/v3/snapshot/${params.snapshotId?.trim()}${qs ? `?${qs}` : ''}` + }, + headers: (params) => ({ + Authorization: `Bearer ${params.apiKey}`, + }), + }, + + transformResponse: async (response: Response) => { + if (response.status === 409) { + throw new Error( + 'Snapshot is not ready for download. Check the snapshot status first and wait until it is "ready".' 
+ ) + } + + if (!response.ok) { + const errorText = await response.text() + throw new Error(errorText || `Snapshot download failed with status ${response.status}`) + } + + const contentType = response.headers.get('content-type') || '' + let data: Array> + + if (contentType.includes('application/json')) { + const parsed = await response.json() + data = Array.isArray(parsed) ? parsed : [parsed] + } else { + const text = await response.text() + try { + const parsed = JSON.parse(text) + data = Array.isArray(parsed) ? parsed : [parsed] + } catch { + data = [{ raw: text }] + } + } + + return { + success: true, + output: { + data, + format: contentType, + snapshotId: (data[0]?.snapshot_id as string) ?? null, + }, + } + }, + + outputs: { + data: { + type: 'array', + description: 'Array of scraped result records', + items: { + type: 'json', + description: 'A scraped record with dataset-specific fields', + }, + }, + format: { + type: 'string', + description: 'The content type of the downloaded data', + }, + snapshotId: { + type: 'string', + description: 'The snapshot ID that was downloaded', + optional: true, + }, + }, +} diff --git a/apps/sim/tools/brightdata/index.ts b/apps/sim/tools/brightdata/index.ts new file mode 100644 index 00000000000..9e4a7fc713b --- /dev/null +++ b/apps/sim/tools/brightdata/index.ts @@ -0,0 +1,19 @@ +import { brightDataCancelSnapshotTool } from '@/tools/brightdata/cancel_snapshot' +import { brightDataDiscoverTool } from '@/tools/brightdata/discover' +import { brightDataDownloadSnapshotTool } from '@/tools/brightdata/download_snapshot' +import { brightDataScrapeDatasetTool } from '@/tools/brightdata/scrape_dataset' +import { brightDataScrapeUrlTool } from '@/tools/brightdata/scrape_url' +import { brightDataSerpSearchTool } from '@/tools/brightdata/serp_search' +import { brightDataSnapshotStatusTool } from '@/tools/brightdata/snapshot_status' +import { brightDataSyncScrapeTool } from '@/tools/brightdata/sync_scrape' + +export { + 
brightDataCancelSnapshotTool,
+  brightDataDiscoverTool,
+  brightDataDownloadSnapshotTool,
+  brightDataScrapeDatasetTool,
+  brightDataScrapeUrlTool,
+  brightDataSerpSearchTool,
+  brightDataSnapshotStatusTool,
+  brightDataSyncScrapeTool,
+}
diff --git a/apps/sim/tools/brightdata/scrape_dataset.ts b/apps/sim/tools/brightdata/scrape_dataset.ts
new file mode 100644
index 00000000000..f53891d99f4
--- /dev/null
+++ b/apps/sim/tools/brightdata/scrape_dataset.ts
@@ -0,0 +1,97 @@
+import type {
+  BrightDataScrapeDatasetParams,
+  BrightDataScrapeDatasetResponse,
+} from '@/tools/brightdata/types'
+import type { ToolConfig } from '@/tools/types'
+
+// Triggers an async collection run for a pre-built dataset scraper; results are
+// fetched later via brightdata_snapshot_status / brightdata_download_snapshot.
+export const brightDataScrapeDatasetTool: ToolConfig<
+  BrightDataScrapeDatasetParams,
+  BrightDataScrapeDatasetResponse
+> = {
+  id: 'brightdata_scrape_dataset',
+  name: 'Bright Data Scrape Dataset',
+  description:
+    'Trigger a Bright Data pre-built scraper to extract structured data from URLs. Supports 660+ scrapers for platforms like Amazon, LinkedIn, Instagram, and more.',
+  version: '1.0.0',
+
+  params: {
+    apiKey: {
+      type: 'string',
+      required: true,
+      visibility: 'user-only',
+      description: 'Bright Data API token',
+    },
+    datasetId: {
+      type: 'string',
+      required: true,
+      visibility: 'user-or-llm',
+      description:
+        'Dataset scraper ID from your Bright Data dashboard (e.g., "gd_l1viktl72bvl7bjuj0")',
+    },
+    urls: {
+      type: 'string',
+      required: true,
+      visibility: 'user-or-llm',
+      description:
+        'JSON array of URL objects to scrape (e.g., [{"url": "https://example.com/product"}])',
+    },
+    format: {
+      type: 'string',
+      required: false,
+      visibility: 'user-or-llm',
+      description: 'Output format: "json" or "csv". Defaults to "json"',
+    },
+  },
+
+  request: {
+    method: 'POST',
+    // POST /datasets/v3/trigger?dataset_id=...&format=...
+    url: (params) => {
+      const queryParams = new URLSearchParams()
+      queryParams.set('dataset_id', params.datasetId)
+      queryParams.set('format', params.format || 'json')
+      return `https://api.brightdata.com/datasets/v3/trigger?${queryParams.toString()}`
+    },
+    headers: (params) => ({
+      'Content-Type': 'application/json',
+      Authorization: `Bearer ${params.apiKey}`,
+    }),
+    // urls arrives as a JSON string; a bare non-JSON string is treated as a single URL.
+    body: (params) => {
+      if (typeof params.urls === 'string') {
+        try {
+          return JSON.parse(params.urls)
+        } catch {
+          return [{ url: params.urls }]
+        }
+      }
+      return params.urls
+    },
+  },
+
+  transformResponse: async (response: Response) => {
+    if (!response.ok) {
+      const errorText = await response.text()
+      throw new Error(errorText || `Dataset trigger failed with status ${response.status}`)
+    }
+
+    const data = await response.json()
+
+    return {
+      success: true,
+      output: {
+        // API uses snake_case; camelCase variant kept as a defensive fallback.
+        snapshotId: data.snapshot_id ?? data.snapshotId ?? '',
+        status: data.status ?? 'triggered',
+      },
+    }
+  },
+
+  outputs: {
+    snapshotId: {
+      type: 'string',
+      description: 'The snapshot ID to retrieve results later',
+    },
+    status: {
+      type: 'string',
+      description: 'Status of the scraping job (e.g., "triggered", "running")',
+    },
+  },
+}
diff --git a/apps/sim/tools/brightdata/scrape_url.ts b/apps/sim/tools/brightdata/scrape_url.ts
new file mode 100644
index 00000000000..1fe284cd31b
--- /dev/null
+++ b/apps/sim/tools/brightdata/scrape_url.ts
@@ -0,0 +1,103 @@
+import type {
+  BrightDataScrapeUrlParams,
+  BrightDataScrapeUrlResponse,
+} from '@/tools/brightdata/types'
+import type { ToolConfig } from '@/tools/types'
+
+export const brightDataScrapeUrlTool: ToolConfig<
+  BrightDataScrapeUrlParams,
+  BrightDataScrapeUrlResponse
+> = {
+  id: 'brightdata_scrape_url',
+  name: 'Bright Data Scrape URL',
+  description:
+    'Fetch content from any URL using Bright Data Web Unlocker. 
Bypasses anti-bot protections, CAPTCHAs, and IP blocks automatically.', + version: '1.0.0', + + params: { + apiKey: { + type: 'string', + required: true, + visibility: 'user-only', + description: 'Bright Data API token', + }, + zone: { + type: 'string', + required: true, + visibility: 'user-or-llm', + description: 'Web Unlocker zone name from your Bright Data dashboard (e.g., "web_unlocker1")', + }, + url: { + type: 'string', + required: true, + visibility: 'user-or-llm', + description: 'The URL to scrape (e.g., "https://example.com/page")', + }, + format: { + type: 'string', + required: false, + visibility: 'user-or-llm', + description: + 'Response format: "raw" for HTML or "json" for parsed content. Defaults to "raw"', + }, + country: { + type: 'string', + required: false, + visibility: 'user-or-llm', + description: 'Two-letter country code for geo-targeting (e.g., "us", "gb")', + }, + }, + + request: { + method: 'POST', + url: 'https://api.brightdata.com/request', + headers: (params) => ({ + 'Content-Type': 'application/json', + Authorization: `Bearer ${params.apiKey}`, + }), + body: (params) => { + const body: Record = { + zone: params.zone, + url: params.url, + format: params.format || 'raw', + } + if (params.country) body.country = params.country + return body + }, + }, + + transformResponse: async (response: Response) => { + const contentType = response.headers.get('content-type') || '' + + if (!response.ok) { + const errorText = await response.text() + throw new Error(errorText || `Request failed with status ${response.status}`) + } + + let content: string + if (contentType.includes('application/json')) { + const data = await response.json() + content = typeof data === 'string' ? 
data : JSON.stringify(data) + } else { + content = await response.text() + } + + return { + success: true, + output: { + content, + url: null, + statusCode: response.status, + }, + } + }, + + outputs: { + content: { + type: 'string', + description: 'The scraped page content (HTML or JSON depending on format)', + }, + url: { type: 'string', description: 'The URL that was scraped', optional: true }, + statusCode: { type: 'number', description: 'HTTP status code of the response', optional: true }, + }, +} diff --git a/apps/sim/tools/brightdata/serp_search.ts b/apps/sim/tools/brightdata/serp_search.ts new file mode 100644 index 00000000000..e9ed8ef1de3 --- /dev/null +++ b/apps/sim/tools/brightdata/serp_search.ts @@ -0,0 +1,214 @@ +import type { + BrightDataSerpSearchParams, + BrightDataSerpSearchResponse, +} from '@/tools/brightdata/types' +import type { ToolConfig } from '@/tools/types' + +const SEARCH_ENGINE_CONFIG: Record< + string, + { url: string; queryKey: string; numKey: string; langKey: string; countryKey: string } +> = { + google: { + url: 'https://www.google.com/search', + queryKey: 'q', + numKey: 'num', + langKey: 'hl', + countryKey: 'gl', + }, + bing: { + url: 'https://www.bing.com/search', + queryKey: 'q', + numKey: 'count', + langKey: 'setLang', + countryKey: 'cc', + }, + duckduckgo: { + url: 'https://duckduckgo.com/', + queryKey: 'q', + numKey: '', + langKey: '', + countryKey: '', + }, + yandex: { + url: 'https://yandex.com/search/', + queryKey: 'text', + numKey: 'numdoc', + langKey: 'lang', + countryKey: '', + }, +} as const + +export const brightDataSerpSearchTool: ToolConfig< + BrightDataSerpSearchParams, + BrightDataSerpSearchResponse +> = { + id: 'brightdata_serp_search', + name: 'Bright Data SERP Search', + description: + 'Search Google, Bing, DuckDuckGo, or Yandex and get structured search results using Bright Data SERP API.', + version: '1.0.0', + + params: { + apiKey: { + type: 'string', + required: true, + visibility: 'user-only', + 
description: 'Bright Data API token', + }, + zone: { + type: 'string', + required: true, + visibility: 'user-or-llm', + description: 'SERP API zone name from your Bright Data dashboard (e.g., "serp_api1")', + }, + query: { + type: 'string', + required: true, + visibility: 'user-or-llm', + description: 'The search query (e.g., "best project management tools")', + }, + searchEngine: { + type: 'string', + required: false, + visibility: 'user-or-llm', + description: + 'Search engine to use: "google", "bing", "duckduckgo", or "yandex". Defaults to "google"', + }, + country: { + type: 'string', + required: false, + visibility: 'user-or-llm', + description: 'Two-letter country code for localized results (e.g., "us", "gb")', + }, + language: { + type: 'string', + required: false, + visibility: 'user-or-llm', + description: 'Two-letter language code (e.g., "en", "es")', + }, + numResults: { + type: 'number', + required: false, + visibility: 'user-or-llm', + description: 'Number of results to return (e.g., 10, 20). 
Defaults to 10', + }, + }, + + request: { + method: 'POST', + url: 'https://api.brightdata.com/request', + headers: (params) => ({ + 'Content-Type': 'application/json', + Authorization: `Bearer ${params.apiKey}`, + }), + body: (params) => { + const engine = params.searchEngine || 'google' + const config = SEARCH_ENGINE_CONFIG[engine] || SEARCH_ENGINE_CONFIG.google + + const searchParams = new URLSearchParams() + searchParams.set(config.queryKey, params.query) + if (params.numResults && config.numKey) { + searchParams.set(config.numKey, String(params.numResults)) + } + if (params.language && config.langKey) { + searchParams.set(config.langKey, params.language) + } + if (params.country && config.countryKey) { + searchParams.set(config.countryKey, params.country) + } + + searchParams.set('brd_json', '1') + + const body: Record = { + zone: params.zone, + url: `${config.url}?${searchParams.toString()}`, + format: 'raw', + } + if (params.country) body.country = params.country + return body + }, + }, + + transformResponse: async (response: Response) => { + if (!response.ok) { + const errorText = await response.text() + throw new Error(errorText || `SERP request failed with status ${response.status}`) + } + + const contentType = response.headers.get('content-type') || '' + let results: Array<{ + title: string | null + url: string | null + description: string | null + rank: number | null + }> = [] + let data: Record | null = null + + if (contentType.includes('application/json')) { + data = await response.json() + + if (Array.isArray(data?.organic)) { + results = data.organic.map((item: Record, index: number) => ({ + title: (item.title as string) ?? null, + url: (item.link as string) ?? (item.url as string) ?? null, + description: (item.description as string) ?? (item.snippet as string) ?? null, + rank: index + 1, + })) + } else if (Array.isArray(data)) { + results = data.map((item: Record, index: number) => ({ + title: (item.title as string) ?? 
null, + url: (item.link as string) ?? (item.url as string) ?? null, + description: (item.description as string) ?? (item.snippet as string) ?? null, + rank: index + 1, + })) + } + } else { + const text = await response.text() + results = [ + { + title: 'Raw SERP Response', + url: null, + description: text.slice(0, 500), + rank: 1, + }, + ] + } + + return { + success: true, + output: { + results, + query: ((data?.general as Record | undefined)?.query as string) ?? null, + searchEngine: + ((data?.general as Record | undefined)?.search_engine as string) ?? null, + }, + } + }, + + outputs: { + results: { + type: 'array', + description: 'Array of search results', + items: { + type: 'object', + description: 'A search result entry', + properties: { + title: { type: 'string', description: 'Title of the search result', optional: true }, + url: { type: 'string', description: 'URL of the search result', optional: true }, + description: { + type: 'string', + description: 'Snippet or description of the result', + optional: true, + }, + rank: { type: 'number', description: 'Position in search results', optional: true }, + }, + }, + }, + query: { type: 'string', description: 'The search query that was executed', optional: true }, + searchEngine: { + type: 'string', + description: 'The search engine that was used', + optional: true, + }, + }, +} diff --git a/apps/sim/tools/brightdata/snapshot_status.ts b/apps/sim/tools/brightdata/snapshot_status.ts new file mode 100644 index 00000000000..d6fe69bd3d8 --- /dev/null +++ b/apps/sim/tools/brightdata/snapshot_status.ts @@ -0,0 +1,74 @@ +import type { + BrightDataSnapshotStatusParams, + BrightDataSnapshotStatusResponse, +} from '@/tools/brightdata/types' +import type { ToolConfig } from '@/tools/types' + +export const brightDataSnapshotStatusTool: ToolConfig< + BrightDataSnapshotStatusParams, + BrightDataSnapshotStatusResponse +> = { + id: 'brightdata_snapshot_status', + name: 'Bright Data Snapshot Status', + description: + 'Check the 
progress of an async Bright Data scraping job. Returns status: starting, running, ready, or failed.',
+  version: '1.0.0',
+
+  params: {
+    apiKey: {
+      type: 'string',
+      required: true,
+      visibility: 'user-only',
+      description: 'Bright Data API token',
+    },
+    snapshotId: {
+      type: 'string',
+      required: true,
+      visibility: 'user-or-llm',
+      description:
+        'The snapshot ID returned when the collection was triggered (e.g., "s_m4x7enmven8djfqak")',
+    },
+  },
+
+  request: {
+    method: 'GET',
+    // GET /datasets/v3/progress/{snapshotId}
+    url: (params) => `https://api.brightdata.com/datasets/v3/progress/${params.snapshotId?.trim()}`,
+    headers: (params) => ({
+      Authorization: `Bearer ${params.apiKey}`,
+    }),
+  },
+
+  transformResponse: async (response: Response) => {
+    if (!response.ok) {
+      const errorText = await response.text()
+      throw new Error(errorText || `Snapshot status check failed with status ${response.status}`)
+    }
+
+    const data = await response.json()
+
+    return {
+      success: true,
+      output: {
+        snapshotId: data.snapshot_id ?? null,
+        datasetId: data.dataset_id ?? null,
+        status: data.status ?? 'unknown',
+      },
+    }
+  },
+
+  outputs: {
+    snapshotId: {
+      type: 'string',
+      description: 'The snapshot ID that was queried',
+    },
+    datasetId: {
+      type: 'string',
+      description: 'The dataset ID associated with this snapshot',
+      optional: true,
+    },
+    status: {
+      type: 'string',
+      description: 'Current status of the snapshot: "starting", "running", "ready", or "failed"',
+    },
+  },
+}
diff --git a/apps/sim/tools/brightdata/sync_scrape.ts b/apps/sim/tools/brightdata/sync_scrape.ts
new file mode 100644
index 00000000000..8ac8d0108cc
--- /dev/null
+++ b/apps/sim/tools/brightdata/sync_scrape.ts
@@ -0,0 +1,131 @@
+import type {
+  BrightDataSyncScrapeParams,
+  BrightDataSyncScrapeResponse,
+} from '@/tools/brightdata/types'
+import type { ToolConfig } from '@/tools/types'
+
+export const brightDataSyncScrapeTool: ToolConfig<
+  BrightDataSyncScrapeParams,
+  BrightDataSyncScrapeResponse
+> = {
+  id: 'brightdata_sync_scrape',
+  name: 'Bright Data Sync Scrape',
+  description:
+    'Scrape URLs synchronously using a Bright Data pre-built scraper and get structured results directly. Supports up to 20 URLs with a 1-minute timeout.',
+  version: '1.0.0',
+
+  params: {
+    apiKey: {
+      type: 'string',
+      required: true,
+      visibility: 'user-only',
+      description: 'Bright Data API token',
+    },
+    datasetId: {
+      type: 'string',
+      required: true,
+      visibility: 'user-or-llm',
+      description:
+        'Dataset scraper ID from your Bright Data dashboard (e.g., "gd_l1viktl72bvl7bjuj0")',
+    },
+    urls: {
+      type: 'string',
+      required: true,
+      visibility: 'user-or-llm',
+      description:
+        'JSON array of URL objects to scrape, up to 20 (e.g., [{"url": "https://example.com/product"}])',
+    },
+    format: {
+      type: 'string',
+      required: false,
+      visibility: 'user-or-llm',
+      description: 'Output format: "json", "ndjson", or "csv". 
Defaults to "json"',
+    },
+    includeErrors: {
+      type: 'boolean',
+      required: false,
+      visibility: 'user-or-llm',
+      description: 'Whether to include error reports in results',
+    },
+  },
+
+  request: {
+    method: 'POST',
+    // POST /datasets/v3/scrape — synchronous variant of the trigger endpoint.
+    url: (params) => {
+      const queryParams = new URLSearchParams()
+      queryParams.set('dataset_id', params.datasetId)
+      queryParams.set('format', params.format || 'json')
+      if (params.includeErrors) queryParams.set('include_errors', 'true')
+      return `https://api.brightdata.com/datasets/v3/scrape?${queryParams.toString()}`
+    },
+    headers: (params) => ({
+      'Content-Type': 'application/json',
+      Authorization: `Bearer ${params.apiKey}`,
+    }),
+    // urls arrives as a JSON string; a bare string falls back to a single URL entry.
+    body: (params) => {
+      if (typeof params.urls === 'string') {
+        try {
+          const parsed = JSON.parse(params.urls)
+          return { input: Array.isArray(parsed) ? parsed : [parsed] }
+        } catch {
+          return { input: [{ url: params.urls }] }
+        }
+      }
+      return { input: params.urls }
+    },
+  },
+
+  transformResponse: async (response: Response) => {
+    if (!response.ok) {
+      const errorText = await response.text()
+      throw new Error(errorText || `Sync scrape failed with status ${response.status}`)
+    }
+
+    // 202 is a 2xx status, so it survives the !ok guard above: the request exceeded
+    // the sync timeout and fell back to async — only a snapshot ID comes back.
+    if (response.status === 202) {
+      const data = await response.json()
+      return {
+        success: true,
+        output: {
+          data: [],
+          snapshotId: data.snapshot_id ?? null,
+          isAsync: true,
+        },
+      }
+    }
+
+    const data = await response.json()
+    const results = Array.isArray(data) ? data : [data]
+
+    return {
+      success: true,
+      output: {
+        data: results,
+        snapshotId: null,
+        isAsync: false,
+      },
+    }
+  },
+
+  outputs: {
+    data: {
+      type: 'array',
+      description:
+        'Array of scraped result objects with fields specific to the dataset scraper used',
+      items: {
+        type: 'json',
+        description: 'A scraped record with dataset-specific fields',
+      },
+    },
+    snapshotId: {
+      type: 'string',
+      description:
+        'Snapshot ID returned if the request exceeded the 1-minute timeout and switched to async processing',
+      optional: true,
+    },
+    isAsync: {
+      type: 'boolean',
+      description:
+        'Whether the request fell back to async mode (true means use snapshot ID to retrieve results)',
+    },
+  },
+}
diff --git a/apps/sim/tools/brightdata/types.ts b/apps/sim/tools/brightdata/types.ts
new file mode 100644
index 00000000000..31978269966
--- /dev/null
+++ b/apps/sim/tools/brightdata/types.ts
@@ -0,0 +1,145 @@
+import type { ToolResponse } from '@/tools/types'
+
+// Param/response contracts for the Bright Data tool set.
+export interface BrightDataScrapeUrlParams {
+  apiKey: string
+  zone: string
+  url: string
+  format?: string
+  country?: string
+}
+
+export interface BrightDataScrapeUrlResponse extends ToolResponse {
+  output: {
+    content: string
+    url: string | null
+    statusCode: number | null
+  }
+}
+
+export interface BrightDataSerpSearchParams {
+  apiKey: string
+  zone: string
+  query: string
+  searchEngine?: string
+  country?: string
+  language?: string
+  numResults?: number
+}
+
+export interface BrightDataSerpSearchResponse extends ToolResponse {
+  output: {
+    results: Array<{
+      title: string | null
+      url: string | null
+      description: string | null
+      rank: number | null
+    }>
+    query: string | null
+    searchEngine: string | null
+  }
+}
+
+export interface BrightDataScrapeDatasetParams {
+  apiKey: string
+  datasetId: string
+  urls: string
+  format?: string
+}
+
+export interface BrightDataScrapeDatasetResponse extends ToolResponse {
+  output: {
+    snapshotId: string
+    status: string
+  }
+}
+
+export interface 
BrightDataSyncScrapeParams {
+  apiKey: string
+  datasetId: string
+  urls: string
+  format?: string
+  includeErrors?: boolean
+}
+
+export interface BrightDataSyncScrapeResponse extends ToolResponse {
+  output: {
+    // FIX: stripped type parameters ("Array>") restored to match the tool's
+    // transformResponse, which returns an array of arbitrary JSON records.
+    data: Array<Record<string, unknown>>
+    snapshotId: string | null
+    isAsync: boolean
+  }
+}
+
+export interface BrightDataSnapshotStatusParams {
+  apiKey: string
+  snapshotId: string
+}
+
+export interface BrightDataSnapshotStatusResponse extends ToolResponse {
+  output: {
+    snapshotId: string | null
+    datasetId: string | null
+    status: string
+  }
+}
+
+export interface BrightDataDownloadSnapshotParams {
+  apiKey: string
+  snapshotId: string
+  format?: string
+  compress?: boolean
+}
+
+export interface BrightDataDownloadSnapshotResponse extends ToolResponse {
+  output: {
+    // FIX: stripped type parameters ("Array>") restored; see download tool output.
+    data: Array<Record<string, unknown>>
+    format: string
+    snapshotId: string | null
+  }
+}
+
+export interface BrightDataCancelSnapshotParams {
+  apiKey: string
+  snapshotId: string
+}
+
+export interface BrightDataCancelSnapshotResponse extends ToolResponse {
+  output: {
+    snapshotId: string | null
+    cancelled: boolean
+  }
+}
+
+export interface BrightDataDiscoverParams {
+  apiKey: string
+  query: string
+  numResults?: number
+  intent?: string
+  includeContent?: boolean
+  format?: string
+  language?: string
+  country?: string
+}
+
+export interface BrightDataDiscoverResponse extends ToolResponse {
+  output: {
+    results: Array<{
+      url: string | null
+      title: string | null
+      description: string | null
+      relevanceScore: number | null
+      content: string | null
+    }>
+    query: string | null
+    totalResults: number
+  }
+}
+
+// Union of every Bright Data tool response shape.
+export type BrightDataResponse =
+  | BrightDataScrapeUrlResponse
+  | BrightDataSerpSearchResponse
+  | BrightDataScrapeDatasetResponse
+  | BrightDataSyncScrapeResponse
+  | BrightDataSnapshotStatusResponse
+  | BrightDataDownloadSnapshotResponse
+  | BrightDataCancelSnapshotResponse
+  | BrightDataDiscoverResponse
diff --git a/apps/sim/tools/registry.ts b/apps/sim/tools/registry.ts
index c2177e15055..44144459a42
---
a/apps/sim/tools/registry.ts +++ b/apps/sim/tools/registry.ts @@ -236,6 +236,16 @@ import { boxSignResendRequestTool, } from '@/tools/box_sign' import { brandfetchGetBrandTool, brandfetchSearchTool } from '@/tools/brandfetch' +import { + brightDataCancelSnapshotTool, + brightDataDiscoverTool, + brightDataDownloadSnapshotTool, + brightDataScrapeDatasetTool, + brightDataScrapeUrlTool, + brightDataSerpSearchTool, + brightDataSnapshotStatusTool, + brightDataSyncScrapeTool, +} from '@/tools/brightdata' import { browserUseRunTaskTool } from '@/tools/browser_use' import { calcomCancelBookingTool, @@ -2921,6 +2931,14 @@ export const tools: Record = { athena_stop_query: athenaStopQueryTool, brandfetch_get_brand: brandfetchGetBrandTool, brandfetch_search: brandfetchSearchTool, + brightdata_cancel_snapshot: brightDataCancelSnapshotTool, + brightdata_discover: brightDataDiscoverTool, + brightdata_download_snapshot: brightDataDownloadSnapshotTool, + brightdata_scrape_dataset: brightDataScrapeDatasetTool, + brightdata_scrape_url: brightDataScrapeUrlTool, + brightdata_serp_search: brightDataSerpSearchTool, + brightdata_snapshot_status: brightDataSnapshotStatusTool, + brightdata_sync_scrape: brightDataSyncScrapeTool, box_copy_file: boxCopyFileTool, box_create_folder: boxCreateFolderTool, box_delete_file: boxDeleteFileTool,