diff --git a/scrapegraph-js/src/crawl.js b/scrapegraph-js/src/crawl.js
index 631f3f7..45e5a2a 100644
--- a/scrapegraph-js/src/crawl.js
+++ b/scrapegraph-js/src/crawl.js
@@ -16,6 +16,7 @@ import { getMockResponse } from './utils/mockResponse.js';
  * @param {boolean} [options.extractionMode=true] - true for AI extraction, false for markdown conversion (NO AI/LLM)
  * @param {boolean} [options.cacheWebsite=true] - Whether to cache the website content
  * @param {number} [options.depth=2] - Maximum depth of the crawl (1-10)
+ * @param {number|null} [options.breadth] - Maximum number of links to crawl per depth level. If null/undefined, unlimited (default). Controls the 'width' of exploration at each depth. Useful for limiting crawl scope on large sites. Note: maxPages always takes priority. Ignored when sitemap=true.
  * @param {number} [options.maxPages=2] - Maximum number of pages to crawl (1-100)
  * @param {boolean} [options.sameDomainOnly=true] - Whether to only crawl pages from the same domain
  * @param {boolean} [options.sitemap] - Whether to use sitemap for better page discovery
@@ -68,6 +69,7 @@ export async function crawl(
   const {
     cacheWebsite = true,
     depth = 2,
+    breadth = null,
     maxPages = 2,
     sameDomainOnly = true,
     sitemap = false,
@@ -87,6 +89,10 @@ export async function crawl(
     render_heavy_js: renderHeavyJs,
   };
 
+  if (breadth !== null && breadth !== undefined) {
+    payload.breadth = breadth;
+  }
+
   if (stealth) {
     payload.stealth = stealth;
   }
diff --git a/scrapegraph-py/scrapegraph_py/async_client.py b/scrapegraph-py/scrapegraph_py/async_client.py
index 4491482..22331b0 100644
--- a/scrapegraph-py/scrapegraph_py/async_client.py
+++ b/scrapegraph-py/scrapegraph_py/async_client.py
@@ -855,6 +855,7 @@ async def crawl(
         extraction_mode: bool = True,
         cache_website: bool = True,
         depth: int = 2,
+        breadth: Optional[int] = None,
         max_pages: int = 2,
         same_domain_only: bool = True,
         batch_size: Optional[int] = None,
@@ -877,6 +878,9 @@ async def crawl(
             extraction_mode: Whether to use AI extraction (True) or markdown (False)
             cache_website: Whether to cache the website
             depth: Maximum depth of link traversal
+            breadth: Maximum number of links to crawl per depth level. If None, unlimited (default).
+                Controls the 'width' of exploration at each depth. Useful for limiting crawl scope
+                on large sites. Note: max_pages always takes priority. Ignored when sitemap=True.
             max_pages: Maximum number of pages to crawl
             same_domain_only: Only crawl pages within the same domain
             batch_size: Number of pages to process in batch
@@ -905,6 +909,8 @@ async def crawl(
         )
         logger.debug(f"πŸ’Ύ Cache website: {cache_website}")
         logger.debug(f"πŸ” Depth: {depth}")
+        if breadth is not None:
+            logger.debug(f"πŸ“ Breadth: {breadth}")
         logger.debug(f"πŸ“„ Max pages: {max_pages}")
         logger.debug(f"🏠 Same domain only: {same_domain_only}")
         logger.debug(f"πŸ—ΊοΈ Use sitemap: {sitemap}")
@@ -941,6 +947,8 @@ async def crawl(
             request_data["prompt"] = prompt
         if data_schema is not None:
             request_data["data_schema"] = data_schema
+        if breadth is not None:
+            request_data["breadth"] = breadth
         if batch_size is not None:
             request_data["batch_size"] = batch_size
         if headers is not None:
diff --git a/scrapegraph-py/scrapegraph_py/client.py b/scrapegraph-py/scrapegraph_py/client.py
index 5a04311..c47ba82 100644
--- a/scrapegraph-py/scrapegraph_py/client.py
+++ b/scrapegraph-py/scrapegraph_py/client.py
@@ -865,6 +865,7 @@ def crawl(
         extraction_mode: bool = True,
         cache_website: bool = True,
         depth: int = 2,
+        breadth: Optional[int] = None,
         max_pages: int = 2,
         same_domain_only: bool = True,
         batch_size: Optional[int] = None,
@@ -887,6 +888,9 @@ def crawl(
             extraction_mode: Whether to use AI extraction (True) or markdown (False)
             cache_website: Whether to cache the website
             depth: Maximum depth of link traversal
+            breadth: Maximum number of links to crawl per depth level. If None, unlimited (default).
+                Controls the 'width' of exploration at each depth. Useful for limiting crawl scope
+                on large sites. Note: max_pages always takes priority. Ignored when sitemap=True.
             max_pages: Maximum number of pages to crawl
             same_domain_only: Only crawl pages within the same domain
             batch_size: Number of pages to process in batch
@@ -915,6 +919,8 @@ def crawl(
         )
         logger.debug(f"πŸ’Ύ Cache website: {cache_website}")
         logger.debug(f"πŸ” Depth: {depth}")
+        if breadth is not None:
+            logger.debug(f"πŸ“ Breadth: {breadth}")
         logger.debug(f"πŸ“„ Max pages: {max_pages}")
         logger.debug(f"🏠 Same domain only: {same_domain_only}")
         logger.debug(f"πŸ—ΊοΈ Use sitemap: {sitemap}")
@@ -951,6 +957,8 @@ def crawl(
             request_data["prompt"] = prompt
         if data_schema is not None:
             request_data["data_schema"] = data_schema
+        if breadth is not None:
+            request_data["breadth"] = breadth
         if batch_size is not None:
             request_data["batch_size"] = batch_size
         if headers is not None:
diff --git a/scrapegraph-py/scrapegraph_py/models/crawl.py b/scrapegraph-py/scrapegraph_py/models/crawl.py
index 7f4638f..e515b80 100644
--- a/scrapegraph-py/scrapegraph_py/models/crawl.py
+++ b/scrapegraph-py/scrapegraph_py/models/crawl.py
@@ -54,6 +54,14 @@ class CrawlRequest(BaseModel):
     depth: conint(ge=1, le=10) = Field(
         default=2, description="Maximum depth of the crawl (1-10)"
     )
+    breadth: Optional[conint(ge=1)] = Field(
+        default=None,
+        description="Maximum number of links to crawl per depth level. "
+        "If None, unlimited (default). Controls the 'width' of exploration at each depth. "
+        "Useful for limiting crawl scope on large sites. Note: max_pages always takes priority - "
+        "the total crawled pages will never exceed max_pages regardless of breadth setting. "
+        "Ignored when sitemap=True (sitemap mode uses sitemap URLs directly instead of link discovery).",
+    )
     max_pages: conint(ge=1, le=100) = Field(
         default=2, description="Maximum number of pages to crawl (1-100)"
     )
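
Usage sketch (not part of the patch): a minimal call showing the new breadth parameter through the Python sync client. Only breadth, depth, max_pages, same_domain_only, and sitemap semantics come from this diff; the Client constructor and the url/prompt argument names are assumptions based on the existing scrapegraph_py crawl API.

from scrapegraph_py import Client

client = Client(api_key="your-api-key")  # assumed constructor from the existing SDK

# Follow at most 3 links per depth level, two levels deep.
# max_pages still takes priority: the crawl never exceeds 10 pages in total,
# and breadth would be ignored entirely if sitemap=True.
result = client.crawl(
    url="https://example.com",                    # assumed existing parameter
    prompt="Extract page titles and summaries",   # assumed existing parameter
    depth=2,
    breadth=3,          # new in this patch; None (the default) means unlimited
    max_pages=10,
    same_domain_only=True,
    sitemap=False,
)
print(result)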