6 changes: 6 additions & 0 deletions scrapegraph-js/src/crawl.js
@@ -16,6 +16,7 @@ import { getMockResponse } from './utils/mockResponse.js';
* @param {boolean} [options.extractionMode=true] - true for AI extraction, false for markdown conversion (NO AI/LLM)
* @param {boolean} [options.cacheWebsite=true] - Whether to cache the website content
* @param {number} [options.depth=2] - Maximum depth of the crawl (1-10)
* @param {number|null} [options.breadth] - Maximum number of links to crawl per depth level. If null/undefined, unlimited (default). Controls the 'width' of exploration at each depth. Useful for limiting crawl scope on large sites. Note: maxPages always takes priority. Ignored when sitemap=true.
* @param {number} [options.maxPages=2] - Maximum number of pages to crawl (1-100)
* @param {boolean} [options.sameDomainOnly=true] - Whether to only crawl pages from the same domain
* @param {boolean} [options.sitemap] - Whether to use sitemap for better page discovery
@@ -68,6 +69,7 @@ export async function crawl(
const {
cacheWebsite = true,
depth = 2,
breadth = null,
maxPages = 2,
sameDomainOnly = true,
sitemap = false,
@@ -87,6 +89,10 @@
render_heavy_js: renderHeavyJs,
};

if (breadth !== null && breadth !== undefined) {
payload.breadth = breadth;
}

if (stealth) {
payload.stealth = stealth;
}
8 changes: 8 additions & 0 deletions scrapegraph-py/scrapegraph_py/async_client.py
@@ -855,6 +855,7 @@ async def crawl(
extraction_mode: bool = True,
cache_website: bool = True,
depth: int = 2,
breadth: Optional[int] = None,
max_pages: int = 2,
same_domain_only: bool = True,
batch_size: Optional[int] = None,
Expand All @@ -877,6 +878,9 @@ async def crawl(
extraction_mode: Whether to use AI extraction (True) or markdown (False)
cache_website: Whether to cache the website
depth: Maximum depth of link traversal
breadth: Maximum number of links to crawl per depth level. If None, unlimited (default).
Controls the 'width' of exploration at each depth. Useful for limiting crawl scope
on large sites. Note: max_pages always takes priority. Ignored when sitemap=True.
max_pages: Maximum number of pages to crawl
same_domain_only: Only crawl pages within the same domain
batch_size: Number of pages to process in batch
@@ -905,6 +909,8 @@
)
logger.debug(f"💾 Cache website: {cache_website}")
logger.debug(f"🔍 Depth: {depth}")
if breadth is not None:
logger.debug(f"📏 Breadth: {breadth}")
logger.debug(f"📄 Max pages: {max_pages}")
logger.debug(f"🏠 Same domain only: {same_domain_only}")
logger.debug(f"🗺️ Use sitemap: {sitemap}")
@@ -941,6 +947,8 @@
request_data["prompt"] = prompt
if data_schema is not None:
request_data["data_schema"] = data_schema
if breadth is not None:
request_data["breadth"] = breadth
if batch_size is not None:
request_data["batch_size"] = batch_size
if headers is not None:
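
For reference, a minimal usage sketch of the new breadth argument with the async client follows. It assumes the async client is exposed as AsyncClient and can be used as an async context manager; the URL, prompt, schema, and API key are placeholders, not part of this PR.

import asyncio

from scrapegraph_py import AsyncClient


async def main():
    # Placeholder API key and argument values; assumed entry point.
    async with AsyncClient(api_key="your-api-key") as client:
        result = await client.crawl(
            url="https://example.com",
            prompt="Extract page titles and short summaries",
            data_schema={"type": "object", "properties": {"title": {"type": "string"}}},
            depth=3,       # follow links up to 3 levels deep
            breadth=5,     # follow at most 5 links per depth level
            max_pages=20,  # overall cap; takes priority over breadth
        )
        print(result)


asyncio.run(main())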
8 changes: 8 additions & 0 deletions scrapegraph-py/scrapegraph_py/client.py
@@ -865,6 +865,7 @@ def crawl(
extraction_mode: bool = True,
cache_website: bool = True,
depth: int = 2,
breadth: Optional[int] = None,
max_pages: int = 2,
same_domain_only: bool = True,
batch_size: Optional[int] = None,
Expand All @@ -887,6 +888,9 @@ def crawl(
extraction_mode: Whether to use AI extraction (True) or markdown (False)
cache_website: Whether to cache the website
depth: Maximum depth of link traversal
breadth: Maximum number of links to crawl per depth level. If None, unlimited (default).
Controls the 'width' of exploration at each depth. Useful for limiting crawl scope
on large sites. Note: max_pages always takes priority. Ignored when sitemap=True.
max_pages: Maximum number of pages to crawl
same_domain_only: Only crawl pages within the same domain
batch_size: Number of pages to process in batch
@@ -915,6 +919,8 @@
)
logger.debug(f"💾 Cache website: {cache_website}")
logger.debug(f"🔍 Depth: {depth}")
if breadth is not None:
logger.debug(f"📏 Breadth: {breadth}")
logger.debug(f"📄 Max pages: {max_pages}")
logger.debug(f"🏠 Same domain only: {same_domain_only}")
logger.debug(f"🗺️ Use sitemap: {sitemap}")
@@ -951,6 +957,8 @@
request_data["prompt"] = prompt
if data_schema is not None:
request_data["data_schema"] = data_schema
if breadth is not None:
request_data["breadth"] = breadth
if batch_size is not None:
request_data["batch_size"] = batch_size
if headers is not None:
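
The synchronous client mirrors the same signature. The sketch below, with placeholder values and the sync client assumed to be exposed as Client, illustrates how breadth interacts with max_pages and sitemap as documented above.

from scrapegraph_py import Client

client = Client(api_key="your-api-key")  # placeholder key

# With depth=3 and breadth=2, at most 2 links are followed per level, but the
# crawl still stops once max_pages pages have been visited, since max_pages
# takes priority over breadth.
result = client.crawl(
    url="https://example.com",
    prompt="Extract product names and prices",
    data_schema={"type": "object", "properties": {"products": {"type": "array"}}},
    depth=3,
    breadth=2,
    max_pages=10,
)

# When sitemap=True, breadth is ignored: pages come from the sitemap rather
# than from link discovery, so there is no per-level fan-out to limit.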
8 changes: 8 additions & 0 deletions scrapegraph-py/scrapegraph_py/models/crawl.py
@@ -54,6 +54,14 @@ class CrawlRequest(BaseModel):
depth: conint(ge=1, le=10) = Field(
default=2, description="Maximum depth of the crawl (1-10)"
)
breadth: Optional[conint(ge=1)] = Field(
default=None,
description="Maximum number of links to crawl per depth level. "
"If None, unlimited (default). Controls the 'width' of exploration at each depth. "
"Useful for limiting crawl scope on large sites. Note: max_pages always takes priority - "
"the total crawled pages will never exceed max_pages regardless of breadth setting. "
"Ignored when sitemap=True (sitemap mode uses sitemap URLs directly instead of link discovery).",
)
max_pages: conint(ge=1, le=100) = Field(
default=2, description="Maximum number of pages to crawl (1-100)"
)
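
On the model side, the new field is validated by Pydantic. The sketch below assumes CrawlRequest can be constructed directly with the same field names the client methods use (url, prompt, data_schema); the exact set of required fields is an assumption, but the breadth behavior follows from the conint(ge=1) definition above.

from pydantic import ValidationError

from scrapegraph_py.models.crawl import CrawlRequest

# breadth accepts None (unlimited, the default) or any integer >= 1.
# Field names other than breadth are assumptions for illustration.
request = CrawlRequest(
    url="https://example.com",
    prompt="Extract page titles",
    data_schema={"type": "object"},
    breadth=3,
)
print(request.breadth)  # 3

try:
    CrawlRequest(url="https://example.com", prompt="x", data_schema={"type": "object"}, breadth=0)
except ValidationError as err:
    print("breadth=0 rejected:", err.errors()[0]["msg"])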