From a774f8da28c76857fb236a7ae6fb5a525d853656 Mon Sep 17 00:00:00 2001
From: "dylan.min"
Date: Sun, 4 Jan 2026 09:24:16 +0800
Subject: [PATCH 1/3] feat(docker): optimize concurrency performance and memory management

This commit consolidates several optimizations for crawl4ai in high-concurrency environments:

1. Browser Pool Optimization:
   - Implemented a tiered browser pool (Hot, Cold, Retired).
   - Added a browser retirement mechanism based on usage count (MAX_USAGE_COUNT) and memory pressure (MEMORY_RETIRE_THRESHOLD).
   - Added reference counting (active_requests) to ensure browser instances are not closed while in use.
   - Enhanced the pool janitor with adaptive cleanup intervals based on system memory.

2. Resource Loading Optimization:
   - Integrated optional CSS and Ad blocking to reduce memory footprint and improve QPS.
   - Decoupled resource filtering from text_mode to allow granular control.

3. Stability and Scalability:
   - Added mandatory release_crawler calls in API/Server handlers to prevent resource leaks.
   - Introduced environment variables to toggle these new features (defaulting to False for safe community adoption).
   - Added optional 5-minute pool audit logs for better observability.

Co-authored-by: dylan.min
---
 crawl4ai/async_configs.py     |  10 ++
 crawl4ai/browser_manager.py   |  32 +++++-
 deploy/docker/api.py          |  53 +++++-----
 deploy/docker/crawler_pool.py | 189 +++++++++++++++++++++++-----------
 deploy/docker/server.py       |  46 +++++----
 5 files changed, 217 insertions(+), 113 deletions(-)

diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py
index 10cc48d08..3d683a06c 100644
--- a/crawl4ai/async_configs.py
+++ b/crawl4ai/async_configs.py
@@ -418,6 +418,8 @@ class BrowserConfig:
             Default: [].
         enable_stealth (bool): If True, applies playwright-stealth to bypass basic bot detection.
             Cannot be used with use_undetected browser mode. Default: False.
+        avoid_ads (bool): If True, attempts to block ad-related network requests. Default: False.
+        avoid_css (bool): If True, blocks loading of CSS files for faster and leaner crawling. Default: False.
""" def __init__( @@ -459,6 +461,8 @@ def __init__( debugging_port: int = 9222, host: str = "localhost", enable_stealth: bool = False, + avoid_ads: bool = False, + avoid_css: bool = False, ): self.browser_type = browser_type @@ -514,6 +518,8 @@ def __init__( self.debugging_port = debugging_port self.host = host self.enable_stealth = enable_stealth + self.avoid_ads = avoid_ads + self.avoid_css = avoid_css fa_user_agenr_generator = ValidUAGenerator() if self.user_agent_mode == "random": @@ -589,6 +595,8 @@ def from_kwargs(kwargs: dict) -> "BrowserConfig": debugging_port=kwargs.get("debugging_port", 9222), host=kwargs.get("host", "localhost"), enable_stealth=kwargs.get("enable_stealth", False), + avoid_ads=kwargs.get("avoid_ads", False), + avoid_css=kwargs.get("avoid_css", False), ) def to_dict(self): @@ -624,6 +632,8 @@ def to_dict(self): "debugging_port": self.debugging_port, "host": self.host, "enable_stealth": self.enable_stealth, + "avoid_ads": self.avoid_ads, + "avoid_css": self.avoid_css, } diff --git a/crawl4ai/browser_manager.py b/crawl4ai/browser_manager.py index 3ca96aed4..f635b2959 100644 --- a/crawl4ai/browser_manager.py +++ b/crawl4ai/browser_manager.py @@ -897,7 +897,7 @@ async def create_browser_context(self, crawlerRunConfig: CrawlerRunConfig = None "otf", "eot", # Styles - # 'css', 'less', 'scss', 'sass', + "css", "less", "scss", "sass", # Media "mp4", "webm", @@ -933,6 +933,26 @@ async def create_browser_context(self, crawlerRunConfig: CrawlerRunConfig = None "wasm", ] + # Ad and Tracker patterns + ad_tracker_patterns = [ + "**/google-analytics.com/**", + "**/googletagmanager.com/**", + "**/googlesyndication.com/**", + "**/doubleclick.net/**", + "**/adservice.google.com/**", + "**/adsystem.com/**", + "**/adzerk.net/**", + "**/adnxs.com/**", + "**/ads.linkedin.com/**", + "**/facebook.net/**", + "**/analytics.twitter.com/**", + "**/t.co/**", + "**/hotjar.com/**", + "**/clarity.ms/**", + "**/scorecardresearch.com/**", + "**/pixel.wp.com/**", + ] + # Common context settings context_settings = { "user_agent": user_agent, @@ -986,11 +1006,17 @@ async def create_browser_context(self, crawlerRunConfig: CrawlerRunConfig = None # Create and return the context with all settings context = await self.browser.new_context(**context_settings) - # Apply text mode settings if enabled - if self.config.text_mode: + # Apply resource filtering based on config + if self.config.avoid_css or self.config.text_mode: # Create and apply route patterns for each extension for ext in blocked_extensions: await context.route(f"**/*.{ext}", lambda route: route.abort()) + + if self.config.avoid_ads: + # Apply ad/tracker blocking + for pattern in ad_tracker_patterns: + await context.route(pattern, lambda route: route.abort()) + return context def _make_config_signature(self, crawlerRunConfig: CrawlerRunConfig) -> str: diff --git a/deploy/docker/api.py b/deploy/docker/api.py index 81cd312ab..5892b8019 100644 --- a/deploy/docker/api.py +++ b/deploy/docker/api.py @@ -67,7 +67,8 @@ async def handle_llm_qa( config: dict ) -> str: """Process QA using LLM with crawled content as context.""" - from crawler_pool import get_crawler + from crawler_pool import get_crawler, release_crawler + crawler = None try: if not url.startswith(('http://', 'https://')) and not url.startswith(("raw:", "raw://")): url = 'https://' + url @@ -121,6 +122,9 @@ async def handle_llm_qa( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e) ) + finally: + if crawler: + await release_crawler(crawler) async def process_llm_extraction( 
redis: aioredis.Redis, @@ -249,6 +253,7 @@ async def handle_markdown_request( base_url: Optional[str] = None ) -> str: """Handle markdown generation requests.""" + crawler = None try: # Validate provider if using LLM filter if filter_type == FilterType.LLM: @@ -282,7 +287,7 @@ async def handle_markdown_request( cache_mode = CacheMode.ENABLED if cache == "1" else CacheMode.WRITE_ONLY - from crawler_pool import get_crawler + from crawler_pool import get_crawler, release_crawler from utils import load_config as _load_config _cfg = _load_config() browser_cfg = BrowserConfig( @@ -315,6 +320,9 @@ async def handle_markdown_request( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e) ) + finally: + if crawler: + await release_crawler(crawler) async def handle_llm_request( redis: aioredis.Redis, @@ -481,6 +489,7 @@ async def stream_results(crawler: AsyncWebCrawler, results_gen: AsyncGenerator) """Stream results with heartbeats and completion markers.""" import json from utils import datetime_handler + from crawler_pool import release_crawler try: async for result in results_gen: @@ -507,11 +516,8 @@ async def stream_results(crawler: AsyncWebCrawler, results_gen: AsyncGenerator) except asyncio.CancelledError: logger.warning("Client disconnected during streaming") finally: - # try: - # await crawler.close() - # except Exception as e: - # logger.error(f"Crawler cleanup error: {e}") - pass + if crawler: + await release_crawler(crawler) async def handle_crawl_request( urls: List[str], @@ -523,6 +529,7 @@ async def handle_crawl_request( """Handle non-streaming crawl requests with optional hooks.""" # Track request start request_id = f"req_{uuid4().hex[:8]}" + crawler = None try: from monitor import get_monitor await get_monitor().track_request_start( @@ -549,12 +556,9 @@ async def handle_crawl_request( ) if config["crawler"]["rate_limiter"]["enabled"] else None ) - from crawler_pool import get_crawler + from crawler_pool import get_crawler, release_crawler crawler = await get_crawler(browser_config) - # crawler: AsyncWebCrawler = AsyncWebCrawler(config=browser_config) - # await crawler.start() - # Attach hooks if provided hooks_status = {} if hooks_config: @@ -589,8 +593,6 @@ async def handle_crawl_request( if not isinstance(results, list): results = [results] - # await crawler.close() - end_mem_mb = _get_memory_mb() # <--- Get memory after end_time = time.time() @@ -689,13 +691,6 @@ async def handle_crawl_request( except: pass - if 'crawler' in locals() and crawler.ready: # Check if crawler was initialized and started - # try: - # await crawler.close() - # except Exception as close_e: - # logger.error(f"Error closing crawler during exception handling: {close_e}") - logger.error(f"Error closing crawler during exception handling: {str(e)}") - # Measure memory even on error if possible end_mem_mb_error = _get_memory_mb() if start_mem_mb is not None and end_mem_mb_error is not None: @@ -709,6 +704,9 @@ async def handle_crawl_request( "server_peak_memory_mb": max(peak_mem_mb if peak_mem_mb else 0, end_mem_mb_error or 0) }) ) + finally: + if crawler: + await release_crawler(crawler) async def handle_stream_crawl_request( urls: List[str], @@ -719,6 +717,7 @@ async def handle_stream_crawl_request( ) -> Tuple[AsyncWebCrawler, AsyncGenerator, Optional[Dict]]: """Handle streaming crawl requests with optional hooks.""" hooks_info = None + crawler = None try: browser_config = BrowserConfig.load(browser_config) # browser_config.verbose = True # Set to False or remove for production stress testing @@ 
-734,7 +733,7 @@ async def handle_stream_crawl_request( ) ) - from crawler_pool import get_crawler + from crawler_pool import get_crawler, release_crawler crawler = await get_crawler(browser_config) # crawler = AsyncWebCrawler(config=browser_config) @@ -763,13 +762,9 @@ async def handle_stream_crawl_request( return crawler, results_gen, hooks_info except Exception as e: - # Make sure to close crawler if started during an error here - if 'crawler' in locals() and crawler.ready: - # try: - # await crawler.close() - # except Exception as close_e: - # logger.error(f"Error closing crawler during stream setup exception: {close_e}") - logger.error(f"Error closing crawler during stream setup exception: {str(e)}") + # Make sure to release crawler if started during an error here + if crawler: + await release_crawler(crawler) logger.error(f"Stream crawl error: {str(e)}", exc_info=True) # Raising HTTPException here will prevent streaming response raise HTTPException( @@ -852,4 +847,4 @@ async def _runner(): ) background_tasks.add_task(_runner) - return {"task_id": task_id} \ No newline at end of file + return {"task_id": task_id} diff --git a/deploy/docker/crawler_pool.py b/deploy/docker/crawler_pool.py index 509cbba92..810a0e7c9 100644 --- a/deploy/docker/crawler_pool.py +++ b/deploy/docker/crawler_pool.py @@ -1,5 +1,6 @@ # crawler_pool.py - Smart browser pool with tiered management -import asyncio, json, hashlib, time +import asyncio, json, hashlib, time, os +import psutil from contextlib import suppress from typing import Dict, Optional from crawl4ai import AsyncWebCrawler, BrowserConfig @@ -13,6 +14,7 @@ PERMANENT: Optional[AsyncWebCrawler] = None # Always-ready default browser HOT_POOL: Dict[str, AsyncWebCrawler] = {} # Frequent configs COLD_POOL: Dict[str, AsyncWebCrawler] = {} # Rare configs +RETIRED_POOL: Dict[str, AsyncWebCrawler] = {} # Browsers marked for retirement LAST_USED: Dict[str, float] = {} USAGE_COUNT: Dict[str, int] = {} LOCK = asyncio.Lock() @@ -22,6 +24,15 @@ BASE_IDLE_TTL = CONFIG.get("crawler", {}).get("pool", {}).get("idle_ttl_sec", 300) DEFAULT_CONFIG_SIG = None # Cached sig for default config +# Retirement Config (from env) +RETIREMENT_ENABLED = os.getenv("CRAWL4AI_BROWSER_RETIREMENT_ENABLED", "false").lower() == "true" +POOL_AUDIT_ENABLED = os.getenv("CRAWL4AI_POOL_AUDIT_ENABLED", "false").lower() == "true" +PERMANENT_BROWSER_DISABLED = os.getenv("CRAWL4AI_PERMANENT_BROWSER_DISABLED", "false").lower() == "true" + +MAX_USAGE_COUNT = int(os.getenv("CRAWL4AI_BROWSER_MAX_USAGE", "100")) +MEMORY_RETIRE_THRESHOLD = int(os.getenv("CRAWL4AI_MEMORY_RETIRE_THRESHOLD", "75")) +MEMORY_RETIRE_MIN_USAGE = int(os.getenv("CRAWL4AI_MEMORY_RETIRE_MIN_USAGE", "10")) + def _sig(cfg: BrowserConfig) -> str: """Generate config signature.""" payload = json.dumps(cfg.to_dict(), sort_keys=True, separators=(",",":")) @@ -35,21 +46,45 @@ async def get_crawler(cfg: BrowserConfig) -> AsyncWebCrawler: """Get crawler from pool with tiered strategy.""" sig = _sig(cfg) async with LOCK: - # Check permanent browser for default config - if PERMANENT and _is_default_config(sig): + # Use permanent browser if not disabled and config matches + if not PERMANENT_BROWSER_DISABLED and PERMANENT and _is_default_config(sig): LAST_USED[sig] = time.time() USAGE_COUNT[sig] = USAGE_COUNT.get(sig, 0) + 1 - logger.info("πŸ”₯ Using permanent browser") + # logger.info("πŸ”₯ Using permanent browser") return PERMANENT # Check hot pool if sig in HOT_POOL: - LAST_USED[sig] = time.time() - USAGE_COUNT[sig] = USAGE_COUNT.get(sig, 0) 
+ 1 - logger.info(f"♨️ Using hot pool browser (sig={sig[:8]})") - return HOT_POOL[sig] - - # Check cold pool (promote to hot if used 3+ times) + crawler = HOT_POOL[sig] + usage = USAGE_COUNT.get(sig, 0) + + if not hasattr(crawler, 'active_requests'): + crawler.active_requests = 0 + + should_retire = False + if RETIREMENT_ENABLED: + if usage >= MAX_USAGE_COUNT: + should_retire = True + logger.info(f"πŸ‘΄ Retirement time for browser {sig[:8]}: Max usage reached ({usage})") + elif usage >= MEMORY_RETIRE_MIN_USAGE: + try: + mem_percent = psutil.virtual_memory().percent + if mem_percent > MEMORY_RETIRE_THRESHOLD: + should_retire = True + logger.info(f"πŸ‘΄ Retirement time for browser {sig[:8]}: Memory high ({mem_percent}%)") + except Exception as e: + logger.warning(f"Failed to check memory for retirement: {e}") + + if should_retire: + RETIRED_POOL[sig] = HOT_POOL.pop(sig) + else: + LAST_USED[sig] = time.time() + USAGE_COUNT[sig] = usage + 1 + crawler.active_requests += 1 + logger.info(f"♨️ Using hot pool browser (sig={sig[:8]}, usage={USAGE_COUNT[sig]}, active={crawler.active_requests})") + return crawler + + # Check cold pool if sig in COLD_POOL: LAST_USED[sig] = time.time() USAGE_COUNT[sig] = USAGE_COUNT.get(sig, 0) + 1 @@ -57,18 +92,18 @@ async def get_crawler(cfg: BrowserConfig) -> AsyncWebCrawler: if USAGE_COUNT[sig] >= 3: logger.info(f"⬆️ Promoting to hot pool (sig={sig[:8]}, count={USAGE_COUNT[sig]})") HOT_POOL[sig] = COLD_POOL.pop(sig) - - # Track promotion in monitor - try: - from monitor import get_monitor - await get_monitor().track_janitor_event("promote", sig, {"count": USAGE_COUNT[sig]}) - except: - pass - - return HOT_POOL[sig] + crawler = HOT_POOL[sig] + if not hasattr(crawler, 'active_requests'): + crawler.active_requests = 0 + crawler.active_requests += 1 + return crawler logger.info(f"❄️ Using cold pool browser (sig={sig[:8]})") - return COLD_POOL[sig] + crawler = COLD_POOL[sig] + if not hasattr(crawler, 'active_requests'): + crawler.active_requests = 0 + crawler.active_requests += 1 + return crawler # Memory check before creating new mem_pct = get_container_memory_percent() @@ -80,18 +115,36 @@ async def get_crawler(cfg: BrowserConfig) -> AsyncWebCrawler: logger.info(f"πŸ†• Creating new browser in cold pool (sig={sig[:8]}, mem={mem_pct:.1f}%)") crawler = AsyncWebCrawler(config=cfg, thread_safe=False) await crawler.start() + crawler.active_requests = 1 COLD_POOL[sig] = crawler LAST_USED[sig] = time.time() USAGE_COUNT[sig] = 1 return crawler +async def release_crawler(crawler: AsyncWebCrawler): + """Decrement active request count for a crawler.""" + async with LOCK: + if hasattr(crawler, 'active_requests'): + crawler.active_requests -= 1 + if crawler.active_requests < 0: + crawler.active_requests = 0 + async def init_permanent(cfg: BrowserConfig): """Initialize permanent default browser.""" global PERMANENT, DEFAULT_CONFIG_SIG + + # Log retirement status once on startup + if RETIREMENT_ENABLED: + logger.info(f"βœ… Browser retirement enabled (Max Usage: {MAX_USAGE_COUNT}, Mem Threshold: {MEMORY_RETIRE_THRESHOLD}%)") + else: + logger.info("ℹ️ Browser retirement disabled") + async with LOCK: - if PERMANENT: - return DEFAULT_CONFIG_SIG = _sig(cfg) + if PERMANENT_BROWSER_DISABLED: + logger.info("ℹ️ Permanent browser is DISABLED via config") + return + logger.info("πŸ”₯ Creating permanent default browser") PERMANENT = AsyncWebCrawler(config=cfg, thread_safe=False) await PERMANENT.start() @@ -102,69 +155,87 @@ async def close_all(): """Close all browsers.""" async with LOCK: tasks = 
[] - if PERMANENT: - tasks.append(PERMANENT.close()) tasks.extend([c.close() for c in HOT_POOL.values()]) tasks.extend([c.close() for c in COLD_POOL.values()]) + tasks.extend([c.close() for c in RETIRED_POOL.values()]) await asyncio.gather(*tasks, return_exceptions=True) HOT_POOL.clear() COLD_POOL.clear() + RETIRED_POOL.clear() LAST_USED.clear() USAGE_COUNT.clear() async def janitor(): """Adaptive cleanup based on memory pressure.""" + last_audit_time = 0 while True: mem_pct = get_container_memory_percent() # Adaptive intervals and TTLs + # δΈ₯格遡εΎͺ BASE_IDLE_TTLοΌŒδΈε†εš hot_ttl = ttl * 2 ηš„ζ”Ύε€§ if mem_pct > 80: - interval, cold_ttl, hot_ttl = 10, 30, 120 + interval, cold_ttl, hot_ttl = 10, 30, 60 elif mem_pct > 60: - interval, cold_ttl, hot_ttl = 30, 60, 300 + interval, cold_ttl, hot_ttl = 30, 60, 120 else: - interval, cold_ttl, hot_ttl = 60, BASE_IDLE_TTL, BASE_IDLE_TTL * 2 + interval, cold_ttl, hot_ttl = 60, BASE_IDLE_TTL, BASE_IDLE_TTL await asyncio.sleep(interval) now = time.time() async with LOCK: + # [Audit Log] Every 5 minutes + if POOL_AUDIT_ENABLED and now - last_audit_time >= 300: + def _pool_info(pool): + res = [] + for s, c in pool.items(): + req = getattr(c, 'active_requests', 0) + u_count = USAGE_COUNT.get(s, 0) + res.append(f"{s[:8]}(req={req}, usage={u_count})") + return res + + logger.info( + f"🧐 [Pool Audit]\n" + f" - PERMANENT: {'Active' if PERMANENT else 'None/Disabled'}\n" + f" - HOT_POOL: {len(HOT_POOL)} {_pool_info(HOT_POOL)}\n" + f" - COLD_POOL: {len(COLD_POOL)} {_pool_info(COLD_POOL)}\n" + f" - RETIRED_POOL: {len(RETIRED_POOL)} {_pool_info(RETIRED_POOL)}\n" + f" - System Memory: {mem_pct:.1f}%" + ) + last_audit_time = now + # Clean cold pool for sig in list(COLD_POOL.keys()): if now - LAST_USED.get(sig, now) > cold_ttl: - idle_time = now - LAST_USED[sig] - logger.info(f"🧹 Closing cold browser (sig={sig[:8]}, idle={idle_time:.0f}s)") - with suppress(Exception): - await COLD_POOL[sig].close() - COLD_POOL.pop(sig, None) - LAST_USED.pop(sig, None) - USAGE_COUNT.pop(sig, None) - - # Track in monitor - try: - from monitor import get_monitor - await get_monitor().track_janitor_event("close_cold", sig, {"idle_seconds": int(idle_time), "ttl": cold_ttl}) - except: - pass - - # Clean hot pool (more conservative) + crawler = COLD_POOL[sig] + if not hasattr(crawler, 'active_requests') or crawler.active_requests == 0: + logger.info(f"🧹 Closing cold browser (idle, sig={sig[:8]})") + with suppress(Exception): + await crawler.close() + COLD_POOL.pop(sig, None) + LAST_USED.pop(sig, None) + USAGE_COUNT.pop(sig, None) + + # Clean hot pool for sig in list(HOT_POOL.keys()): if now - LAST_USED.get(sig, now) > hot_ttl: - idle_time = now - LAST_USED[sig] - logger.info(f"🧹 Closing hot browser (sig={sig[:8]}, idle={idle_time:.0f}s)") + crawler = HOT_POOL[sig] + if not hasattr(crawler, 'active_requests') or crawler.active_requests == 0: + logger.info(f"🧹 Closing hot browser (idle={now - LAST_USED[sig]:.0f}s, sig={sig[:8]})") + with suppress(Exception): + await crawler.close() + HOT_POOL.pop(sig, None) + LAST_USED.pop(sig, None) + USAGE_COUNT.pop(sig, None) + + # Clean retired pool + for sig in list(RETIRED_POOL.keys()): + crawler = RETIRED_POOL[sig] + if hasattr(crawler, 'active_requests') and crawler.active_requests == 0: + logger.info(f"πŸ’€ Janitor closing retired browser (sig={sig[:8]})") with suppress(Exception): - await HOT_POOL[sig].close() - HOT_POOL.pop(sig, None) - LAST_USED.pop(sig, None) - USAGE_COUNT.pop(sig, None) + await crawler.close() + RETIRED_POOL.pop(sig, None) - # 
Track in monitor - try: - from monitor import get_monitor - await get_monitor().track_janitor_event("close_hot", sig, {"idle_seconds": int(idle_time), "ttl": hot_ttl}) - except: - pass - - # Log pool stats - if mem_pct > 60: - logger.info(f"πŸ“Š Pool: hot={len(HOT_POOL)}, cold={len(COLD_POOL)}, mem={mem_pct:.1f}%") + if mem_pct > 60 or len(RETIRED_POOL) > 0: + logger.info(f"πŸ“Š Pool: hot={len(HOT_POOL)}, cold={len(COLD_POOL)}, retired={len(RETIRED_POOL)}, mem={mem_pct:.1f}%") diff --git a/deploy/docker/server.py b/deploy/docker/server.py index 62e4e4413..353d99353 100644 --- a/deploy/docker/server.py +++ b/deploy/docker/server.py @@ -7,7 +7,7 @@ """ # ── stdlib & 3rd‑party imports ─────────────────────────────── -from crawler_pool import get_crawler, close_all, janitor +from crawler_pool import get_crawler, release_crawler, close_all, janitor from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig from auth import create_access_token, get_token_dependency, TokenRequest from pydantic import BaseModel @@ -337,8 +337,9 @@ async def generate_html( Crawls the URL, preprocesses the raw HTML for schema extraction, and returns the processed HTML. Use when you need sanitized HTML structures for building schemas or further processing. """ - from crawler_pool import get_crawler - cfg = CrawlerRunConfig() + from crawler_pool import get_crawler, release_crawler + cfg = get_default_crawler_config() + crawler = None try: crawler = await get_crawler(get_default_browser_config()) results = await crawler.arun(url=body.url, config=cfg) @@ -351,6 +352,9 @@ async def generate_html( return JSONResponse({"html": processed_html, "url": body.url, "success": True}) except Exception as e: raise HTTPException(500, detail=str(e)) + finally: + if crawler: + await release_crawler(crawler) # Screenshot endpoint @@ -368,7 +372,8 @@ async def generate_screenshot( Use when you need an image snapshot of the rendered page. Its recommened to provide an output path to save the screenshot. Then in result instead of the screenshot you will get a path to the saved file. """ - from crawler_pool import get_crawler + from crawler_pool import get_crawler, release_crawler + crawler = None try: cfg = CrawlerRunConfig(screenshot=True, screenshot_wait_for=body.screenshot_wait_for) crawler = await get_crawler(get_default_browser_config()) @@ -385,6 +390,9 @@ async def generate_screenshot( return {"success": True, "screenshot": screenshot_data} except Exception as e: raise HTTPException(500, detail=str(e)) + finally: + if crawler: + await release_crawler(crawler) # PDF endpoint @@ -402,7 +410,8 @@ async def generate_pdf( Use when you need a printable or archivable snapshot of the page. It is recommended to provide an output path to save the PDF. Then in result instead of the PDF you will get a path to the saved file. 
""" - from crawler_pool import get_crawler + from crawler_pool import get_crawler, release_crawler + crawler = None try: cfg = CrawlerRunConfig(pdf=True) crawler = await get_crawler(get_default_browser_config()) @@ -419,6 +428,9 @@ async def generate_pdf( return {"success": True, "pdf": base64.b64encode(pdf_data).decode()} except Exception as e: raise HTTPException(500, detail=str(e)) + finally: + if crawler: + await release_crawler(crawler) @app.post("/execute_js") @@ -457,24 +469,11 @@ class CrawlResult(BaseModel): metadata: Optional[dict] = None error_message: Optional[str] = None session_id: Optional[str] = None - response_headers: Optional[dict] = None - status_code: Optional[int] = None - ssl_certificate: Optional[SSLCertificate] = None - dispatch_result: Optional[DispatchResult] = None - redirected_url: Optional[str] = None - network_requests: Optional[List[Dict[str, Any]]] = None - console_messages: Optional[List[Dict[str, Any]]] = None - - class MarkdownGenerationResult(BaseModel): - raw_markdown: str - markdown_with_citations: str - references_markdown: str - fit_markdown: Optional[str] = None - fit_html: Optional[str] = None + # ... ``` - """ - from crawler_pool import get_crawler + from crawler_pool import get_crawler, release_crawler + crawler = None try: cfg = CrawlerRunConfig(js_code=body.scripts) crawler = await get_crawler(get_default_browser_config()) @@ -485,6 +484,9 @@ class MarkdownGenerationResult(BaseModel): return JSONResponse(data) except Exception as e: raise HTTPException(500, detail=str(e)) + finally: + if crawler: + await release_crawler(crawler) @app.get("/llm/{url:path}") @@ -806,7 +808,7 @@ async def get_context( base_url=f"http://{config['app']['host']}:{config['app']['port']}" ) -# ────────────────────────── cli ────────────────────────────── +# ── cli ────────────────────────────── if __name__ == "__main__": import uvicorn uvicorn.run( From 47bc68890af1cd93cd25f73c2b5030ed32f16157 Mon Sep 17 00:00:00 2001 From: "dylan.min" Date: Tue, 6 Jan 2026 13:12:51 +0800 Subject: [PATCH 2/3] feat: optimize resource filtering, enhance ad/tracker blocking and cleanup docs - Refactor BrowserManager to dynamically block resources based on avoid_css and text_mode - Align text_mode behavior with community standards (no forced CSS blocking) - Add Top 20 curated ad and tracker patterns for performance - Restore and translate permanent browser logs in crawler_pool.py - Clean up models.py schema annotations and server.py docstrings - Add unit and functional tests for filtering flags --- crawl4ai/browser_manager.py | 97 +++++++----------------- crawl4ai/models.py | 2 +- deploy/docker/crawler_pool.py | 4 +- tests/browser/test_resource_filtering.py | 55 ++++++++++++++ tests/general/test_cache_context.py | 2 + tests/unit/test_config_flags.py | 33 ++++++++ 6 files changed, 119 insertions(+), 74 deletions(-) create mode 100644 tests/browser/test_resource_filtering.py create mode 100644 tests/unit/test_config_flags.py diff --git a/crawl4ai/browser_manager.py b/crawl4ai/browser_manager.py index f635b2959..600884bde 100644 --- a/crawl4ai/browser_manager.py +++ b/crawl4ai/browser_manager.py @@ -878,79 +878,29 @@ async def create_browser_context(self, crawlerRunConfig: CrawlerRunConfig = None } proxy_settings = {"server": self.config.proxy} if self.config.proxy else None - blocked_extensions = [ + # Define resource categories + css_extensions = ["css", "less", "scss", "sass"] + static_extensions = [ # Images - "jpg", - "jpeg", - "png", - "gif", - "webp", - "svg", - "ico", - "bmp", - 
"tiff", - "psd", + "jpg", "jpeg", "png", "gif", "webp", "svg", "ico", "bmp", "tiff", # Fonts - "woff", - "woff2", - "ttf", - "otf", - "eot", - # Styles - "css", "less", "scss", "sass", + "woff", "woff2", "ttf", "otf", "eot", # Media - "mp4", - "webm", - "ogg", - "avi", - "mov", - "wmv", - "flv", - "m4v", - "mp3", - "wav", - "aac", - "m4a", - "opus", - "flac", - # Documents - "pdf", - "doc", - "docx", - "xls", - "xlsx", - "ppt", - "pptx", - # Archives - "zip", - "rar", - "7z", - "tar", - "gz", - # Scripts and data - "xml", - "swf", - "wasm", + "mp4", "webm", "ogg", "mp3", "wav", "aac", "flac", + # Documents & Archives + "pdf", "doc", "docx", "xls", "xlsx", "zip", "rar", "7z", "tar", "gz", + # Other + "xml", "swf", "wasm" ] - # Ad and Tracker patterns + # Ad and Tracker patterns (Top 20 curated from uBlock sources for performance) ad_tracker_patterns = [ - "**/google-analytics.com/**", - "**/googletagmanager.com/**", - "**/googlesyndication.com/**", - "**/doubleclick.net/**", - "**/adservice.google.com/**", - "**/adsystem.com/**", - "**/adzerk.net/**", - "**/adnxs.com/**", - "**/ads.linkedin.com/**", - "**/facebook.net/**", - "**/analytics.twitter.com/**", - "**/t.co/**", - "**/hotjar.com/**", - "**/clarity.ms/**", - "**/scorecardresearch.com/**", - "**/pixel.wp.com/**", + "**/google-analytics.com/**", "**/googletagmanager.com/**", "**/googlesyndication.com/**", + "**/doubleclick.net/**", "**/adservice.google.com/**", "**/adsystem.com/**", + "**/adzerk.net/**", "**/adnxs.com/**", "**/ads.linkedin.com/**", "**/facebook.net/**", + "**/analytics.twitter.com/**", "**/t.co/**", "**/ads-twitter.com/**", + "**/hotjar.com/**", "**/clarity.ms/**", "**/scorecardresearch.com/**", "**/pixel.wp.com/**", + "**/amazon-adsystem.com/**", "**/mixpanel.com/**", "**/segment.com/**" ] # Common context settings @@ -1006,10 +956,15 @@ async def create_browser_context(self, crawlerRunConfig: CrawlerRunConfig = None # Create and return the context with all settings context = await self.browser.new_context(**context_settings) - # Apply resource filtering based on config - if self.config.avoid_css or self.config.text_mode: - # Create and apply route patterns for each extension - for ext in blocked_extensions: + # Apply resource filtering based on config (Dynamic addition) + to_block = [] + if self.config.avoid_css: + to_block += css_extensions + if self.config.text_mode: + to_block += static_extensions + + if to_block: + for ext in to_block: await context.route(f"**/*.{ext}", lambda route: route.abort()) if self.config.avoid_ads: diff --git a/crawl4ai/models.py b/crawl4ai/models.py index e46bb7fa8..a370a4b9e 100644 --- a/crawl4ai/models.py +++ b/crawl4ai/models.py @@ -151,7 +151,7 @@ class CrawlResult(BaseModel): redirected_url: Optional[str] = None network_requests: Optional[List[Dict[str, Any]]] = None console_messages: Optional[List[Dict[str, Any]]] = None - tables: List[Dict] = Field(default_factory=list) # NEW – [{headers,rows,caption,summary}] + tables: List[Dict] = Field(default_factory=list) model_config = ConfigDict(arbitrary_types_allowed=True) diff --git a/deploy/docker/crawler_pool.py b/deploy/docker/crawler_pool.py index 810a0e7c9..681022c98 100644 --- a/deploy/docker/crawler_pool.py +++ b/deploy/docker/crawler_pool.py @@ -50,7 +50,7 @@ async def get_crawler(cfg: BrowserConfig) -> AsyncWebCrawler: if not PERMANENT_BROWSER_DISABLED and PERMANENT and _is_default_config(sig): LAST_USED[sig] = time.time() USAGE_COUNT[sig] = USAGE_COUNT.get(sig, 0) + 1 - # logger.info("πŸ”₯ Using permanent browser") + 
logger.info("πŸ”₯ Using permanent browser") return PERMANENT # Check hot pool @@ -172,7 +172,7 @@ async def janitor(): mem_pct = get_container_memory_percent() # Adaptive intervals and TTLs - # δΈ₯格遡εΎͺ BASE_IDLE_TTLοΌŒδΈε†εš hot_ttl = ttl * 2 ηš„ζ”Ύε€§ + # Strictly follow BASE_IDLE_TTL without multipliers if mem_pct > 80: interval, cold_ttl, hot_ttl = 10, 30, 60 elif mem_pct > 60: diff --git a/tests/browser/test_resource_filtering.py b/tests/browser/test_resource_filtering.py new file mode 100644 index 000000000..a38782c84 --- /dev/null +++ b/tests/browser/test_resource_filtering.py @@ -0,0 +1,55 @@ +import asyncio +import os +import sys +import pytest + +# Add the project root to Python path if running directly +if __name__ == "__main__": + sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) + +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig +from crawl4ai.async_logger import AsyncLogger + +# Create a logger for clear terminal output +logger = AsyncLogger(verbose=True, log_file=None) + +@pytest.mark.asyncio +async def test_resource_filtering_launch(): + """Functional test to ensure browser launches correctly with filtering flags enabled.""" + browser_config = BrowserConfig( + headless=True, + avoid_ads=True, + avoid_css=True, + text_mode=True + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + # Simple crawl to verify functionality + result = await crawler.arun( + url="https://example.com", + config=CrawlerRunConfig(cache_mode="bypass") + ) + assert result.success + logger.success("Browser launched and crawled successfully with filtering flags") + +@pytest.mark.asyncio +async def test_avoid_css_only(): + """Test avoid_css without text_mode.""" + browser_config = BrowserConfig( + headless=True, + avoid_css=True, + text_mode=False + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun( + url="https://example.com", + config=CrawlerRunConfig(cache_mode="bypass") + ) + assert result.success + logger.success("Browser launched and crawled successfully with avoid_css only") + +if __name__ == "__main__": + asyncio.run(test_resource_filtering_launch()) + asyncio.run(test_avoid_css_only()) + diff --git a/tests/general/test_cache_context.py b/tests/general/test_cache_context.py index 0f42f9fdd..4d4049f70 100644 --- a/tests/general/test_cache_context.py +++ b/tests/general/test_cache_context.py @@ -1,7 +1,9 @@ import asyncio +import pytest from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode from playwright.async_api import Page, BrowserContext +@pytest.mark.asyncio async def test_reuse_context_by_config(): # We will store each context ID in these maps to confirm reuse context_ids_for_A = [] diff --git a/tests/unit/test_config_flags.py b/tests/unit/test_config_flags.py new file mode 100644 index 000000000..8685d2417 --- /dev/null +++ b/tests/unit/test_config_flags.py @@ -0,0 +1,33 @@ +import pytest +from crawl4ai.async_configs import BrowserConfig + +def test_browser_config_filtering_flags(): + """Test that BrowserConfig correctly stores the new filtering flags.""" + # Default values + config = BrowserConfig() + assert config.avoid_ads is False + assert config.avoid_css is False + + # Custom values + config = BrowserConfig(avoid_ads=True, avoid_css=True) + assert config.avoid_ads is True + assert config.avoid_css is True + + # Check to_dict / from_kwargs parity + config_dict = config.to_dict() + assert config_dict["avoid_ads"] is True + assert 
config_dict["avoid_css"] is True + + new_config = BrowserConfig.from_kwargs(config_dict) + assert new_config.avoid_ads is True + assert new_config.avoid_css is True + +def test_browser_config_clone(): + """Test that cloning BrowserConfig preserves the new flags.""" + config = BrowserConfig(avoid_ads=True, avoid_css=False) + cloned = config.clone(avoid_css=True) + + assert cloned.avoid_ads is True + assert cloned.avoid_css is True + assert config.avoid_css is False # Original remains unchanged + From 340013da1a61bd746ea0bf6cbcd78bbb73a327ac Mon Sep 17 00:00:00 2001 From: "dylan.min" Date: Wed, 7 Jan 2026 09:31:13 +0800 Subject: [PATCH 3/3] review: update code style --- crawl4ai/browser_manager.py | 54 ++++++++++++++++++++++++++++++++----- crawl4ai/models.py | 2 +- 2 files changed, 48 insertions(+), 8 deletions(-) diff --git a/crawl4ai/browser_manager.py b/crawl4ai/browser_manager.py index 600884bde..9f2efd3d6 100644 --- a/crawl4ai/browser_manager.py +++ b/crawl4ai/browser_manager.py @@ -882,15 +882,55 @@ async def create_browser_context(self, crawlerRunConfig: CrawlerRunConfig = None css_extensions = ["css", "less", "scss", "sass"] static_extensions = [ # Images - "jpg", "jpeg", "png", "gif", "webp", "svg", "ico", "bmp", "tiff", + "jpg", + "jpeg", + "png", + "gif", + "webp", + "svg", + "ico", + "bmp", + "tiff", + "psd", # Fonts - "woff", "woff2", "ttf", "otf", "eot", + "woff", + "woff2", + "ttf", + "otf", + "eot", # Media - "mp4", "webm", "ogg", "mp3", "wav", "aac", "flac", - # Documents & Archives - "pdf", "doc", "docx", "xls", "xlsx", "zip", "rar", "7z", "tar", "gz", - # Other - "xml", "swf", "wasm" + "mp4", + "webm", + "ogg", + "avi", + "mov", + "wmv", + "flv", + "m4v", + "mp3", + "wav", + "aac", + "m4a", + "opus", + "flac", + # Documents + "pdf", + "doc", + "docx", + "xls", + "xlsx", + "ppt", + "pptx", + # Archives + "zip", + "rar", + "7z", + "tar", + "gz", + # Scripts and data + "xml", + "swf", + "wasm", ] # Ad and Tracker patterns (Top 20 curated from uBlock sources for performance) diff --git a/crawl4ai/models.py b/crawl4ai/models.py index a370a4b9e..e46bb7fa8 100644 --- a/crawl4ai/models.py +++ b/crawl4ai/models.py @@ -151,7 +151,7 @@ class CrawlResult(BaseModel): redirected_url: Optional[str] = None network_requests: Optional[List[Dict[str, Any]]] = None console_messages: Optional[List[Dict[str, Any]]] = None - tables: List[Dict] = Field(default_factory=list) + tables: List[Dict] = Field(default_factory=list) # NEW – [{headers,rows,caption,summary}] model_config = ConfigDict(arbitrary_types_allowed=True)