Skip to content

Commit 5d83c47

Browse files
committed
Try to add waits to avoid busy-waiting in cases where it makes no sense
1 parent 3befc3e commit 5d83c47

File tree

3 files changed

+9
-1
lines changed

3 files changed

+9
-1
lines changed

src/crawlee/_autoscaling/autoscaled_pool.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,9 @@ class AutoscaledPool:
6161
_TASK_TIMEOUT: timedelta | None = None
6262
"""Timeout within which the `run_task_function` must complete."""
6363

64+
_OVERLOADED_BACKOFF_TIME: timedelta = timedelta(seconds=0.5)
65+
"""When overloaded, Autoscaled pool waits this long before rechecking system status."""
66+
6467
def __init__(
6568
self,
6669
*,
@@ -238,6 +241,7 @@ async def _worker_task_orchestrator(self, run: _AutoscaledPoolRun) -> None:
238241
current_status = self._system_status.get_current_system_info()
239242
if not current_status.is_system_idle:
240243
logger.info('Not scheduling new tasks - system is overloaded')
244+
await asyncio.sleep(self._OVERLOADED_BACKOFF_TIME)
241245
elif self._is_paused:
242246
logger.info('Not scheduling new tasks - the autoscaled pool is paused')
243247
elif self.current_concurrency >= self.desired_concurrency:
@@ -261,7 +265,7 @@ async def _worker_task_orchestrator(self, run: _AutoscaledPoolRun) -> None:
261265

262266
logger.info("Just finishing")
263267

264-
except Exception as e:
268+
except BaseException as e:
265269
logger.error('What is hiding here?', exc_info=e)
266270
raise
267271

src/crawlee/crawlers/_basic/_basic_crawler.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1386,6 +1386,9 @@ async def __run_task_function(self) -> None:
13861386
)
13871387

13881388
if request is None:
1389+
# No request to process, even though the request manager is neither finished nor empty.
1390+
# All requests are locked or in progress.
1391+
await asyncio.sleep(0.2) # Small backoff time to avoid overloading the system through busy-waiting.
13891392
return
13901393

13911394
if not (await self._is_allowed_based_on_robots_txt_file(request.url)):

src/crawlee/storage_clients/_file_system/_request_queue_client.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -596,6 +596,7 @@ async def is_empty(self) -> bool:
596596
async with self._lock:
597597
# If we have a cached value, return it immediately.
598598
if self._is_empty_cache is not None:
599+
logger.info(f'From cache {self._is_empty_cache=}')
599600
return self._is_empty_cache
600601

601602
state = self._state.current_value

0 commit comments

Comments
 (0)