
Commit 58c8140

Revert all but executor changes

1 parent 4435c8a

File tree

12 files changed (+35, −132 lines)


.github/workflows/run_code_checks.yaml

Lines changed: 3 additions & 8 deletions

@@ -40,17 +40,12 @@ jobs:
       HTTPBIN_URL: 'asd'
 
     steps:
-      - name: Os cleanup
+      - name: macOS cleanup
        if: runner.os == 'macOS'
+        # Disable Spotlight indexing and try to kill all useless processes that could drain CPU during tests
        run: |
          sudo mdutil -i off /
-          sudo killall ecosystemanalyticsd
-          sudo killall Finder || true
-          sudo killall mds || true
-          sudo killall mds_stores || true
-          sudo killall mds_worker || true
-          sudo killall mdworker || true
-          sudo killall mdworker_shared || true
+          sudo killall Finder spindump ecosystemanalyticsd SystemUIServer NotificationCenter mds mds_stores mds_worker mdworker mdworker_shared || true
 
      - name: Checkout repository
        uses: actions/checkout@v4

Makefile

Lines changed: 1 addition & 1 deletion

@@ -31,7 +31,7 @@ type-check:
 
 unit-tests:
 	uv run pytest --numprocesses=1 -vv tests/unit -m "run_alone"
-	#uv run pytest --numprocesses=auto -vv tests/unit -m "not run_alone"
+	uv run pytest --numprocesses=auto -vv tests/unit -m "not run_alone"
 
 unit-tests-cov:
 	uv run pytest --numprocesses=1 -vv --cov=src/crawlee tests/unit -m "run_alone"
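The re-enabled second line restores the split strategy in `unit-tests`: tests marked `run_alone` run serially first, then everything else fans out across CPU cores via pytest-xdist's `--numprocesses=auto`. A minimal sketch of how such a marker can be declared and used (the registration below is an illustrative assumption, not necessarily crawlee's actual pytest config):

    # conftest.py -- register the marker so pytest does not warn about it
    def pytest_configure(config):
        config.addinivalue_line('markers', 'run_alone: test must not run alongside parallel workers')

    # test_example.py
    import pytest

    @pytest.mark.run_alone
    def test_uses_exclusive_resource():
        assert True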

src/crawlee/_autoscaling/autoscaled_pool.py

Lines changed: 12 additions & 40 deletions

@@ -46,7 +46,7 @@ class AutoscaledPool:
     _AUTOSCALE_INTERVAL = timedelta(seconds=10)
     """Interval at which the autoscaled pool adjusts the desired concurrency based on the latest system status."""
 
-    _LOGGING_INTERVAL = timedelta(seconds=0.5)
+    _LOGGING_INTERVAL = timedelta(minutes=1)
     """Interval at which the autoscaled pool logs its current state."""
 
     _DESIRED_CONCURRENCY_RATIO = 0.9
@@ -61,9 +61,6 @@ class AutoscaledPool:
     _TASK_TIMEOUT: timedelta | None = None
     """Timeout within which the `run_task_function` must complete."""
 
-    _OVERLOADED_BACKOFF_TIME: timedelta = timedelta(seconds=1)
-    """When overloaded, Autoscaled pool waits this long before rechecking system status."""
-
     def __init__(
         self,
         *,
@@ -126,33 +123,19 @@ async def run(self) -> None:
         )
 
         try:
-            logger.info(f'Await result')
             await run.result
-            logger.info(f'Finished naturally, {run.worker_tasks=}, {run.result.result()=}')
         except AbortError:
-            logger.info('AbortError')
             orchestrator.cancel()
             for task in run.worker_tasks:
                 if not task.done():
                     task.cancel()
-        except Exception as exc:
-            logger.error('Something sinister happened', exc_info=exc)
-            raise
-        except BaseException as exc:
-            logger.error('BaseException happened', exc_info=exc)
-            raise
-
         finally:
-            logger.error('finally')
             with suppress(asyncio.CancelledError):
-                logger.error('self._autoscale_task.stop()')
                 await self._autoscale_task.stop()
             with suppress(asyncio.CancelledError):
-                logger.error('await self._log_system_status_task.stop()')
                 await self._log_system_status_task.stop()
 
             if not orchestrator.done():
-                logger.error('not orchestrator.done()')
                 orchestrator.cancel()
             elif not orchestrator.cancelled() and orchestrator.exception() is not None:
                 logger.error('Exception in worker task orchestrator', exc_info=orchestrator.exception())
@@ -233,59 +216,48 @@ async def _worker_task_orchestrator(self, run: _AutoscaledPoolRun) -> None:
         Exits when `is_finished_function` returns True.
         """
         finished = False
-        logger.info('_worker_task_orchestrator')
+
         try:
             while not (finished := await self._is_finished_function()) and not run.result.done():
                 run.worker_tasks_updated.clear()
 
                 current_status = self._system_status.get_current_system_info()
                 if not current_status.is_system_idle:
-                    logger.info(f'Not scheduling new tasks - system is overloaded: {current_status}')
-                    await asyncio.sleep(self._OVERLOADED_BACKOFF_TIME.total_seconds())
-                    logger.info('Release the overloaded backoff')
+                    logger.debug('Not scheduling new tasks - system is overloaded')
                 elif self._is_paused:
-                    logger.info('Not scheduling new tasks - the autoscaled pool is paused')
+                    logger.debug('Not scheduling new tasks - the autoscaled pool is paused')
                 elif self.current_concurrency >= self.desired_concurrency:
-                    logger.info('Not scheduling new tasks - already running at desired concurrency')
+                    logger.debug('Not scheduling new tasks - already running at desired concurrency')
                 elif not await self._is_task_ready_function():
                     logger.debug('Not scheduling new task - no task is ready')
                 else:
-                    logger.info('Scheduling a new task')
+                    logger.debug('Scheduling a new task')
                     worker_task = asyncio.create_task(self._worker_task(), name='autoscaled pool worker task')
                     worker_task.add_done_callback(lambda task: self._reap_worker_task(task, run))
                     run.worker_tasks.append(worker_task)
 
                     if math.isfinite(self._max_tasks_per_minute):
-                        logger.info('Deadlock sleep????')
                         await asyncio.sleep(60 / self._max_tasks_per_minute)
 
                     continue
 
                 with suppress(asyncio.TimeoutError):
                     await asyncio.wait_for(run.worker_tasks_updated.wait(), timeout=0.5)
-
-            logger.info("Just finishing")
-
-        except BaseException as e:
-            logger.error('What is hiding here?', exc_info=e)
-            raise
-
         finally:
-            logger.info(f'Finally pool. {finished=}, {(run.result.done())=}, {(run.result.result() if run.result.done() else None)=}')
             if finished:
-                logger.info('`is_finished_function` reports that we are finished')
+                logger.debug('`is_finished_function` reports that we are finished')
             elif run.result.done() and run.result.exception() is not None:
-                logger.info('Unhandled exception in `run_task_function`')
+                logger.debug('Unhandled exception in `run_task_function`')
 
             if run.worker_tasks:
-                logger.info('Terminating - waiting for tasks to complete')
+                logger.debug('Terminating - waiting for tasks to complete')
                 await asyncio.wait(run.worker_tasks, return_when=asyncio.ALL_COMPLETED)
-                logger.info('Worker tasks finished')
+                logger.debug('Worker tasks finished')
             else:
-                logger.info('Terminating - no running tasks to wait for')
+                logger.debug('Terminating - no running tasks to wait for')
 
             if not run.result.done():
-                run.result.set_result("Hello")
+                run.result.set_result(object())
 
     def _reap_worker_task(self, task: asyncio.Task, run: _AutoscaledPoolRun) -> None:
         """Handle cleanup and tracking of a completed worker task.

src/crawlee/_autoscaling/system_status.py

Lines changed: 0 additions & 1 deletion

@@ -123,7 +123,6 @@ def _is_cpu_overloaded(self, sample_duration: timedelta | None = None) -> LoadRatioInfo:
             CPU load ratio information.
         """
         sample = self._snapshotter.get_cpu_sample(sample_duration)
-        logger.info(sample)
         return self._is_sample_overloaded(sample, self._cpu_overload_threshold)
 
     def _is_memory_overloaded(self, sample_duration: timedelta | None = None) -> LoadRatioInfo:

src/crawlee/_log_config.py

Lines changed: 1 addition & 2 deletions

@@ -4,7 +4,6 @@
 import logging
 import sys
 import textwrap
-from datetime import datetime
 from typing import TYPE_CHECKING, Any
 
 from colorama import Fore, Style, just_fix_windows_console
@@ -163,6 +162,6 @@ def format(self, record: logging.LogRecord) -> str:
 
         if self.include_logger_name:
             # Include logger name at the beginning of the log line
-            return f'{datetime.utcnow().strftime("%M-%S-%f")}{logger_name_string}{level_string}{log_string}{extra_string}{exception_string}'
+            return f'{logger_name_string}{level_string}{log_string}{extra_string}{exception_string}'
 
         return f'{level_string}{log_string}{extra_string}{exception_string}'
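The reverted line had prepended a minute-second-microsecond timestamp to every record from inside a custom `logging.Formatter`; the override pattern itself is standard. A minimal sketch of the same approach, with simplified, assumed names:

    import logging

    class NamePrefixFormatter(logging.Formatter):
        """Prepend the logger name to each formatted record."""

        def format(self, record: logging.LogRecord) -> str:
            base = super().format(record)
            return f'[{record.name}] {base}'

    handler = logging.StreamHandler()
    handler.setFormatter(NamePrefixFormatter('%(levelname)s %(message)s'))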

src/crawlee/_utils/system.py

Lines changed: 3 additions & 12 deletions

@@ -5,7 +5,6 @@
 from contextlib import suppress
 from datetime import datetime, timezone
 from logging import getLogger
-from subprocess import run
 from typing import Annotated
 
 import psutil
@@ -94,13 +93,9 @@ def get_cpu_info() -> CpuInfo:
     It utilizes the `psutil` library. Function `psutil.cpu_percent()` returns a float representing the current
     system-wide CPU utilization as a percentage.
     """
-
-    cpu_percent = psutil.cpu_percent(percpu=True)
-    logger.info(f'Calling get_cpu_info()...: {cpu_percent}')
-    ratio = sum(cpu_percent)/len(cpu_percent) / 100
-    if ratio>0.95:
-        print_ps()
-    return CpuInfo(used_ratio=ratio)
+    logger.debug('Calling get_cpu_info()...')
+    cpu_percent = psutil.cpu_percent(interval=0.1)
+    return CpuInfo(used_ratio=cpu_percent / 100)
 
 
 def get_memory_info() -> MemoryInfo:
@@ -128,7 +123,3 @@ def get_memory_info() -> MemoryInfo:
         current_size=ByteSize(current_size_bytes),
         system_wide_used_size=ByteSize(vm.total - vm.available),
     )
-
-
-def print_ps():
-    run("ps -awxo pid,%cpu,comm", shell=True)

src/crawlee/crawlers/_basic/_basic_crawler.py

Lines changed: 3 additions & 21 deletions

@@ -1130,7 +1130,6 @@ async def _handle_request_retries(
         await self._statistics.error_tracker.add(error=error, context=context)
 
         if self._error_handler:
-            self.log.warning('Error handler')
             try:
                 new_request = await self._error_handler(context, error)
             except Exception as e:
@@ -1146,20 +1145,18 @@
             await self._mark_request_as_handled(request)
             await self._handle_failed_request(context, error)
             self._statistics.record_request_processing_failure(request.unique_key)
-            self.log.warning('_handle_request_retries DONE')
 
     async def _handle_request_error(self, context: TCrawlingContext | BasicCrawlingContext, error: Exception) -> None:
         try:
             context.request.state = RequestState.ERROR_HANDLER
-            self.log.warning('Before _handle_request_error')
+
             await wait_for(
                 partial(self._handle_request_retries, context, error),
                 timeout=self._internal_timeout,
                 timeout_message='Handling request failure timed out after '
                 f'{self._internal_timeout.total_seconds()} seconds',
                 logger=self._logger,
             )
-            self.log.warning('After _handle_request_error')
 
             context.request.state = RequestState.DONE
         except UserDefinedErrorHandlerError:
@@ -1331,17 +1328,13 @@ async def __is_finished_function(self) -> bool:
 
         if self._abort_on_error and self._failed:
             self._failed = False
-            self.log.info('_abort_on_error')
             return True
 
         if self._keep_alive:
             return False
 
         request_manager = await self.get_request_manager()
-        is_finished = await request_manager.is_finished()
-        if is_finished:
-            self.log.info('I am sure this is a lie!!!')
-        return is_finished
+        return await request_manager.is_finished()
 
     async def __is_task_ready_function(self) -> bool:
         self._stop_if_max_requests_count_exceeded()
@@ -1353,10 +1346,7 @@ async def __is_task_ready_function(self) -> bool:
             return False
 
         request_manager = await self.get_request_manager()
-        is_ready = not await request_manager.is_empty()
-        if is_ready:
-            self.log.info('There is a request to process')
-        return is_ready
+        return not await request_manager.is_empty()
 
     async def __run_task_function(self) -> None:
         request_manager = await self.get_request_manager()
@@ -1370,11 +1360,6 @@ async def __run_task_function(self) -> None:
         )
 
         if request is None:
-            # No request to process, request manager is neither finished nor empty.
-            # All requests are locked or in progress.
-            self._logger.warning("Backoff: No available requests to process.")
-            await asyncio.sleep(0.2)  # Small backoff time to avoid overloading the system through busy-waiting.
-            self._logger.warning("Backoff finished.")
             return
 
         if not (await self._is_allowed_based_on_robots_txt_file(request.url)):
@@ -1415,10 +1400,8 @@ async def __run_task_function(self) -> None:
             try:
                 await self._run_request_handler(context=context)
             except asyncio.TimeoutError as e:
-                context.log.info('RH error')
                 raise RequestHandlerError(e, context) from e
 
-            context.log.info('Commit resutls')
             await self._commit_request_handler_result(context)
 
             await self._mark_request_as_handled(request)
@@ -1428,7 +1411,6 @@ async def __run_task_function(self) -> None:
             if context.session and context.session.is_usable:
                 context.session.mark_good()
 
-            context.log.info('Finished processing request')
             self._statistics.record_request_processing_finish(request.unique_key)
 
         except RequestCollisionError as request_error:
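The surviving `wait_for(partial(...), timeout=...)` call above illustrates the coroutine-factory pattern: `functools.partial` binds the handler's arguments up front, and the helper constructs and time-boxes the coroutine when it runs. A rough plain-asyncio equivalent, offered as a sketch rather than crawlee's actual `wait_for` helper:

    import asyncio
    from functools import partial

    async def handle_retries(request_id: str, error: Exception) -> None:
        await asyncio.sleep(0.1)  # stand-in for real retry handling
        print(f'handled {error!r} for {request_id}')

    async def main() -> None:
        factory = partial(handle_retries, 'request-123', ValueError('boom'))
        # Equivalent of wait_for(factory, timeout=...): build the coroutine, then time-box it.
        await asyncio.wait_for(factory(), timeout=5)

    asyncio.run(main())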

src/crawlee/events/_local_event_manager.py

Lines changed: 2 additions & 2 deletions

@@ -93,8 +93,8 @@ async def __aexit__(
 
     async def _emit_system_info_event(self) -> None:
         """Emit a system info event with the current CPU and memory usage."""
-        cpu_info = get_cpu_info()
-        memory_info = get_memory_info()
+        cpu_info = await asyncio.to_thread(get_cpu_info)
+        memory_info = await asyncio.to_thread(get_memory_info)
 
         event_data = EventSystemInfoData(cpu_info=cpu_info, memory_info=memory_info)
         self.emit(event=Event.SYSTEM_INFO, event_data=event_data)
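This is presumably the "executor change" the commit title says is kept: `asyncio.to_thread` hands the blocking probes to the default thread-pool executor, so the event loop is not stalled while `get_cpu_info` sleeps inside `psutil.cpu_percent(interval=0.1)`. A self-contained sketch of the pattern:

    import asyncio
    import time

    def blocking_probe() -> float:
        time.sleep(0.1)  # stands in for psutil.cpu_percent(interval=0.1)
        return 42.0

    async def main() -> None:
        # Runs in a worker thread; other coroutines keep making progress meanwhile.
        value = await asyncio.to_thread(blocking_probe)
        print(value)

    asyncio.run(main())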

src/crawlee/statistics/_error_tracker.py

Lines changed: 0 additions & 4 deletions

@@ -58,7 +58,6 @@ async def add(
             early: Flag indicating that the error is added earlier than usual to have access to resources that will be
                 closed before normal error collection. This prevents double reporting during normal error collection.
         """
-        logger.warning('Adding error')
         if id(error) in self._early_reported_errors:
             # Error had to be collected earlier before relevant resources are closed.
             self._early_reported_errors.remove(id(error))
@@ -99,15 +98,12 @@ async def add(
                 == 1
                 and context is not None
             ):
-                logger.warning('awaiting snapshot')
                 # Save snapshot only on the first occurrence of the error and only if context and kvs was passed as well.
                 await self._capture_error_snapshot(
                     error_message=new_error_group_message or error_group_message,
                     file_and_line=error_group_file_and_line,
                     context=context,
                 )
-                logger.warning('Snapshot added')
-                logger.warning('Finished')
 
     async def _capture_error_snapshot(
         self, error_message: str, file_and_line: str, context: BasicCrawlingContext

0 commit comments