
Commit 33ba628

Merge remote-tracking branch 'origin/master' into run-uts-on-macos
2 parents: 5d83c47 + 883355a

File tree: 32 files changed, +1003 −582 lines


.github/workflows/build_and_deploy_docs.yaml

Lines changed: 5 additions & 1 deletion

@@ -67,6 +67,10 @@ jobs:
         uses: actions/deploy-pages@v4
 
       - name: Invalidate CloudFront cache
-        run: gh workflow run invalidate.yaml --repo apify/apify-docs-private
+        run: |
+          gh workflow run invalidate-cloudfront.yml \
+            --repo apify/apify-docs-private \
+            --field deployment=crawlee-web
+          echo "✅ CloudFront cache invalidation workflow triggered successfully"
         env:
           GITHUB_TOKEN: ${{ secrets.APIFY_SERVICE_ACCOUNT_GITHUB_TOKEN }}

CHANGELOG.md

Lines changed: 11 additions & 2 deletions

@@ -3,17 +3,26 @@
 All notable changes to this project will be documented in this file.
 
 <!-- git-cliff-unreleased-start -->
-## 1.1.1 - **not yet released**
+## 1.1.2 - **not yet released**
+
+### 🐛 Bug Fixes
+
+- Only apply requestHandlerTimeout to request handler ([#1474](https://github.com/apify/crawlee-python/pull/1474)) ([0dfb6c2](https://github.com/apify/crawlee-python/commit/0dfb6c2a13b6650736245fa39b3fbff397644df7)) by [@janbuchar](https://github.com/janbuchar)
+- Handle the case when `error_handler` returns `Request` ([#1595](https://github.com/apify/crawlee-python/pull/1595)) ([8a961a2](https://github.com/apify/crawlee-python/commit/8a961a2b07d0d33a7302dbb13c17f3d90999d390)) by [@Mantisus](https://github.com/Mantisus)
+
+
+<!-- git-cliff-unreleased-end -->
+## [1.1.1](https://github.com/apify/crawlee-python/releases/tag/v1.1.1) (2025-12-02)
 
 ### 🐛 Bug Fixes
 
 - Unify separators in `unique_key` construction ([#1569](https://github.com/apify/crawlee-python/pull/1569)) ([af46a37](https://github.com/apify/crawlee-python/commit/af46a3733b059a8052489296e172f005def953f7)) by [@vdusek](https://github.com/vdusek), closes [#1512](https://github.com/apify/crawlee-python/issues/1512)
 - Fix `same-domain` strategy ignoring public suffix ([#1572](https://github.com/apify/crawlee-python/pull/1572)) ([3d018b2](https://github.com/apify/crawlee-python/commit/3d018b21a28a4bee493829783057188d6106a69b)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1571](https://github.com/apify/crawlee-python/issues/1571)
 - Make context helpers work in `FailedRequestHandler` and `ErrorHandler` ([#1570](https://github.com/apify/crawlee-python/pull/1570)) ([b830019](https://github.com/apify/crawlee-python/commit/b830019350830ac33075316061659e2854f7f4a5)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1532](https://github.com/apify/crawlee-python/issues/1532)
 - Fix non-ASCII character corruption in `FileSystemStorageClient` on systems without UTF-8 default encoding ([#1580](https://github.com/apify/crawlee-python/pull/1580)) ([f179f86](https://github.com/apify/crawlee-python/commit/f179f8671b0b6af9264450e4fef7e49d1cecd2bd)) by [@Mantisus](https://github.com/Mantisus), closes [#1579](https://github.com/apify/crawlee-python/issues/1579)
+- Respect `<base>` when enqueuing ([#1590](https://github.com/apify/crawlee-python/pull/1590)) ([de517a1](https://github.com/apify/crawlee-python/commit/de517a1629cc29b20568143eb64018f216d4ba33)) by [@Mantisus](https://github.com/Mantisus), closes [#1589](https://github.com/apify/crawlee-python/issues/1589)
 
 
-<!-- git-cliff-unreleased-end -->
 ## [1.1.0](https://github.com/apify/crawlee-python/releases/tag/v1.1.0) (2025-11-18)
 
 ### 🚀 Features

docs/deployment/code_examples/google/cloud_run_example.py

Lines changed: 1 addition & 1 deletion

@@ -9,7 +9,7 @@
 from crawlee.storage_clients import MemoryStorageClient
 
 
-@get('/')
+@get('/')  # type: ignore[untyped-decorator]
 async def main() -> str:
     """The crawler entry point that will be called when the HTTP endpoint is accessed."""
     # highlight-start

docs/deployment/code_examples/google/google_example.py

Lines changed: 2 additions & 5 deletions

@@ -6,10 +6,7 @@
 import functions_framework
 from flask import Request, Response
 
-from crawlee.crawlers import (
-    BeautifulSoupCrawler,
-    BeautifulSoupCrawlingContext,
-)
+from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
 from crawlee.storage_clients import MemoryStorageClient
 
 
@@ -51,7 +48,7 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
     # highlight-end
 
 
-@functions_framework.http
+@functions_framework.http  # type: ignore[untyped-decorator]
 def crawlee_run(request: Request) -> Response:
     # You can pass data to your crawler using `request`
     function_id = request.headers['Function-Execution-Id']

docs/guides/code_examples/running_in_web_server/server.py

Lines changed: 2 additions & 2 deletions

@@ -14,7 +14,7 @@
 app = FastAPI(lifespan=lifespan, title='Crawler app')
 
 
-@app.get('/', response_class=HTMLResponse)
+@app.get('/', response_class=HTMLResponse)  # type: ignore[untyped-decorator]
 def index() -> str:
     return """
     <!DOCTYPE html>
@@ -32,7 +32,7 @@ def index() -> str:
     """
 
 
-@app.get('/scrape')
+@app.get('/scrape')  # type: ignore[untyped-decorator]
 async def scrape_url(request: Request, url: str | None = None) -> dict:
     if not url:
         return {'url': 'missing', 'scrape result': 'no results'}

pyproject.toml

Lines changed: 4 additions & 3 deletions

@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "crawlee"
-version = "1.1.1"
+version = "1.1.2"
 description = "Crawlee for Python"
 authors = [{ name = "Apify Technologies s.r.o.", email = "support@apify.com" }]
 license = { file = "LICENSE" }
@@ -34,6 +34,7 @@ keywords = [
     "scraping",
 ]
 dependencies = [
+    "async-timeout>=5.0.1",
    "cachetools>=5.5.0",
    "colorama>=0.4.0",
    "impit>=0.8.0",
@@ -74,7 +75,7 @@ otel = [
 ]
 sql_postgres = [
     "sqlalchemy[asyncio]>=2.0.0,<3.0.0",
-    "asyncpg>=0.24.0; python_version < '3.14'" # TODO: https://github.com/apify/crawlee-python/issues/1555
+    "asyncpg>=0.24.0"
 ]
 sql_sqlite = [
     "sqlalchemy[asyncio]>=2.0.0,<3.0.0",
@@ -101,7 +102,7 @@ dev = [
     "build<2.0.0", # For e2e tests.
     "dycw-pytest-only<3.0.0",
     "fakeredis[probabilistic,json,lua]<3.0.0",
-    "mypy~=1.18.0",
+    "mypy~=1.19.0",
     "pre-commit<5.0.0",
     "proxy-py<3.0.0",
     "pydoc-markdown<5.0.0",

src/crawlee/_utils/time.py

Lines changed: 41 additions & 1 deletion

@@ -3,11 +3,14 @@
 import time
 from contextlib import contextmanager
 from dataclasses import dataclass
+from datetime import timedelta
 from typing import TYPE_CHECKING
 
+from async_timeout import Timeout, timeout
+
 if TYPE_CHECKING:
     from collections.abc import Iterator
-    from datetime import timedelta
+    from types import TracebackType
 
 _SECONDS_PER_MINUTE = 60
 _SECONDS_PER_HOUR = 3600
@@ -35,6 +38,43 @@ def measure_time() -> Iterator[TimerResult]:
         result.cpu = after_cpu - before_cpu
 
 
+class SharedTimeout:
+    """Keeps track of a time budget shared by multiple independent async operations.
+
+    Provides a reusable, non-reentrant context manager interface.
+    """
+
+    def __init__(self, timeout: timedelta) -> None:
+        self._remaining_timeout = timeout
+        self._active_timeout: Timeout | None = None
+        self._activation_timestamp: float | None = None
+
+    async def __aenter__(self) -> timedelta:
+        if self._active_timeout is not None or self._activation_timestamp is not None:
+            raise RuntimeError('A shared timeout context cannot be entered twice at the same time')
+
+        self._activation_timestamp = time.monotonic()
+        self._active_timeout = new_timeout = timeout(self._remaining_timeout.total_seconds())
+        await new_timeout.__aenter__()
+        return self._remaining_timeout
+
+    async def __aexit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_value: BaseException | None,
+        exc_traceback: TracebackType | None,
+    ) -> None:
+        if self._active_timeout is None or self._activation_timestamp is None:
+            raise RuntimeError('Logic error')
+
+        await self._active_timeout.__aexit__(exc_type, exc_value, exc_traceback)
+        elapsed = time.monotonic() - self._activation_timestamp
+        self._remaining_timeout = self._remaining_timeout - timedelta(seconds=elapsed)
+
+        self._active_timeout = None
+        self._activation_timestamp = None
+
+
 def format_duration(duration: timedelta | None) -> str:
     """Format a timedelta into a human-readable string with appropriate units."""
     if duration is None:
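
A minimal usage sketch of the `SharedTimeout` helper added above, showing how the remaining budget shrinks across consecutive entries of the same instance (illustrative durations, not part of this commit):

```python
import asyncio
from datetime import timedelta

from crawlee._utils.time import SharedTimeout


async def main() -> None:
    # A single five-second budget shared by two sequential operations.
    shared = SharedTimeout(timedelta(seconds=5))

    async with shared as remaining:
        print(f'First entry, remaining budget: {remaining}')  # roughly 5 s
        await asyncio.sleep(2)  # consumes about 2 s of the budget

    async with shared as remaining:
        print(f'Second entry, remaining budget: {remaining}')  # roughly 3 s
        # Sleeping past the remaining budget here would raise asyncio.TimeoutError.
        await asyncio.sleep(1)


asyncio.run(main())
```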

src/crawlee/crawlers/__init__.py

Lines changed: 2 additions & 1 deletion

@@ -1,7 +1,7 @@
 from crawlee._utils.try_import import install_import_hook as _install_import_hook
 from crawlee._utils.try_import import try_import as _try_import
 
-from ._abstract_http import AbstractHttpCrawler, AbstractHttpParser, ParsedHttpCrawlingContext
+from ._abstract_http import AbstractHttpCrawler, AbstractHttpParser, HttpCrawlerOptions, ParsedHttpCrawlingContext
 from ._basic import BasicCrawler, BasicCrawlerOptions, BasicCrawlingContext, ContextPipeline
 from ._http import HttpCrawler, HttpCrawlingContext, HttpCrawlingResult
 
@@ -51,6 +51,7 @@
     'BeautifulSoupParserType',
     'ContextPipeline',
     'HttpCrawler',
+    'HttpCrawlerOptions',
     'HttpCrawlingContext',
     'HttpCrawlingResult',
     'ParsedHttpCrawlingContext',

src/crawlee/crawlers/_abstract_http/__init__.py

Lines changed: 2 additions & 1 deletion

@@ -1,9 +1,10 @@
-from ._abstract_http_crawler import AbstractHttpCrawler
+from ._abstract_http_crawler import AbstractHttpCrawler, HttpCrawlerOptions
 from ._abstract_http_parser import AbstractHttpParser
 from ._http_crawling_context import ParsedHttpCrawlingContext
 
 __all__ = [
     'AbstractHttpCrawler',
     'AbstractHttpParser',
+    'HttpCrawlerOptions',
     'ParsedHttpCrawlingContext',
 ]
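
With `HttpCrawlerOptions` re-exported here and from `crawlee.crawlers`, the constructor options of HTTP-based crawlers can be collected in a single typed dict. A rough sketch, assuming the concrete subclasses such as `BeautifulSoupCrawler` forward the new `navigation_timeout` option (their changes are not shown in this excerpt):

```python
from datetime import timedelta

from crawlee.crawlers import BeautifulSoupCrawler, HttpCrawlerOptions

# Typed bag of forwarded constructor arguments: `navigation_timeout` is the option
# added in this commit, `max_request_retries` comes from BasicCrawlerOptions.
options: HttpCrawlerOptions = {
    'navigation_timeout': timedelta(seconds=30),
    'max_request_retries': 2,
}

crawler = BeautifulSoupCrawler(**options)
```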

src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py

Lines changed: 46 additions & 12 deletions

@@ -3,14 +3,16 @@
 import asyncio
 import logging
 from abc import ABC
+from datetime import timedelta
 from typing import TYPE_CHECKING, Any, Generic
 
 from more_itertools import partition
 from pydantic import ValidationError
-from typing_extensions import TypeVar
+from typing_extensions import NotRequired, TypeVar
 
 from crawlee._request import Request, RequestOptions
 from crawlee._utils.docs import docs_group
+from crawlee._utils.time import SharedTimeout
 from crawlee._utils.urls import to_absolute_url_iterator
 from crawlee.crawlers._basic import BasicCrawler, BasicCrawlerOptions, ContextPipeline
 from crawlee.errors import SessionError
@@ -32,6 +34,19 @@
 TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)
 
 
+class HttpCrawlerOptions(
+    BasicCrawlerOptions[TCrawlingContext, TStatisticsState],
+    Generic[TCrawlingContext, TStatisticsState],
+):
+    """Arguments for the `AbstractHttpCrawler` constructor.
+
+    It is intended for typing forwarded `__init__` arguments in the subclasses.
+    """
+
+    navigation_timeout: NotRequired[timedelta | None]
+    """Timeout for the HTTP request."""
+
+
 @docs_group('Crawlers')
 class AbstractHttpCrawler(
     BasicCrawler[TCrawlingContext, StatisticsState],
@@ -56,10 +71,13 @@ def __init__(
         self,
         *,
         parser: AbstractHttpParser[TParseResult, TSelectResult],
+        navigation_timeout: timedelta | None = None,
         **kwargs: Unpack[BasicCrawlerOptions[TCrawlingContext, StatisticsState]],
     ) -> None:
         self._parser = parser
+        self._navigation_timeout = navigation_timeout or timedelta(minutes=1)
         self._pre_navigation_hooks: list[Callable[[BasicCrawlingContext], Awaitable[None]]] = []
+        self._shared_navigation_timeouts: dict[int, SharedTimeout] = {}
 
         if '_context_pipeline' not in kwargs:
             raise ValueError(
@@ -112,9 +130,17 @@ def _create_static_content_crawler_pipeline(self) -> ContextPipeline[ParsedHttpC
     async def _execute_pre_navigation_hooks(
         self, context: BasicCrawlingContext
     ) -> AsyncGenerator[BasicCrawlingContext, None]:
-        for hook in self._pre_navigation_hooks:
-            await hook(context)
-        yield context
+        context_id = id(context)
+        self._shared_navigation_timeouts[context_id] = SharedTimeout(self._navigation_timeout)
+
+        try:
+            for hook in self._pre_navigation_hooks:
+                async with self._shared_navigation_timeouts[context_id]:
+                    await hook(context)
+
+            yield context
+        finally:
+            self._shared_navigation_timeouts.pop(context_id, None)
 
     async def _parse_http_response(
         self, context: HttpCrawlingContext
@@ -167,9 +193,15 @@ async def extract_links(
         kwargs.setdefault('strategy', 'same-hostname')
 
         links_iterator: Iterator[str] = iter(self._parser.find_links(parsed_content, selector=selector))
-        links_iterator = to_absolute_url_iterator(
-            context.request.loaded_url or context.request.url, links_iterator, logger=context.log
+
+        # Get base URL from <base> tag if present
+        extracted_base_urls = list(self._parser.find_links(parsed_content, 'base[href]'))
+        base_url: str = (
+            str(extracted_base_urls[0])
+            if extracted_base_urls
+            else context.request.loaded_url or context.request.url
         )
+        links_iterator = to_absolute_url_iterator(base_url, links_iterator, logger=context.log)
 
         if robots_txt_file:
             skipped, links_iterator = partition(lambda url: robots_txt_file.is_allowed(url), links_iterator)
@@ -216,12 +248,14 @@ async def _make_http_request(self, context: BasicCrawlingContext) -> AsyncGenera
         Yields:
             The original crawling context enhanced by HTTP response.
         """
-        result = await self._http_client.crawl(
-            request=context.request,
-            session=context.session,
-            proxy_info=context.proxy_info,
-            statistics=self._statistics,
-        )
+        async with self._shared_navigation_timeouts[id(context)] as remaining_timeout:
+            result = await self._http_client.crawl(
+                request=context.request,
+                session=context.session,
+                proxy_info=context.proxy_info,
+                statistics=self._statistics,
+                timeout=remaining_timeout,
+            )
 
         yield HttpCrawlingContext.from_basic_crawling_context(context=context, http_response=result.http_response)
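
The `extract_links` change above, matching the changelog entry "Respect `<base>` when enqueuing", resolves relative links against the page's `<base href>` element when one is present instead of always using the request URL. A small standard-library illustration of the difference, using hypothetical URLs:

```python
from urllib.parse import urljoin

# Hypothetical page loaded from one host that declares <base href="https://cdn.example.com/mirror/">.
request_url = 'https://example.com/articles/index.html'
base_href = 'https://cdn.example.com/mirror/'
relative_link = 'post/1.html'

# Previously: relative links were resolved against the loaded/request URL.
print(urljoin(request_url, relative_link))  # https://example.com/articles/post/1.html

# Now: the <base> href takes precedence when the parser finds one.
print(urljoin(base_href, relative_link))  # https://cdn.example.com/mirror/post/1.html
```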
