
Commit ea070ff

Merge remote-tracking branch 'origin/master' into run-uts-on-macos
2 parents: 5d83c47 + 883355a

33 files changed, +1004 -583 lines changed

.github/workflows/build_and_deploy_docs.yaml

Lines changed: 5 additions & 1 deletion

@@ -67,6 +67,10 @@ jobs:
         uses: actions/deploy-pages@v4
 
       - name: Invalidate CloudFront cache
-        run: gh workflow run invalidate.yaml --repo apify/apify-docs-private
+        run: |
+          gh workflow run invalidate-cloudfront.yml \
+            --repo apify/apify-docs-private \
+            --field deployment=crawlee-web
+          echo "✅ CloudFront cache invalidation workflow triggered successfully"
         env:
           GITHUB_TOKEN: ${{ secrets.APIFY_SERVICE_ACCOUNT_GITHUB_TOKEN }}

CHANGELOG.md

Lines changed: 11 additions & 2 deletions

@@ -3,17 +3,26 @@
 All notable changes to this project will be documented in this file.
 
 <!-- git-cliff-unreleased-start -->
-## 1.1.1 - **not yet released**
+## 1.1.2 - **not yet released**
+
+### 🐛 Bug Fixes
+
+- Only apply requestHandlerTimeout to request handler ([#1474](https://github.com/apify/crawlee-python/pull/1474)) ([0dfb6c2](https://github.com/apify/crawlee-python/commit/0dfb6c2a13b6650736245fa39b3fbff397644df7)) by [@janbuchar](https://github.com/janbuchar)
+- Handle the case when `error_handler` returns `Request` ([#1595](https://github.com/apify/crawlee-python/pull/1595)) ([8a961a2](https://github.com/apify/crawlee-python/commit/8a961a2b07d0d33a7302dbb13c17f3d90999d390)) by [@Mantisus](https://github.com/Mantisus)
+
+
+<!-- git-cliff-unreleased-end -->
+## [1.1.1](https://github.com/apify/crawlee-python/releases/tag/v1.1.1) (2025-12-02)
 
 ### 🐛 Bug Fixes
 
 - Unify separators in `unique_key` construction ([#1569](https://github.com/apify/crawlee-python/pull/1569)) ([af46a37](https://github.com/apify/crawlee-python/commit/af46a3733b059a8052489296e172f005def953f7)) by [@vdusek](https://github.com/vdusek), closes [#1512](https://github.com/apify/crawlee-python/issues/1512)
 - Fix `same-domain` strategy ignoring public suffix ([#1572](https://github.com/apify/crawlee-python/pull/1572)) ([3d018b2](https://github.com/apify/crawlee-python/commit/3d018b21a28a4bee493829783057188d6106a69b)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1571](https://github.com/apify/crawlee-python/issues/1571)
 - Make context helpers work in `FailedRequestHandler` and `ErrorHandler` ([#1570](https://github.com/apify/crawlee-python/pull/1570)) ([b830019](https://github.com/apify/crawlee-python/commit/b830019350830ac33075316061659e2854f7f4a5)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1532](https://github.com/apify/crawlee-python/issues/1532)
 - Fix non-ASCII character corruption in `FileSystemStorageClient` on systems without UTF-8 default encoding ([#1580](https://github.com/apify/crawlee-python/pull/1580)) ([f179f86](https://github.com/apify/crawlee-python/commit/f179f8671b0b6af9264450e4fef7e49d1cecd2bd)) by [@Mantisus](https://github.com/Mantisus), closes [#1579](https://github.com/apify/crawlee-python/issues/1579)
+- Respect `<base>` when enqueuing ([#1590](https://github.com/apify/crawlee-python/pull/1590)) ([de517a1](https://github.com/apify/crawlee-python/commit/de517a1629cc29b20568143eb64018f216d4ba33)) by [@Mantisus](https://github.com/Mantisus), closes [#1589](https://github.com/apify/crawlee-python/issues/1589)
 
 
-<!-- git-cliff-unreleased-end -->
 ## [1.1.0](https://github.com/apify/crawlee-python/releases/tag/v1.1.0) (2025-11-18)
 
 ### 🚀 Features
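
The first unreleased fix concerns the request handler timeout. For context, a minimal sketch of where that budget is configured on a crawler, assuming the current crawlee-python API (the `request_handler_timeout` constructor parameter and `router.default_handler` are assumptions based on that API, not shown in this diff); per #1474 the budget is meant to cover only the handler call itself:

import asyncio
from datetime import timedelta

from crawlee.crawlers import HttpCrawler, HttpCrawlingContext


async def main() -> None:
    # Parameter name assumed from the crawlee-python API; the linked fix (#1474)
    # narrows this budget to the request handler invocation only.
    crawler = HttpCrawler(request_handler_timeout=timedelta(seconds=30))

    @crawler.router.default_handler
    async def handler(context: HttpCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url}')

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())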

docs/deployment/code_examples/google/cloud_run_example.py

Lines changed: 1 addition & 1 deletion

@@ -9,7 +9,7 @@
 from crawlee.storage_clients import MemoryStorageClient
 
 
-@get('/')
+@get('/')  # type: ignore[untyped-decorator]
 async def main() -> str:
     """The crawler entry point that will be called when the HTTP endpoint is accessed."""
     # highlight-start

docs/deployment/code_examples/google/google_example.py

Lines changed: 2 additions & 5 deletions

@@ -6,10 +6,7 @@
 import functions_framework
 from flask import Request, Response
 
-from crawlee.crawlers import (
-    BeautifulSoupCrawler,
-    BeautifulSoupCrawlingContext,
-)
+from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
 from crawlee.storage_clients import MemoryStorageClient
 
 
@@ -51,7 +48,7 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
         # highlight-end
 
 
-@functions_framework.http
+@functions_framework.http  # type: ignore[untyped-decorator]
 def crawlee_run(request: Request) -> Response:
     # You can pass data to your crawler using `request`
     function_id = request.headers['Function-Execution-Id']

docs/guides/code_examples/running_in_web_server/server.py

Lines changed: 2 additions & 2 deletions

@@ -14,7 +14,7 @@
 app = FastAPI(lifespan=lifespan, title='Crawler app')
 
 
-@app.get('/', response_class=HTMLResponse)
+@app.get('/', response_class=HTMLResponse)  # type: ignore[untyped-decorator]
 def index() -> str:
     return """
     <!DOCTYPE html>
@@ -32,7 +32,7 @@ def index() -> str:
     """
 
 
-@app.get('/scrape')
+@app.get('/scrape')  # type: ignore[untyped-decorator]
 async def scrape_url(request: Request, url: str | None = None) -> dict:
     if not url:
         return {'url': 'missing', 'scrape result': 'no results'}

pyproject.toml

Lines changed: 4 additions & 3 deletions

@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "crawlee"
-version = "1.1.1"
+version = "1.1.2"
 description = "Crawlee for Python"
 authors = [{ name = "Apify Technologies s.r.o.", email = "support@apify.com" }]
 license = { file = "LICENSE" }
@@ -34,6 +34,7 @@ keywords = [
     "scraping",
 ]
 dependencies = [
+    "async-timeout>=5.0.1",
     "cachetools>=5.5.0",
     "colorama>=0.4.0",
     "impit>=0.8.0",
@@ -74,7 +75,7 @@ otel = [
 ]
 sql_postgres = [
     "sqlalchemy[asyncio]>=2.0.0,<3.0.0",
-    "asyncpg>=0.24.0; python_version < '3.14'" # TODO: https://github.com/apify/crawlee-python/issues/1555
+    "asyncpg>=0.24.0"
 ]
 sql_sqlite = [
     "sqlalchemy[asyncio]>=2.0.0,<3.0.0",
@@ -101,7 +102,7 @@ dev = [
     "build<2.0.0", # For e2e tests.
     "dycw-pytest-only<3.0.0",
     "fakeredis[probabilistic,json,lua]<3.0.0",
-    "mypy~=1.18.0",
+    "mypy~=1.19.0",
     "pre-commit<5.0.0",
     "proxy-py<3.0.0",
     "pydoc-markdown<5.0.0",

src/crawlee/_autoscaling/autoscaled_pool.py

Lines changed: 1 addition & 1 deletion

@@ -241,7 +241,7 @@ async def _worker_task_orchestrator(self, run: _AutoscaledPoolRun) -> None:
             current_status = self._system_status.get_current_system_info()
             if not current_status.is_system_idle:
                 logger.info('Not scheduling new tasks - system is overloaded')
-                await asyncio.sleep(self._OVERLOADED_BACKOFF_TIME)
+                await asyncio.sleep(self._OVERLOADED_BACKOFF_TIME.total_seconds())
             elif self._is_paused:
                 logger.info('Not scheduling new tasks - the autoscaled pool is paused')
             elif self.current_concurrency >= self.desired_concurrency:
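
The one-line fix above converts the backoff interval before sleeping: `asyncio.sleep()` takes a plain number of seconds, so passing a `timedelta` such as `_OVERLOADED_BACKOFF_TIME` directly fails at runtime. A minimal standalone sketch of the corrected pattern (the constant name here is illustrative, not the pool's private attribute):

import asyncio
from datetime import timedelta

# Illustrative constant; in the pool this is the private _OVERLOADED_BACKOFF_TIME.
OVERLOADED_BACKOFF_TIME = timedelta(milliseconds=500)


async def back_off() -> None:
    # asyncio.sleep() expects seconds as a number, not a timedelta,
    # hence the explicit conversion via total_seconds().
    await asyncio.sleep(OVERLOADED_BACKOFF_TIME.total_seconds())


asyncio.run(back_off())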

src/crawlee/_utils/time.py

Lines changed: 41 additions & 1 deletion

@@ -3,11 +3,14 @@
 import time
 from contextlib import contextmanager
 from dataclasses import dataclass
+from datetime import timedelta
 from typing import TYPE_CHECKING
 
+from async_timeout import Timeout, timeout
+
 if TYPE_CHECKING:
     from collections.abc import Iterator
-    from datetime import timedelta
+    from types import TracebackType
 
 _SECONDS_PER_MINUTE = 60
 _SECONDS_PER_HOUR = 3600
@@ -35,6 +38,43 @@ def measure_time() -> Iterator[TimerResult]:
         result.cpu = after_cpu - before_cpu
 
 
+class SharedTimeout:
+    """Keeps track of a time budget shared by multiple independent async operations.
+
+    Provides a reusable, non-reentrant context manager interface.
+    """
+
+    def __init__(self, timeout: timedelta) -> None:
+        self._remaining_timeout = timeout
+        self._active_timeout: Timeout | None = None
+        self._activation_timestamp: float | None = None
+
+    async def __aenter__(self) -> timedelta:
+        if self._active_timeout is not None or self._activation_timestamp is not None:
+            raise RuntimeError('A shared timeout context cannot be entered twice at the same time')
+
+        self._activation_timestamp = time.monotonic()
+        self._active_timeout = new_timeout = timeout(self._remaining_timeout.total_seconds())
+        await new_timeout.__aenter__()
+        return self._remaining_timeout
+
+    async def __aexit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_value: BaseException | None,
+        exc_traceback: TracebackType | None,
+    ) -> None:
+        if self._active_timeout is None or self._activation_timestamp is None:
+            raise RuntimeError('Logic error')
+
+        await self._active_timeout.__aexit__(exc_type, exc_value, exc_traceback)
+        elapsed = time.monotonic() - self._activation_timestamp
+        self._remaining_timeout = self._remaining_timeout - timedelta(seconds=elapsed)
+
+        self._active_timeout = None
+        self._activation_timestamp = None
+
+
 def format_duration(duration: timedelta | None) -> str:
     """Format a timedelta into a human-readable string with appropriate units."""
     if duration is None:
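
The new `SharedTimeout` keeps one time budget across several sequential `async with` blocks: each entry wraps the remaining budget in an `async_timeout.timeout()`, and each exit subtracts the elapsed time. A minimal usage sketch, assuming the class exactly as added above (it lives in the internal `crawlee._utils.time` module, so the import is shown for illustration rather than as public API):

import asyncio
from datetime import timedelta

from crawlee._utils.time import SharedTimeout  # internal module, illustrative import


async def main() -> None:
    budget = SharedTimeout(timedelta(seconds=5))

    async with budget as remaining:
        # First phase: the full 5-second budget is available.
        print(f'Remaining before phase 1: {remaining}')
        await asyncio.sleep(1)

    async with budget as remaining:
        # Second phase: roughly 4 seconds are left. If any block outlives the
        # shared budget, async_timeout cancels it and a TimeoutError propagates
        # out of the `async with`.
        print(f'Remaining before phase 2: {remaining}')
        await asyncio.sleep(1)


asyncio.run(main())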

src/crawlee/crawlers/__init__.py

Lines changed: 2 additions & 1 deletion

@@ -1,7 +1,7 @@
 from crawlee._utils.try_import import install_import_hook as _install_import_hook
 from crawlee._utils.try_import import try_import as _try_import
 
-from ._abstract_http import AbstractHttpCrawler, AbstractHttpParser, ParsedHttpCrawlingContext
+from ._abstract_http import AbstractHttpCrawler, AbstractHttpParser, HttpCrawlerOptions, ParsedHttpCrawlingContext
 from ._basic import BasicCrawler, BasicCrawlerOptions, BasicCrawlingContext, ContextPipeline
 from ._http import HttpCrawler, HttpCrawlingContext, HttpCrawlingResult
 
@@ -51,6 +51,7 @@
     'BeautifulSoupParserType',
     'ContextPipeline',
     'HttpCrawler',
+    'HttpCrawlerOptions',
     'HttpCrawlingContext',
     'HttpCrawlingResult',
     'ParsedHttpCrawlingContext',
Lines changed: 2 additions & 1 deletion

@@ -1,9 +1,10 @@
-from ._abstract_http_crawler import AbstractHttpCrawler
+from ._abstract_http_crawler import AbstractHttpCrawler, HttpCrawlerOptions
 from ._abstract_http_parser import AbstractHttpParser
 from ._http_crawling_context import ParsedHttpCrawlingContext
 
 __all__ = [
     'AbstractHttpCrawler',
     'AbstractHttpParser',
+    'HttpCrawlerOptions',
     'ParsedHttpCrawlingContext',
 ]
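
Together, the two `__init__.py` changes above re-export `HttpCrawlerOptions` from the public `crawlee.crawlers` package; only the export is visible in this diff, so the snippet below shows nothing beyond the fact that the name now resolves alongside the crawler classes:

# After this commit the name is importable from the public package:
from crawlee.crawlers import AbstractHttpCrawler, HttpCrawlerOptions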
