Skip to content

Commit 340c53d

Browse files
committed
Add crawl_one_required_contexts property. (Alternative to accessing internals of sub crawlers)
Clean up commit results.
1 parent 957915a commit 340c53d

4 files changed

Lines changed: 44 additions & 14 deletions

File tree

src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py

Lines changed: 25 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import asyncio
44
import logging
5+
from contextlib import AsyncExitStack
56
from copy import deepcopy
67
from logging import getLogger
78
from random import random
@@ -189,14 +190,27 @@ async def run(
189190
purge_request_queue: If this is `True` and the crawler is not being run for the first time, the default
190191
request queue will be purged.
191192
"""
192-
# TODO: Create something more robust that does not leak implementation so much
193-
async with (self.beautifulsoup_crawler.statistics, self.playwright_crawler.statistics,
194-
self.playwright_crawler._additional_context_managers[0]):
193+
contexts_to_enter = [
194+
cm
195+
for cm in (self.beautifulsoup_crawler.crawl_one_required_contexts
196+
+ self.playwright_crawler.crawl_one_required_contexts)
197+
if cm and getattr(cm, 'active', False) is False
198+
]
199+
200+
# Enter contexts required by sub crawler for them to be able to do `crawl_one`
201+
async with AsyncExitStack() as exit_stack:
202+
for context in contexts_to_enter:
203+
await exit_stack.enter_async_context(context)
195204
return await super().run(requests=requests, purge_request_queue=purge_request_queue)
196205

206+
# AsyncExitStack can in theory swallow exceptions and so the return might not execute.
207+
# https://github.com/python/mypy/issues/7726
208+
raise RuntimeError('FinalStatistics not created.')
209+
210+
197211
# Can't use override as mypy does not like it for double underscore private method.
198212
async def _BasicCrawler__run_request_handler(self, context: BasicCrawlingContext) -> None: # noqa: N802
199-
"""Overrided BasicCrawler method that delegates request processing to sub crawlers.
213+
"""Override BasicCrawler method that delegates request processing to sub crawlers.
200214
201215
To decide which sub crawler should process the request it runs `rendering_type_predictor`.
202216
To check if results are valid it uses `result_checker`.
@@ -271,17 +285,16 @@ async def _run_subcrawler(crawler: BeautifulSoupCrawler | PlaywrightCrawler,
271285
self.rendering_type_predictor.store_result(context.request.url, context.request.label, detection_result)
272286

273287
async def commit_result(self, result: RequestHandlerRunResult, context: BasicCrawlingContext) -> None:
274-
result_tasks = []
275-
result_tasks.extend([
276-
asyncio.create_task(context.push_data(**kwargs)) for kwargs in result.push_data_calls])
277-
result_tasks.extend([
278-
asyncio.create_task(context.add_requests(**kwargs)) for kwargs in result.add_requests_calls])
288+
result_tasks = [
289+
asyncio.create_task(context.push_data(**kwargs)) for kwargs in result.push_data_calls
290+
] + [
291+
asyncio.create_task(context.add_requests(**kwargs)) for kwargs in result.add_requests_calls
292+
] + [
293+
asyncio.create_task(self._commit_key_value_store_changes(result))
294+
]
279295

280-
# What to do with KV changes????
281296
await asyncio.gather(*result_tasks)
282297

283-
# Optimize if needed
284-
await self._commit_key_value_store_changes(result)
285298

286299

287300
def pre_navigation_hook(self, hook: Callable[[Any], Awaitable[None]]) -> None:

src/crawlee/crawlers/_basic/_basic_crawler.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -523,6 +523,7 @@ def sigint_handler() -> None:
523523

524524
return final_statistics
525525

526+
526527
async def _run_crawler(self) -> None:
527528
event_manager = service_locator.get_event_manager()
528529

@@ -1122,6 +1123,13 @@ async def __run_request_handler(self, context: BasicCrawlingContext) -> None:
11221123
await self._context_pipeline(context, self.router)
11231124

11241125

1126+
@property
1127+
def crawl_one_required_contexts(self) -> list[AbstractAsyncContextManager]:
1128+
"""Contexts that have to be active before `crawl_one` can be called."""
1129+
contexts: list[AbstractAsyncContextManager] = []
1130+
contexts.append(self.statistics)
1131+
return contexts
1132+
11251133
async def crawl_one(self, *, context: BasicCrawlingContext,
11261134
request_handler_timeout: timedelta,
11271135
result: RequestHandlerRunResult,

src/crawlee/crawlers/_playwright/_playwright_crawler.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020

2121
if TYPE_CHECKING:
2222
from collections.abc import AsyncGenerator, Awaitable, Mapping
23+
from contextlib import AbstractAsyncContextManager
2324

2425
from typing_extensions import Unpack
2526

@@ -285,3 +286,11 @@ def pre_navigation_hook(self, hook: Callable[[PlaywrightPreNavCrawlingContext],
285286
hook: A coroutine function to be called before each navigation.
286287
"""
287288
self._pre_navigation_hooks.append(hook)
289+
290+
291+
@property
292+
def crawl_one_required_contexts(self) -> list[AbstractAsyncContextManager]:
293+
"""Contexts that have to be active before `crawl_one` can be called."""
294+
contexts = super().crawl_one_required_contexts
295+
contexts.append(self._browser_pool)
296+
return contexts

tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import logging
44
from datetime import timedelta
55
from itertools import cycle
6-
from typing import TYPE_CHECKING, cast
6+
from typing import TYPE_CHECKING, Any, cast
77
from unittest.mock import Mock, patch
88

99
import pytest
@@ -250,7 +250,7 @@ def test_adaptive_default_hooks_raise_exception() -> None:
250250

251251
with pytest.raises(RuntimeError):
252252
@crawler.pre_navigation_hook
253-
def some_hook() -> None:
253+
async def some_hook(whatever: Any) -> None:
254254
pass
255255

256256

0 commit comments

Comments (0)