|
2 | 2 |
|
3 | 3 | import asyncio |
4 | 4 | import logging |
| 5 | +from contextlib import AsyncExitStack |
5 | 6 | from copy import deepcopy |
6 | 7 | from logging import getLogger |
7 | 8 | from random import random |
@@ -189,14 +190,27 @@ async def run( |
189 | 190 | purge_request_queue: If this is `True` and the crawler is not being run for the first time, the default |
190 | 191 | request queue will be purged. |
191 | 192 | """ |
192 | | - # TODO: Create something more robust that does not leak implementation so much |
193 | | - async with (self.beautifulsoup_crawler.statistics, self.playwright_crawler.statistics, |
194 | | - self.playwright_crawler._additional_context_managers[0]): |
| 193 | + contexts_to_enter = [ |
| 194 | + cm |
| 195 | + for cm in (self.beautifulsoup_crawler.crawl_one_required_contexts |
| 196 | + + self.playwright_crawler.crawl_one_required_contexts) |
| 197 | + if cm and getattr(cm, 'active', False) is False |
| 198 | + ] |
| 199 | + |
| 200 | + # Enter contexts required by sub crawler for them to be able to do `crawl_one` |
| 201 | + async with AsyncExitStack() as exit_stack: |
| 202 | + for context in contexts_to_enter: |
| 203 | + await exit_stack.enter_async_context(context) |
195 | 204 | return await super().run(requests=requests, purge_request_queue=purge_request_queue) |
196 | 205 |
|
| 206 | + # AsyncExitStack can in theory swallow exceptions and so the return might not execute. |
| 207 | + # https://github.com/python/mypy/issues/7726 |
| 208 | + raise RuntimeError('FinalStatistics not created.') |
| 209 | + |
| 210 | + |
197 | 211 | # Can't use override as mypy does not like it for double underscore private method. |
198 | 212 | async def _BasicCrawler__run_request_handler(self, context: BasicCrawlingContext) -> None: # noqa: N802 |
199 | | - """Overrided BasicCrawler method that delegates request processing to sub crawlers. |
| 213 | + """Override BasicCrawler method that delegates request processing to sub crawlers. |
200 | 214 |
|
201 | 215 | To decide which sub crawler should process the request it runs `rendering_type_predictor`. |
202 | 216 | To check if results are valid it uses `result_checker`. |
@@ -271,17 +285,16 @@ async def _run_subcrawler(crawler: BeautifulSoupCrawler | PlaywrightCrawler, |
271 | 285 | self.rendering_type_predictor.store_result(context.request.url, context.request.label, detection_result) |
272 | 286 |
|
async def commit_result(self, result: RequestHandlerRunResult, context: BasicCrawlingContext) -> None:
    """Replay the buffered sub-crawler results against the real crawling context.

    Schedules every recorded `push_data` and `add_requests` call, plus the commit of
    pending key-value store changes, as concurrent tasks and waits for all of them.
    """
    pending: list[asyncio.Task] = []
    for kwargs in result.push_data_calls:
        pending.append(asyncio.create_task(context.push_data(**kwargs)))
    for kwargs in result.add_requests_calls:
        pending.append(asyncio.create_task(context.add_requests(**kwargs)))
    # Key-value store changes are committed concurrently with the data/request pushes.
    pending.append(asyncio.create_task(self._commit_key_value_store_changes(result)))

    await asyncio.gather(*pending)
285 | 298 |
|
286 | 299 |
|
287 | 300 | def pre_navigation_hook(self, hook: Callable[[Any], Awaitable[None]]) -> None: |
|
0 commit comments