Skip to content

Commit baa2052

Browse files
committed
Use context result map for handling request handler results
1 parent b4ba31b commit baa2052

3 files changed

Lines changed: 27 additions & 23 deletions

File tree

src/crawlee/_types.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -558,3 +558,7 @@ class BasicCrawlingContext:
558558

559559
log: logging.Logger
560560
"""Logger instance."""
561+
562+
def __hash__(self) -> int:
563+
"""Return hash of the context. Each context is considered unique."""
564+
return id(self)

src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -319,9 +319,7 @@ async def from_pw_pipeline_to_top_router(context: PlaywrightCrawlingContext) ->
319319
)
320320

321321
@override
322-
async def _run_request_handler(
323-
self, context: BasicCrawlingContext, result: RequestHandlerRunResult
324-
) -> RequestHandlerRunResult:
322+
async def _run_request_handler(self, context: BasicCrawlingContext) -> None:
325323
"""Override BasicCrawler method that delegates request processing to sub crawlers.
326324
327325
To decide which sub crawler should process the request it runs `rendering_type_predictor`.
@@ -343,7 +341,8 @@ async def _run_request_handler(
343341

344342
static_run = await self._crawl_one(rendering_type='static', context=context)
345343
if static_run.result and self.result_checker(static_run.result):
346-
return static_run.result
344+
self._context_result_map[context] = static_run.result
345+
return
347346
if static_run.exception:
348347
context.log.exception(
349348
msg=f'Static crawler: failed for {context.request.url}', exc_info=static_run.exception
@@ -367,7 +366,12 @@ async def _run_request_handler(
367366
pw_run = await self._crawl_one('client only', context=context)
368367
self.track_browser_request_handler_runs()
369368

369+
if pw_run.exception is not None:
370+
raise pw_run.exception
371+
370372
if pw_run.result:
373+
self._context_result_map[context] = pw_run.result
374+
371375
if should_detect_rendering_type:
372376
detection_result: RenderingType
373377
static_run = await self._crawl_one('static', context=context, state=old_state_copy)
@@ -379,11 +383,6 @@ async def _run_request_handler(
379383

380384
context.log.debug(f'Detected rendering type {detection_result} for {context.request.url}')
381385
self.rendering_type_predictor.store_result(context.request, detection_result)
382-
return pw_run.result
383-
if pw_run.exception is not None:
384-
raise pw_run.exception
385-
# Unreachable code, but mypy can't know it.
386-
raise RuntimeError('Missing both result and exception.')
387386

388387
def pre_navigation_hook(
389388
self,

src/crawlee/crawlers/_basic/_basic_crawler.py

Lines changed: 15 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from pathlib import Path
1515
from typing import TYPE_CHECKING, Any, Callable, Generic, Union, cast
1616
from urllib.parse import ParseResult, urlparse
17+
from weakref import WeakKeyDictionary
1718

1819
from tldextract import TLDExtract
1920
from typing_extensions import NotRequired, TypedDict, TypeVar, Unpack, assert_never
@@ -290,6 +291,9 @@ def __init__(
290291
self._failed_request_handler: FailedRequestHandler[TCrawlingContext | BasicCrawlingContext] | None = None
291292
self._abort_on_error = abort_on_error
292293

294+
# Context of each request with matching result.
295+
self._context_result_map = WeakKeyDictionary[BasicCrawlingContext, RequestHandlerRunResult]()
296+
293297
# Context pipeline
294298
self._context_pipeline = (_context_pipeline or ContextPipeline()).compose(self._check_url_after_redirects)
295299

@@ -908,9 +912,9 @@ async def send_request(
908912

909913
return send_request
910914

911-
async def _commit_request_handler_result(
912-
self, context: BasicCrawlingContext, result: RequestHandlerRunResult
913-
) -> None:
915+
async def _commit_request_handler_result(self, context: BasicCrawlingContext) -> None:
916+
result = self._context_result_map[context]
917+
914918
request_manager = await self.get_request_manager()
915919
origin = context.request.loaded_url or context.request.url
916920

@@ -1018,19 +1022,20 @@ async def __run_task_function(self) -> None:
10181022

10191023
session = await self._get_session()
10201024
proxy_info = await self._get_proxy_info(request, session)
1021-
empty_result = RequestHandlerRunResult(key_value_store_getter=self.get_key_value_store)
1025+
result = RequestHandlerRunResult(key_value_store_getter=self.get_key_value_store)
10221026

10231027
context = BasicCrawlingContext(
10241028
request=request,
10251029
session=session,
10261030
proxy_info=proxy_info,
10271031
send_request=self._prepare_send_request_function(session, proxy_info),
1028-
add_requests=empty_result.add_requests,
1029-
push_data=empty_result.push_data,
1030-
get_key_value_store=empty_result.get_key_value_store,
1032+
add_requests=result.add_requests,
1033+
push_data=result.push_data,
1034+
get_key_value_store=result.get_key_value_store,
10311035
use_state=self._use_state,
10321036
log=self._logger,
10331037
)
1038+
self._context_result_map[context] = result
10341039

10351040
statistics_id = request.id or request.unique_key
10361041
self._statistics.record_request_processing_start(statistics_id)
@@ -1039,12 +1044,11 @@ async def __run_task_function(self) -> None:
10391044
request.state = RequestState.REQUEST_HANDLER
10401045

10411046
try:
1042-
result = await self._run_request_handler(context=context, result=empty_result)
1047+
await self._run_request_handler(context=context)
10431048
except asyncio.TimeoutError as e:
10441049
raise RequestHandlerError(e, context) from e
10451050

1046-
await self._commit_request_handler_result(context, result)
1047-
1051+
await self._commit_request_handler_result(context)
10481052
await wait_for(
10491053
lambda: request_manager.mark_request_as_handled(context.request),
10501054
timeout=self._internal_timeout,
@@ -1132,17 +1136,14 @@ async def __run_task_function(self) -> None:
11321136
)
11331137
raise
11341138

1135-
async def _run_request_handler(
1136-
self, context: BasicCrawlingContext, result: RequestHandlerRunResult
1137-
) -> RequestHandlerRunResult:
1139+
async def _run_request_handler(self, context: BasicCrawlingContext) -> None:
11381140
await wait_for(
11391141
lambda: self._context_pipeline(context, self.router),
11401142
timeout=self._request_handler_timeout,
11411143
timeout_message='Request handler timed out after '
11421144
f'{self._request_handler_timeout.total_seconds()} seconds',
11431145
logger=self._logger,
11441146
)
1145-
return result
11461147

11471148
def _is_session_blocked_status_code(self, session: Session | None, status_code: int) -> bool:
11481149
"""Check if the HTTP status code indicates that the session was blocked by the target website.

0 commit comments

Comments
 (0)