Skip to content

Commit 9fc4648

Browse files
authored
feat: use SessionPool in BasicCrawler (#128)
Closes #110. `BasicCrawler` now uses `SessionPool` to fill in a session into the crawling context; there is a separate retry mechanism for session errors (when we get blocked), and cookies from HTTP responses are persisted in the respective sessions.
1 parent 93001a8 commit 9fc4648

16 files changed

Lines changed: 470 additions & 87 deletions

File tree

src/crawlee/_utils/blocked.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
# Inspiration: https://github.com/apify/crawlee/blob/v3.9.2/packages/utils/src/internals/blocked.ts

# Selectors matching the Cloudflare Turnstile challenge widget. Kept separate so
# Cloudflare-specific detection can be referenced on its own if needed.
CLOUDFLARE_RETRY_CSS_SELECTORS = [
    '#turnstile-wrapper iframe[src^="https://challenges.cloudflare.com"]',
]

# Cloudflare selectors plus selectors for other common bot-protection pages
# (Google sorry-page terms link, Incapsula resource iframe).
RETRY_CSS_SELECTORS = [
    *CLOUDFLARE_RETRY_CSS_SELECTORS,
    'div#infoDiv0 a[href*="//www.google.com/policies/terms/"]',
    'iframe[src*="_Incapsula_Resource"]',
]
"""
CSS selectors for elements that should trigger a retry, as the crawler is likely getting blocked.
"""

# Substrings of error messages indicating the proxy itself failed or was blocked;
# matching errors should cause a proxy/session rotation rather than a plain retry.
ROTATE_PROXY_ERRORS = [
    'ECONNRESET',
    'ECONNREFUSED',
    'ERR_PROXY_CONNECTION_FAILED',
    'ERR_TUNNEL_CONNECTION_FAILED',
    'Proxy responded with',
]
"""
Content of proxy errors that should trigger a retry, as the proxy is likely getting blocked / is malfunctioning.
"""

src/crawlee/basic_crawler/basic_crawler.py

Lines changed: 103 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from __future__ import annotations
33

44
import tempfile
5+
from contextlib import AsyncExitStack
56
from datetime import timedelta
67
from functools import partial
78
from logging import getLogger
@@ -18,9 +19,13 @@
1819
from crawlee.autoscaling.system_status import SystemStatus
1920
from crawlee.basic_crawler.context_pipeline import (
2021
ContextPipeline,
22+
)
23+
from crawlee.basic_crawler.errors import (
2124
ContextPipelineInitializationError,
2225
ContextPipelineInterruptedError,
2326
RequestHandlerError,
27+
SessionError,
28+
UserDefinedErrorHandlerError,
2429
)
2530
from crawlee.basic_crawler.router import Router
2631
from crawlee.basic_crawler.types import (
@@ -34,12 +39,14 @@
3439
from crawlee.events.local_event_manager import LocalEventManager
3540
from crawlee.http_clients.httpx_client import HttpxClient
3641
from crawlee.request import BaseRequestData, Request, RequestState
42+
from crawlee.sessions import SessionPool
3743
from crawlee.storages.request_queue import RequestQueue
3844

3945
if TYPE_CHECKING:
4046
import re
4147

4248
from crawlee.http_clients.base_http_client import BaseHttpClient, HttpResponse
49+
from crawlee.sessions.session import Session
4350
from crawlee.storages.request_provider import RequestProvider
4451

4552
TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext, default=BasicCrawlingContext)
@@ -49,10 +56,6 @@
4956
logger = getLogger(__name__)
5057

5158

52-
class UserDefinedErrorHandlerError(Exception):
53-
"""Wraps an exception thrown from an user-defined error handler."""
54-
55-
5659
class BasicCrawler(Generic[TCrawlingContext]):
5760
"""Provides a simple framework for parallel crawling of web pages.
5861
@@ -72,8 +75,12 @@ def __init__(
7275
http_client: BaseHttpClient | None = None,
7376
concurrency_settings: ConcurrencySettings | None = None,
7477
max_request_retries: int = 3,
78+
max_session_rotations: int = 10,
7579
configuration: Configuration | None = None,
7680
request_handler_timeout: timedelta = timedelta(minutes=1),
81+
session_pool: SessionPool | None = None,
82+
use_session_pool: bool = True,
83+
retry_on_blocked: bool = True,
7784
_context_pipeline: ContextPipeline[TCrawlingContext] | None = None,
7885
) -> None:
7986
"""Initialize the BasicCrawler.
@@ -84,8 +91,14 @@ def __init__(
8491
http_client: HTTP client to be used for `BasicCrawlingContext.send_request` and HTTP-only crawling.
8592
concurrency_settings: Allows fine-tuning concurrency levels
8693
max_request_retries: Maximum amount of attempts at processing a request
94+
max_session_rotations: Maximum number of session rotations per request.
95+
The crawler will automatically rotate the session in case of a proxy error or if it gets blocked by
96+
the website.
8797
configuration: Crawler configuration
8898
request_handler_timeout: How long is a single request handler allowed to run
99+
use_session_pool: Enables using the session pool for crawling
100+
session_pool: A preconfigured SessionPool instance if you wish to use non-default configuration
101+
retry_on_blocked: If set to True, the crawler will try to automatically bypass any detected bot protection
89102
_context_pipeline: Allows extending the request lifecycle and modifying the crawling context.
90103
This parameter is meant to be used by child classes, not when BasicCrawler is instantiated directly.
91104
"""
@@ -105,6 +118,7 @@ def __init__(
105118
self._failed_request_handler: FailedRequestHandler[TCrawlingContext] | None = None
106119

107120
self._max_request_retries = max_request_retries
121+
self._max_session_rotations = max_session_rotations
108122

109123
self._request_provider = request_provider
110124
self._configuration = configuration or Configuration()
@@ -129,6 +143,11 @@ def __init__(
129143
concurrency_settings=concurrency_settings,
130144
)
131145

146+
self._use_session_pool = use_session_pool
147+
self._session_pool: SessionPool = session_pool or SessionPool()
148+
149+
self._retry_on_blocked = retry_on_blocked
150+
132151
@property
133152
def router(self) -> Router[TCrawlingContext]:
134153
"""The router used to handle each individual crawling request."""
@@ -144,6 +163,20 @@ def router(self, router: Router[TCrawlingContext]) -> None:
144163

145164
self._router = router
146165

166+
async def _get_session(self) -> Session | None:
167+
"""If session pool is being used, try to take a session from it."""
168+
if not self._use_session_pool:
169+
return None
170+
171+
return await wait_for(
172+
self._session_pool.get_session,
173+
timeout=self._internal_timeout,
174+
timeout_message='Fetching a session from the pool timed out after '
175+
f'{self._internal_timeout.total_seconds()} seconds',
176+
max_retries=3,
177+
logger=logger,
178+
)
179+
147180
async def get_request_provider(self) -> RequestProvider:
148181
"""Return the configured request provider. If none is configured, open and return the default request queue."""
149182
if not self._request_provider:
@@ -188,19 +221,29 @@ async def run(self, requests: list[str | BaseRequestData] | None = None) -> Fina
188221
if requests is not None:
189222
await self.add_requests(requests)
190223

191-
async with self._event_manager, self._snapshotter:
224+
async with AsyncExitStack() as exit_stack:
225+
await exit_stack.enter_async_context(self._event_manager)
226+
await exit_stack.enter_async_context(self._snapshotter)
227+
228+
if self._use_session_pool:
229+
await exit_stack.enter_async_context(self._session_pool)
230+
192231
await self._pool.run()
193232

194233
return FinalStatistics()
195234

196-
def _should_retry_request(self, crawling_context: BasicCrawlingContext) -> bool:
235+
def _should_retry_request(self, crawling_context: BasicCrawlingContext, error: Exception) -> bool:
236+
if crawling_context.request.no_retry:
237+
return False
238+
239+
if isinstance(error, SessionError):
240+
return ((crawling_context.request.session_rotation_count or 0) + 1) < self._max_session_rotations
241+
197242
max_request_retries = crawling_context.request.max_retries
198243
if max_request_retries is None:
199244
max_request_retries = self._max_request_retries
200245

201-
return (
202-
not crawling_context.request.no_retry and (crawling_context.request.retry_count + 1) < max_request_retries
203-
)
246+
return (crawling_context.request.retry_count + 1) < max_request_retries
204247

205248
async def _check_url_after_redirects(
206249
self, crawling_context: TCrawlingContext
@@ -273,7 +316,7 @@ def _check_url_patterns(
273316
async def _handle_request_error(self, crawling_context: TCrawlingContext, error: Exception) -> None:
274317
request_provider = await self.get_request_provider()
275318

276-
if self._should_retry_request(crawling_context):
319+
if self._should_retry_request(crawling_context, error):
277320
request = crawling_context.request
278321
request.retry_count += 1
279322

@@ -307,9 +350,16 @@ async def _handle_failed_request(self, crawling_context: TCrawlingContext, error
307350
except Exception as e:
308351
raise UserDefinedErrorHandlerError('Exception thrown in user-defined failed request handler') from e
309352

310-
def _prepare_send_request_function(self) -> SendRequestFunction:
311-
async def send_request(url: str, *, method: str = 'get', headers: dict[str, str] | None = None) -> HttpResponse:
312-
return await self._http_client.send_request(url, method=method, headers=httpx.Headers(headers))
353+
def _prepare_send_request_function(self, session: Session | None) -> SendRequestFunction:
354+
async def send_request(
355+
url: str,
356+
*,
357+
method: str = 'get',
358+
headers: dict[str, str] | None = None,
359+
) -> HttpResponse:
360+
return await self._http_client.send_request(
361+
url, method=method, headers=httpx.Headers(headers), session=session
362+
)
313363

314364
return send_request
315365

@@ -350,7 +400,7 @@ async def __is_task_ready_function(self) -> bool:
350400
request_provider = await self.get_request_provider()
351401
return not await request_provider.is_empty()
352402

353-
async def __run_task_function(self) -> None:
403+
async def __run_task_function(self) -> None: # noqa: PLR0912
354404
request_provider = await self.get_request_provider()
355405

356406
request = await wait_for(
@@ -364,14 +414,13 @@ async def __run_task_function(self) -> None:
364414
if request is None:
365415
return
366416

367-
# TODO: fetch session from the session pool
368-
# https://github.com/apify/crawlee-py/issues/110
369-
417+
session = await self._get_session()
370418
result = RequestHandlerRunResult()
371419

372420
crawling_context = BasicCrawlingContext(
373421
request=request,
374-
send_request=self._prepare_send_request_function(),
422+
session=session,
423+
send_request=self._prepare_send_request_function(session),
375424
add_requests=result.add_requests,
376425
)
377426

@@ -398,6 +447,9 @@ async def __run_task_function(self) -> None:
398447
)
399448

400449
request.state = RequestState.DONE
450+
451+
if crawling_context.session:
452+
crawling_context.session.mark_good()
401453
except RequestHandlerError as primary_error:
402454
primary_error = cast(
403455
RequestHandlerError[TCrawlingContext], primary_error
@@ -428,6 +480,34 @@ async def __run_task_function(self) -> None:
428480
)
429481
request.state = RequestState.ERROR
430482
raise
483+
484+
if crawling_context.session:
485+
crawling_context.session.mark_bad()
486+
except SessionError as session_error:
487+
if not crawling_context.session:
488+
raise RuntimeError('SessionError raised in a crawling context without a session') from session_error
489+
490+
if self._should_retry_request(crawling_context, session_error):
491+
logger.warning('Encountered a session error, rotating session and retrying')
492+
493+
crawling_context.session.retire()
494+
495+
if crawling_context.request.session_rotation_count is None:
496+
crawling_context.request.session_rotation_count = 0
497+
crawling_context.request.session_rotation_count += 1
498+
499+
await request_provider.reclaim_request(request)
500+
else:
501+
logger.exception('Request failed and reached maximum retries', exc_info=session_error)
502+
503+
await wait_for(
504+
lambda: request_provider.mark_request_as_handled(crawling_context.request),
505+
timeout=self._internal_timeout,
506+
timeout_message='Marking request as handled timed out after '
507+
f'{self._internal_timeout.total_seconds()} seconds',
508+
logger=logger,
509+
max_retries=3,
510+
)
431511
except ContextPipelineInterruptedError as interruped_error:
432512
logger.debug('The context pipeline was interrupted', exc_info=interruped_error)
433513

@@ -440,9 +520,9 @@ async def __run_task_function(self) -> None:
440520
max_retries=3,
441521
)
442522
except ContextPipelineInitializationError as initialization_error:
443-
if self._should_retry_request(crawling_context):
523+
if self._should_retry_request(crawling_context, initialization_error):
444524
logger.debug(
445-
'An exception occured during the initialization of crawling context, a retry is in order',
525+
'An exception occurred during the initialization of crawling context, a retry is in order',
446526
exc_info=initialization_error,
447527
)
448528

@@ -461,6 +541,9 @@ async def __run_task_function(self) -> None:
461541
logger=logger,
462542
max_retries=3,
463543
)
544+
545+
if crawling_context.session:
546+
crawling_context.session.mark_bad()
464547
except Exception as internal_error:
465548
logger.exception(
466549
'An exception occurred during handling of a request. This places the crawler '

src/crawlee/basic_crawler/context_pipeline.py

Lines changed: 11 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -4,46 +4,19 @@
44

55
from typing_extensions import TypeVar
66

7+
from crawlee.basic_crawler.errors import (
8+
ContextPipelineFinalizationError,
9+
ContextPipelineInitializationError,
10+
ContextPipelineInterruptedError,
11+
RequestHandlerError,
12+
SessionError,
13+
)
714
from crawlee.basic_crawler.types import BasicCrawlingContext
815

916
TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext, default=BasicCrawlingContext)
1017
TMiddlewareCrawlingContext = TypeVar('TMiddlewareCrawlingContext', bound=BasicCrawlingContext)
1118

1219

13-
class RequestHandlerError(Exception, Generic[TCrawlingContext]):
14-
"""Wraps an exception thrown from a request handler (router) and extends it with crawling context."""
15-
16-
def __init__(self, wrapped_exception: Exception, crawling_context: TCrawlingContext) -> None:
17-
self.wrapped_exception = wrapped_exception
18-
self.crawling_context = crawling_context
19-
20-
21-
class ContextPipelineInitializationError(Exception):
22-
"""Wraps an exception thrown in the initialization step of a context pipeline middleware.
23-
24-
We may not have the complete context at this point, so only `BasicCrawlingContext` is provided.
25-
"""
26-
27-
def __init__(self, wrapped_exception: Exception, crawling_context: BasicCrawlingContext) -> None:
28-
self.wrapped_exception = wrapped_exception
29-
self.crawling_context = crawling_context
30-
31-
32-
class ContextPipelineInterruptedError(Exception):
33-
"""May be thrown in the initialization phase of a middleware to signal that the request should not be processed."""
34-
35-
36-
class ContextPipelineFinalizationError(Exception):
37-
"""Wraps an exception thrown in the finalization step of a context pipeline middleware.
38-
39-
We may not have the complete context at this point, so only `BasicCrawlingContext` is provided.
40-
"""
41-
42-
def __init__(self, wrapped_exception: Exception, crawling_context: BasicCrawlingContext) -> None:
43-
self.wrapped_exception = wrapped_exception
44-
self.crawling_context = crawling_context
45-
46-
4720
class ContextPipeline(Generic[TCrawlingContext]):
4821
"""Encapsulates the logic of gradually enhancing the crawling context with additional information and utilities.
4922
@@ -87,6 +60,8 @@ async def __call__(
8760
middleware_instance = member._middleware(crawling_context) # noqa: SLF001
8861
try:
8962
result = await middleware_instance.__anext__()
63+
except SessionError: # Session errors get special treatment
64+
raise
9065
except StopAsyncIteration as e:
9166
raise RuntimeError('The middleware did not yield') from e
9267
except ContextPipelineInterruptedError:
@@ -99,6 +74,8 @@ async def __call__(
9974

10075
try:
10176
await final_context_consumer(cast(TCrawlingContext, crawling_context))
77+
except SessionError: # Session errors get special treatment
78+
raise
10279
except Exception as e:
10380
raise RequestHandlerError(e, crawling_context) from e
10481
finally:

0 commit comments

Comments
 (0)