Skip to content

Commit a33bc53

Browse files
committed
feat: add storage helpers to crawler & context
Closes #100, closes #172
1 parent f48c806 commit a33bc53

15 files changed

Lines changed: 487 additions & 213 deletions

File tree

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@
1111
- Browser rotation with a maximum number of pages opened per browser.
1212
- Add emitting of a persist-state event to the event manager
1313
- Add batched request addition in `RequestQueue`
14+
- Add start requests option to `BasicCrawler`
15+
- Add storage-related helpers `get_data`, `push_data` and `export_to` to `BasicCrawler` and `BasicCrawlingContext`
1416

1517
## [0.0.4](../../releases/tag/v0.0.4) - 2024-05-30
1618

src/crawlee/basic_crawler/basic_crawler.py

Lines changed: 110 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111

1212
import httpx
1313
from tldextract import TLDExtract
14-
from typing_extensions import NotRequired, TypedDict, TypeVar, assert_never
14+
from typing_extensions import NotRequired, TypedDict, TypeVar, Unpack, assert_never
1515

1616
from crawlee import Glob
1717
from crawlee._utils.wait import wait_for
@@ -32,10 +32,10 @@
3232
from crawlee.enqueue_strategy import EnqueueStrategy
3333
from crawlee.events.local_event_manager import LocalEventManager
3434
from crawlee.http_clients.httpx_client import HttpxClient
35-
from crawlee.models import BaseRequestData, Request, RequestState
35+
from crawlee.models import BaseRequestData, DatasetItemsListPage, Request, RequestState
3636
from crawlee.sessions import SessionPool
3737
from crawlee.statistics.statistics import Statistics
38-
from crawlee.storages import RequestQueue
38+
from crawlee.storages import Dataset, KeyValueStore, RequestQueue
3939

4040
if TYPE_CHECKING:
4141
import re
@@ -44,6 +44,7 @@
4444
from crawlee.proxy_configuration import ProxyConfiguration, ProxyInfo
4545
from crawlee.sessions.session import Session
4646
from crawlee.statistics.models import FinalStatistics, StatisticsState
47+
from crawlee.storages.dataset import ExportToKwargs, GetDataKwargs, PushDataKwargs
4748
from crawlee.storages.request_provider import RequestProvider
4849

4950
TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext, default=BasicCrawlingContext)
@@ -86,6 +87,7 @@ class BasicCrawler(Generic[TCrawlingContext]):
8687

8788
def __init__(
8889
self,
90+
start_requests: Sequence[str | BaseRequestData | Request] | None = None,
8991
*,
9092
request_provider: RequestProvider | None = None,
9193
request_handler: Callable[[TCrawlingContext], Awaitable[None]] | None = None,
@@ -106,6 +108,7 @@ def __init__(
106108
"""Initialize the BasicCrawler.
107109
108110
Args:
111+
start_requests: A list of URLs to start crawling from
109112
request_provider: Provides requests to be processed
110113
request_handler: A callable to which request handling is delegated
111114
http_client: HTTP client to be used for `BasicCrawlingContext.send_request` and HTTP-only crawling.
@@ -126,6 +129,7 @@ def __init__(
126129
This parameter is meant to be used by child classes, not when BasicCrawler is instantiated directly.
127130
_additional_context_managers: Additional context managers to be used in the crawler lifecycle.
128131
"""
132+
self._start_requests = start_requests or []
129133
self._router: Router[TCrawlingContext] | None = None
130134

131135
if isinstance(cast(Router, request_handler), Router):
@@ -227,13 +231,39 @@ async def _get_proxy_info(self, request: Request, session: Session | None) -> Pr
227231
proxy_tier=None,
228232
)
229233

230-
async def get_request_provider(self) -> RequestProvider:
234+
async def get_request_provider(
235+
self,
236+
*,
237+
id: str | None = None,
238+
name: str | None = None,
239+
configuration: Configuration | None = None,
240+
) -> RequestProvider:
231241
"""Return the configured request provider. If none is configured, open and return the default request queue."""
232242
if not self._request_provider:
233-
self._request_provider = await RequestQueue.open()
243+
self._request_provider = await RequestQueue.open(id=id, name=name, configuration=configuration)
234244

235245
return self._request_provider
236246

247+
async def get_dataset(
248+
self,
249+
*,
250+
id: str | None = None,
251+
name: str | None = None,
252+
configuration: Configuration | None = None,
253+
) -> Dataset:
254+
"""Return the dataset with the given ID or name. If none is provided, return the default dataset."""
255+
return await Dataset.open(id=id, name=name, configuration=configuration)
256+
257+
async def get_key_value_store(
258+
self,
259+
*,
260+
id: str | None = None,
261+
name: str | None = None,
262+
configuration: Configuration | None = None,
263+
) -> KeyValueStore:
264+
"""Return the key-value store with the given ID or name. If none is provided, return the default KVS."""
265+
return await KeyValueStore.open(id=id, name=name, configuration=configuration)
266+
237267
def error_handler(self, handler: ErrorHandler[TCrawlingContext]) -> ErrorHandler[TCrawlingContext]:
238268
"""Decorator for configuring an error handler (called after a request handler error and before retrying)."""
239269
self._error_handler = handler
@@ -246,7 +276,7 @@ def failed_request_handler(
246276
self._failed_request_handler = handler
247277
return handler
248278

249-
async def run(self, requests: list[str | BaseRequestData] | None = None) -> FinalStatistics:
279+
async def run(self, requests: Sequence[str | BaseRequestData | Request] | None = None) -> FinalStatistics:
250280
"""Run the crawler until all requests are processed."""
251281
if self._running:
252282
raise RuntimeError(
@@ -261,6 +291,8 @@ async def run(self, requests: list[str | BaseRequestData] | None = None) -> Fina
261291
if self._use_session_pool:
262292
await self._session_pool.reset_store()
263293

294+
await self.add_requests(self._start_requests)
295+
264296
if requests is not None:
265297
await self.add_requests(requests)
266298

@@ -286,12 +318,13 @@ async def run(self, requests: list[str | BaseRequestData] | None = None) -> Fina
286318

287319
self._running = False
288320
self._has_finished_before = True
321+
self._start_requests = [] # Clear the start requests to prevent them from being added again
289322

290323
return self._statistics.calculate()
291324

292325
async def add_requests(
293326
self,
294-
requests: Sequence[BaseRequestData | Request | str],
327+
requests: Sequence[str | BaseRequestData | Request],
295328
*,
296329
batch_size: int = 1000,
297330
wait_time_between_batches: timedelta = timedelta(0),
@@ -317,6 +350,73 @@ async def add_requests(
317350
wait_for_all_requests_to_be_added_timeout=wait_for_all_requests_to_be_added_timeout,
318351
)
319352

353+
async def push_data(
354+
self,
355+
dataset_id: str | None = None,
356+
dataset_name: str | None = None,
357+
configuration: Configuration | None = None,
358+
**kwargs: Unpack[PushDataKwargs],
359+
) -> None:
360+
"""Push data to a dataset.
361+
362+
This helper method simplifies the process of pushing data to a dataset. It opens the specified
363+
dataset and then pushes the provided data to it.
364+
365+
Args:
366+
data: The data to push to the dataset.
367+
dataset_id: The ID of the dataset.
368+
dataset_name: The name of the dataset.
369+
configuration: The configuration settings for accessing the dataset.
370+
kwargs: Keyword arguments to be passed to the dataset's `push_data` method.
371+
"""
372+
dataset = await Dataset.open(id=dataset_id, name=dataset_name, configuration=configuration)
373+
await dataset.push_data(**kwargs)
374+
375+
async def get_data(
376+
self,
377+
dataset_id: str | None = None,
378+
dataset_name: str | None = None,
379+
configuration: Configuration | None = None,
380+
**kwargs: Unpack[GetDataKwargs],
381+
) -> DatasetItemsListPage:
382+
"""Retrieve data from a dataset.
383+
384+
This helper method simplifies the process of retrieving data from a dataset. It opens the specified
385+
dataset and then retrieves the data based on the provided parameters.
386+
387+
Args:
388+
dataset_id: The ID of the dataset.
389+
dataset_name: The name of the dataset.
390+
configuration: The configuration settings for accessing the dataset.
391+
kwargs: Keyword arguments to be passed to the dataset's `get_data` method.
392+
393+
Returns:
394+
The retrieved data.
395+
"""
396+
dataset = await Dataset.open(id=dataset_id, name=dataset_name, configuration=configuration)
397+
return await dataset.get_data(**kwargs)
398+
399+
async def export_to(
400+
self,
401+
dataset_id: str | None = None,
402+
dataset_name: str | None = None,
403+
configuration: Configuration | None = None,
404+
**kwargs: Unpack[ExportToKwargs],
405+
) -> None:
406+
"""Export data from a dataset.
407+
408+
This helper method simplifies the process of exporting data from a dataset. It opens the specified
409+
dataset and then exports the data based on the provided parameters.
410+
411+
Args:
412+
dataset_id: The ID of the dataset.
413+
dataset_name: The name of the dataset.
414+
configuration: The configuration settings for accessing the dataset.
415+
kwargs: Keyword arguments to be passed to the dataset's `export_to` method.
416+
"""
417+
dataset = await Dataset.open(id=dataset_id, name=dataset_name, configuration=configuration)
418+
return await dataset.export_to(**kwargs)
419+
320420
def _should_retry_request(self, crawling_context: BasicCrawlingContext, error: Exception) -> bool:
321421
if crawling_context.request.no_retry:
322422
return False
@@ -517,6 +617,9 @@ async def __run_task_function(self) -> None:
517617
proxy_info=proxy_info,
518618
send_request=self._prepare_send_request_function(session, proxy_info),
519619
add_requests=result.add_requests,
620+
get_data=self.get_data,
621+
push_data=self.push_data,
622+
export_to=self.export_to,
520623
)
521624

522625
statistics_id = request.id or request.unique_key

src/crawlee/basic_crawler/types.py

Lines changed: 66 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,13 @@
1010

1111
if TYPE_CHECKING:
1212
from crawlee import Glob
13+
from crawlee.configuration import Configuration
1314
from crawlee.enqueue_strategy import EnqueueStrategy
1415
from crawlee.http_clients.base_http_client import HttpResponse
15-
from crawlee.models import BaseRequestData, Request
16+
from crawlee.models import BaseRequestData, DatasetItemsListPage, Request
1617
from crawlee.proxy_configuration import ProxyInfo
1718
from crawlee.sessions.session import Session
19+
from crawlee.storages.dataset import ExportToKwargs, GetDataKwargs, PushDataKwargs
1820

1921

2022
class AddRequestsFunctionKwargs(TypedDict):
@@ -28,10 +30,64 @@ class AddRequestsFunctionKwargs(TypedDict):
2830

2931

3032
class AddRequestsFunction(Protocol):
31-
"""Type of a function for adding URLs to the request queue with optional filtering."""
33+
"""Type of a function for adding URLs to the request queue with optional filtering.
34+
35+
This helper method simplifies the process of adding requests to the request provider. It opens the specified
36+
request provider and adds the requests to it.
37+
"""
3238

3339
def __call__( # noqa: D102
34-
self, requests: Sequence[str | BaseRequestData], **kwargs: Unpack[AddRequestsFunctionKwargs]
40+
self,
41+
requests: Sequence[str | BaseRequestData | Request],
42+
**kwargs: Unpack[AddRequestsFunctionKwargs],
43+
) -> Coroutine[None, None, None]: ...
44+
45+
46+
class GetDataFunction(Protocol):
47+
"""Type of a function for getting data from the dataset.
48+
49+
This helper method simplifies the process of retrieving data from a dataset. It opens the specified
50+
dataset and then retrieves the data based on the provided parameters.
51+
"""
52+
53+
def __call__( # noqa: D102
54+
self,
55+
dataset_id: str | None = None,
56+
dataset_name: str | None = None,
57+
configuration: Configuration | None = None,
58+
**kwargs: Unpack[GetDataKwargs],
59+
) -> Coroutine[None, None, DatasetItemsListPage]: ...
60+
61+
62+
class PushDataFunction(Protocol):
63+
"""Type of a function for pushing data to the dataset.
64+
65+
This helper method simplifies the process of pushing data to a dataset. It opens the specified
66+
dataset and then pushes the provided data to it.
67+
"""
68+
69+
def __call__( # noqa: D102
70+
self,
71+
dataset_id: str | None = None,
72+
dataset_name: str | None = None,
73+
configuration: Configuration | None = None,
74+
**kwargs: Unpack[PushDataKwargs],
75+
) -> Coroutine[None, None, None]: ...
76+
77+
78+
class ExportToFunction(Protocol):
79+
"""Type of a function for exporting data from a dataset.
80+
81+
This helper method simplifies the process of exporting data from a dataset. It opens the specified
82+
dataset and then exports its content to the key-value store.
83+
"""
84+
85+
def __call__( # noqa: D102
86+
self,
87+
dataset_id: str | None = None,
88+
dataset_name: str | None = None,
89+
configuration: Configuration | None = None,
90+
**kwargs: Unpack[ExportToKwargs],
3591
) -> Coroutine[None, None, None]: ...
3692

3793

@@ -69,12 +125,15 @@ class BasicCrawlingContext:
69125
proxy_info: ProxyInfo | None
70126
send_request: SendRequestFunction
71127
add_requests: AddRequestsFunction
128+
get_data: GetDataFunction
129+
push_data: PushDataFunction
130+
export_to: ExportToFunction
72131

73132

74133
class AddRequestsFunctionCall(AddRequestsFunctionKwargs):
75134
"""Record of a call to `add_requests`."""
76135

77-
requests: Sequence[str | BaseRequestData]
136+
requests: Sequence[str | BaseRequestData | Request]
78137

79138

80139
@dataclass()
@@ -84,7 +143,9 @@ class RequestHandlerRunResult:
84143
add_requests_calls: list[AddRequestsFunctionCall] = field(default_factory=list)
85144

86145
async def add_requests(
87-
self, requests: Sequence[str | BaseRequestData], **kwargs: Unpack[AddRequestsFunctionKwargs]
146+
self,
147+
requests: Sequence[str | BaseRequestData],
148+
**kwargs: Unpack[AddRequestsFunctionKwargs],
88149
) -> None:
89150
"""Track a call to the `add_requests` context helper."""
90151
self.add_requests_calls.append(AddRequestsFunctionCall(requests=requests, **kwargs))

src/crawlee/beautifulsoup_crawler/beautifulsoup_crawler.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -74,8 +74,11 @@ async def _make_http_request(self, context: BasicCrawlingContext) -> AsyncGenera
7474
request=context.request,
7575
session=context.session,
7676
proxy_info=context.proxy_info,
77-
send_request=context.send_request,
7877
add_requests=context.add_requests,
78+
send_request=context.send_request,
79+
get_data=context.get_data,
80+
push_data=context.push_data,
81+
export_to=context.export_to,
7982
http_response=result.http_response,
8083
)
8184

@@ -134,9 +137,12 @@ async def enqueue_links(
134137
request=context.request,
135138
session=context.session,
136139
proxy_info=context.proxy_info,
137-
send_request=context.send_request,
138-
add_requests=context.add_requests,
139140
enqueue_links=enqueue_links,
141+
add_requests=context.add_requests,
142+
send_request=context.send_request,
143+
get_data=context.get_data,
144+
push_data=context.push_data,
145+
export_to=context.export_to,
140146
http_response=context.http_response,
141147
soup=soup,
142148
)

0 commit comments

Comments
 (0)