1111
1212import httpx
1313from tldextract import TLDExtract
14- from typing_extensions import NotRequired , TypedDict , TypeVar , assert_never
14+ from typing_extensions import NotRequired , TypedDict , TypeVar , Unpack , assert_never
1515
1616from crawlee import Glob
1717from crawlee ._utils .wait import wait_for
3232from crawlee .enqueue_strategy import EnqueueStrategy
3333from crawlee .events .local_event_manager import LocalEventManager
3434from crawlee .http_clients .httpx_client import HttpxClient
35- from crawlee .models import BaseRequestData , Request , RequestState
35+ from crawlee .models import BaseRequestData , DatasetItemsListPage , Request , RequestState
3636from crawlee .sessions import SessionPool
3737from crawlee .statistics .statistics import Statistics
38- from crawlee .storages import RequestQueue
38+ from crawlee .storages import Dataset , KeyValueStore , RequestQueue
3939
4040if TYPE_CHECKING :
4141 import re
4444 from crawlee .proxy_configuration import ProxyConfiguration , ProxyInfo
4545 from crawlee .sessions .session import Session
4646 from crawlee .statistics .models import FinalStatistics , StatisticsState
47+ from crawlee .storages .dataset import ExportToKwargs , GetDataKwargs , PushDataKwargs
4748 from crawlee .storages .request_provider import RequestProvider
4849
4950TCrawlingContext = TypeVar ('TCrawlingContext' , bound = BasicCrawlingContext , default = BasicCrawlingContext )
@@ -86,6 +87,7 @@ class BasicCrawler(Generic[TCrawlingContext]):
8687
8788 def __init__ (
8889 self ,
90+ start_requests : Sequence [str | BaseRequestData | Request ] | None = None ,
8991 * ,
9092 request_provider : RequestProvider | None = None ,
9193 request_handler : Callable [[TCrawlingContext ], Awaitable [None ]] | None = None ,
@@ -106,6 +108,7 @@ def __init__(
106108 """Initialize the BasicCrawler.
107109
108110 Args:
111+ start_requests: A list of URLs to start crawling from
109112 request_provider: Provides requests to be processed
110113 request_handler: A callable to which request handling is delegated
111114 http_client: HTTP client to be used for `BasicCrawlingContext.send_request` and HTTP-only crawling.
@@ -126,6 +129,7 @@ def __init__(
126129 This parameter is meant to be used by child classes, not when BasicCrawler is instantiated directly.
127130 _additional_context_managers: Additional context managers to be used in the crawler lifecycle.
128131 """
132+ self ._start_requests = start_requests or []
129133 self ._router : Router [TCrawlingContext ] | None = None
130134
131135 if isinstance (cast (Router , request_handler ), Router ):
@@ -227,13 +231,39 @@ async def _get_proxy_info(self, request: Request, session: Session | None) -> Pr
227231 proxy_tier = None ,
228232 )
229233
async def get_request_provider(
    self,
    *,
    id: str | None = None,
    name: str | None = None,
    configuration: Configuration | None = None,
) -> RequestProvider:
    """Return the configured request provider, opening a request queue if none is configured.

    The opened queue is stored on the crawler and reused by later calls. As a consequence, `id`, `name`
    and `configuration` only take effect on the call that actually opens the queue; once a provider
    is set, they are ignored.

    Args:
        id: Forwarded to `RequestQueue.open` as `id`.
        name: Forwarded to `RequestQueue.open` as `name`.
        configuration: Forwarded to `RequestQueue.open` as `configuration`.

    Returns:
        The request provider held by the crawler.
    """
    if not self._request_provider:
        # Lazily open the queue on first use and remember it for subsequent calls.
        self._request_provider = await RequestQueue.open(id=id, name=name, configuration=configuration)

    return self._request_provider
236246
async def get_dataset(
    self,
    *,
    id: str | None = None,
    name: str | None = None,
    configuration: Configuration | None = None,
) -> Dataset:
    """Return the dataset with the given ID or name. If none is provided, return the default dataset.

    Args:
        id: Forwarded to `Dataset.open` as `id`.
        name: Forwarded to `Dataset.open` as `name`.
        configuration: Forwarded to `Dataset.open` as `configuration`.

    Returns:
        The dataset returned by `Dataset.open`.
    """
    return await Dataset.open(id=id, name=name, configuration=configuration)
256+
async def get_key_value_store(
    self,
    *,
    id: str | None = None,
    name: str | None = None,
    configuration: Configuration | None = None,
) -> KeyValueStore:
    """Return the key-value store with the given ID or name. If none is provided, return the default KVS.

    Args:
        id: Forwarded to `KeyValueStore.open` as `id`.
        name: Forwarded to `KeyValueStore.open` as `name`.
        configuration: Forwarded to `KeyValueStore.open` as `configuration`.

    Returns:
        The key-value store returned by `KeyValueStore.open`.
    """
    return await KeyValueStore.open(id=id, name=name, configuration=configuration)
266+
237267 def error_handler (self , handler : ErrorHandler [TCrawlingContext ]) -> ErrorHandler [TCrawlingContext ]:
238268 """Decorator for configuring an error handler (called after a request handler error and before retrying)."""
239269 self ._error_handler = handler
@@ -246,7 +276,7 @@ def failed_request_handler(
246276 self ._failed_request_handler = handler
247277 return handler
248278
249- async def run (self , requests : list [str | BaseRequestData ] | None = None ) -> FinalStatistics :
279+ async def run (self , requests : Sequence [str | BaseRequestData | Request ] | None = None ) -> FinalStatistics :
250280 """Run the crawler until all requests are processed."""
251281 if self ._running :
252282 raise RuntimeError (
@@ -261,6 +291,8 @@ async def run(self, requests: list[str | BaseRequestData] | None = None) -> Fina
261291 if self ._use_session_pool :
262292 await self ._session_pool .reset_store ()
263293
294+ await self .add_requests (self ._start_requests )
295+
264296 if requests is not None :
265297 await self .add_requests (requests )
266298
@@ -286,12 +318,13 @@ async def run(self, requests: list[str | BaseRequestData] | None = None) -> Fina
286318
287319 self ._running = False
288320 self ._has_finished_before = True
321+ self ._start_requests = [] # Clear the start requests to prevent them from being added again
289322
290323 return self ._statistics .calculate ()
291324
292325 async def add_requests (
293326 self ,
294- requests : Sequence [BaseRequestData | Request | str ],
327+ requests : Sequence [str | BaseRequestData | Request ],
295328 * ,
296329 batch_size : int = 1000 ,
297330 wait_time_between_batches : timedelta = timedelta (0 ),
@@ -317,6 +350,73 @@ async def add_requests(
317350 wait_for_all_requests_to_be_added_timeout = wait_for_all_requests_to_be_added_timeout ,
318351 )
319352
async def push_data(
    self,
    dataset_id: str | None = None,
    dataset_name: str | None = None,
    configuration: Configuration | None = None,
    **kwargs: Unpack[PushDataKwargs],
) -> None:
    """Push data to a dataset.

    This helper method simplifies the process of pushing data to a dataset. It opens the specified
    dataset and then calls its `push_data` method with the remaining keyword arguments.

    Args:
        dataset_id: The ID of the dataset.
        dataset_name: The name of the dataset.
        configuration: The configuration settings for accessing the dataset.
        kwargs: Keyword arguments to be passed to the dataset's `push_data` method.
    """
    # NOTE(review): this helper has no explicit `data` parameter; presumably the payload is supplied
    # through `kwargs` - confirm against the definition of `PushDataKwargs`.
    dataset = await Dataset.open(id=dataset_id, name=dataset_name, configuration=configuration)
    await dataset.push_data(**kwargs)
374+
async def get_data(
    self,
    dataset_id: str | None = None,
    dataset_name: str | None = None,
    configuration: Configuration | None = None,
    **kwargs: Unpack[GetDataKwargs],
) -> DatasetItemsListPage:
    """Retrieve data from a dataset.

    This helper method simplifies the process of retrieving data from a dataset. It opens the specified
    dataset and then calls its `get_data` method with the remaining keyword arguments.

    Args:
        dataset_id: The ID of the dataset.
        dataset_name: The name of the dataset.
        configuration: The configuration settings for accessing the dataset.
        kwargs: Keyword arguments to be passed to the dataset's `get_data` method.

    Returns:
        The result of the dataset's `get_data` call.
    """
    dataset = await Dataset.open(id=dataset_id, name=dataset_name, configuration=configuration)
    return await dataset.get_data(**kwargs)
398+
async def export_to(
    self,
    dataset_id: str | None = None,
    dataset_name: str | None = None,
    configuration: Configuration | None = None,
    **kwargs: Unpack[ExportToKwargs],
) -> None:
    """Export data from a dataset.

    This helper method simplifies the process of exporting data from a dataset. It opens the specified
    dataset and then calls its `export_to` method with the remaining keyword arguments.

    Args:
        dataset_id: The ID of the dataset.
        dataset_name: The name of the dataset.
        configuration: The configuration settings for accessing the dataset.
        kwargs: Keyword arguments to be passed to the dataset's `export_to` method.
    """
    dataset = await Dataset.open(id=dataset_id, name=dataset_name, configuration=configuration)
    # The result of the dataset call is passed straight through to the caller.
    return await dataset.export_to(**kwargs)
419+
320420 def _should_retry_request (self , crawling_context : BasicCrawlingContext , error : Exception ) -> bool :
321421 if crawling_context .request .no_retry :
322422 return False
@@ -517,6 +617,9 @@ async def __run_task_function(self) -> None:
517617 proxy_info = proxy_info ,
518618 send_request = self ._prepare_send_request_function (session , proxy_info ),
519619 add_requests = result .add_requests ,
620+ get_data = self .get_data ,
621+ push_data = self .push_data ,
622+ export_to = self .export_to ,
520623 )
521624
522625 statistics_id = request .id or request .unique_key
0 commit comments