Commit 849d73c

feat: add Playwright's enqueue links helper (#196)
### Description

- Add Playwright's enqueue links helper

### Related issues

- #180

### Testing

- New unit test was implemented

### Checklist

- [x] Changes are described in the `CHANGELOG.md`
- [x] CI passed
1 parent 332673c commit 849d73c

6 files changed

Lines changed: 82 additions & 10 deletions
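For context, a minimal sketch of how the new helper is used from a request handler, mirroring the unit test added in this commit (the start URL and the `asyncio.run` wrapper are illustrative, not part of the change):

```python
import asyncio

from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    crawler = PlaywrightCrawler()

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        # Enqueue the links found on the rendered page. With this commit the
        # default strategy is SAME_HOSTNAME, so only same-host links are added.
        await context.enqueue_links()

    await crawler.run(['https://crawlee.dev/'])


if __name__ == '__main__':
    asyncio.run(main())
```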

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
@@ -13,6 +13,7 @@
 - Add batched request addition in `RequestQueue`
 - Add start requests option to `BasicCrawler`
 - Add storage-related helpers `get_data`, `push_data` and `export_to` to `BasicCrawler` and `BasicContext`
+- Add `PlaywrightCrawler`'s enqueue links helper

 ## [0.0.4](../../releases/tag/v0.0.4) - 2024-05-30


src/crawlee/beautifulsoup_crawler/beautifulsoup_crawler.py

Lines changed: 2 additions & 3 deletions
@@ -113,6 +113,8 @@ async def enqueue_links(
             user_data: dict[str, Any] | None = None,
             **kwargs: Unpack[AddRequestsKwargs],
         ) -> None:
+            kwargs.setdefault('strategy', EnqueueStrategy.SAME_HOSTNAME)
+
             requests = list[BaseRequestData]()
             user_data = user_data or {}

@@ -126,9 +128,6 @@ async def enqueue_links(
                 if (href := link.attrs.get('href')) is not None:
                     requests.append(BaseRequestData.from_url(href, user_data=link_user_data))

-            uses_patterns = 'include' in kwargs or 'exclude' in kwargs
-            kwargs.setdefault('strategy', EnqueueStrategy.SAME_HOSTNAME if uses_patterns else EnqueueStrategy.ALL)
-
             await context.add_requests(requests, **kwargs)

         yield BeautifulSoupCrawlingContext(
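Behavioural note: after this change `enqueue_links` always defaults to `EnqueueStrategy.SAME_HOSTNAME`, whereas it previously fell back to `EnqueueStrategy.ALL` when no `include`/`exclude` patterns were supplied. Callers that still want cross-host enqueuing now pass the strategy explicitly, as the updated BeautifulSoup unit test in this commit does:

```python
# Explicitly opt back into following links to other hosts,
# overriding the new SAME_HOSTNAME default.
await context.enqueue_links(strategy=EnqueueStrategy.ALL)
```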

src/crawlee/playwright_crawler/playwright_crawler.py

Lines changed: 32 additions & 1 deletion
@@ -6,12 +6,14 @@

 from crawlee.basic_crawler import BasicCrawler, BasicCrawlerOptions, ContextPipeline
 from crawlee.browsers import BrowserPool
+from crawlee.enqueue_strategy import EnqueueStrategy
+from crawlee.models import BaseRequestData
 from crawlee.playwright_crawler.types import PlaywrightCrawlingContext

 if TYPE_CHECKING:
     from collections.abc import AsyncGenerator

-    from crawlee.basic_crawler.types import BasicCrawlingContext
+    from crawlee.basic_crawler.types import AddRequestsKwargs, BasicCrawlingContext


 class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext]):
@@ -63,6 +65,34 @@ async def _page_goto(
         await crawlee_page.page.goto(context.request.url)
         context.request.loaded_url = crawlee_page.page.url

+        async def enqueue_links(
+            *,
+            selector: str = 'a',
+            label: str | None = None,
+            user_data: dict | None = None,
+            **kwargs: Unpack[AddRequestsKwargs],
+        ) -> None:
+            kwargs.setdefault('strategy', EnqueueStrategy.SAME_HOSTNAME)
+
+            requests = list[BaseRequestData]()
+            user_data = user_data or {}
+
+            elements = await crawlee_page.page.query_selector_all(selector)
+
+            for element in elements:
+                href = await element.get_attribute('href')
+
+                if href:
+                    link_user_data = user_data.copy()
+
+                    if label is not None:
+                        link_user_data.setdefault('label', label)
+
+                    request = BaseRequestData.from_url(href, user_data=link_user_data)
+                    requests.append(request)
+
+            await context.add_requests(requests, **kwargs)
+
         yield PlaywrightCrawlingContext(
             request=context.request,
             session=context.session,
@@ -71,6 +101,7 @@ async def _page_goto(
             push_data=context.push_data,
             proxy_info=context.proxy_info,
             page=crawlee_page.page,
+            enqueue_links=enqueue_links,
         )

         await crawlee_page.page.close()
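The new helper mirrors the BeautifulSoup one: an optional CSS `selector` (default `'a'`), a `label` stored under the `'label'` key of each enqueued request's `user_data`, a `user_data` dict copied onto every request, plus the usual `AddRequestsKwargs`. A sketch of a call using those parameters (the selector, label, and user_data values are made up for illustration):

```python
# Hypothetical values; only the parameter names come from the new helper.
await context.enqueue_links(
    selector='a.article-link',            # CSS selector for the links to follow (default: 'a')
    label='ARTICLE',                      # set as user_data['label'] on each enqueued request
    user_data={'source': 'listing'},      # copied into every enqueued request's user_data
)
```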

src/crawlee/playwright_crawler/types.py

Lines changed: 2 additions & 1 deletion
@@ -3,7 +3,7 @@
 from dataclasses import dataclass
 from typing import TYPE_CHECKING

-from crawlee.basic_crawler.types import BasicCrawlingContext
+from crawlee.basic_crawler.types import BasicCrawlingContext, EnqueueLinksFunction

 if TYPE_CHECKING:
     from playwright.async_api import Page
@@ -14,3 +14,4 @@ class PlaywrightCrawlingContext(BasicCrawlingContext):
     """Crawling context used by PlaywrightSoupCrawler."""

     page: Page
+    enqueue_links: EnqueueLinksFunction

tests/unit/beautifulsoup_crawler/test_beautifulsoup_crawler.py

Lines changed: 3 additions & 1 deletion
@@ -8,6 +8,7 @@
 from httpx import Response

 from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler
+from crawlee.enqueue_strategy import EnqueueStrategy
 from crawlee.storages import RequestList

 if TYPE_CHECKING:
@@ -87,7 +88,8 @@ async def test_enqueue_links(server: respx.MockRouter) -> None:
     @crawler.router.default_handler
     async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
         visit(context.request.url)
-        await context.enqueue_links()
+        # Note: with RESPX server mocking, we have to set EnqueueStrategy to ALL
+        await context.enqueue_links(strategy=EnqueueStrategy.ALL)

     await crawler.run()


tests/unit/playwright_crawler/test_playwright_crawler.py

Lines changed: 42 additions & 4 deletions
@@ -1,17 +1,21 @@
+# TODO: The current PlaywrightCrawler tests rely on external websites. It means they can fail or take more time
+# due to network issues. To enhance test stability and reliability, we should mock the network requests.
+# https://github.com/apify/crawlee-python/issues/197
+
 from __future__ import annotations

 from typing import TYPE_CHECKING
+from unittest import mock

 from crawlee.playwright_crawler import PlaywrightCrawler
-from crawlee.storages.request_list import RequestList

 if TYPE_CHECKING:
     from crawlee.playwright_crawler import PlaywrightCrawlingContext


 async def test_basic_request(httpbin: str) -> None:
-    request_provider = RequestList([f'{httpbin}/'])
-    crawler = PlaywrightCrawler(request_provider=request_provider)
+    requests = [f'{httpbin}/']
+    crawler = PlaywrightCrawler()
     result: dict = {}

     @crawler.router.default_handler
@@ -22,8 +26,42 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
         result['page_title'] = await context.page.title()
         result['page_content'] = await context.page.content()

-    await crawler.run()
+    await crawler.run(requests)

     assert result.get('request_url') == result.get('page_url') == f'{httpbin}/'
     assert 'httpbin' in result.get('page_title', '')
     assert '<html' in result.get('page_content', '')  # there is some HTML content
+
+
+async def test_enqueue_links() -> None:
+    requests = ['https://crawlee.dev/']
+    crawler = PlaywrightCrawler()
+    visit = mock.Mock()
+
+    @crawler.router.default_handler
+    async def request_handler(context: PlaywrightCrawlingContext) -> None:
+        visit(context.request.url)
+        await context.enqueue_links()
+
+    await crawler.run(requests)
+
+    visited = {call[0][0] for call in visit.call_args_list}
+
+    assert visited == {
+        'https://crawlee.dev/',
+        'https://crawlee.dev/docs/guides/javascript-rendering',
+        'https://crawlee.dev/docs/guides/typescript-project',
+        'https://crawlee.dev/docs/guides/avoid-blocking',
+        'https://crawlee.dev/docs/guides/cheerio-crawler-guide',
+        'https://crawlee.dev/docs/guides/result-storage',
+        'https://crawlee.dev/docs/guides/proxy-management',
+        'https://crawlee.dev/api/core/class/AutoscaledPool',
+        'https://crawlee.dev/docs/guides/jsdom-crawler-guide',
+        'https://crawlee.dev/docs/guides/request-storage',
+        'https://crawlee.dev/api/utils',
+        'https://crawlee.dev/api/utils/namespace/social',
+        'https://crawlee.dev/docs/deployment/aws-cheerio',
+        'https://crawlee.dev/api/basic-crawler/interface/BasicCrawlerOptions',
+        'https://crawlee.dev/docs/deployment/gcp-cheerio',
+        'https://crawlee.dev/docs/quick-start',
+    }
