Skip to content

Commit 1030459

Browse files
Mantisus and vdusek authored
feat: add block_requests helper for PlaywrightCrawler (apify#919)
### Description - Add `block_requests` helper for `PlaywrightCrawler` - Add example for docs ### Issues - Closes: apify#848 --------- Co-authored-by: Vlada Dusek <[email protected]>
1 parent 1fa4f7b commit 1030459

File tree

6 files changed

+139
-3
lines changed

6 files changed

+139
-3
lines changed
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
import asyncio

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext, PlaywrightPreNavCrawlingContext


async def main() -> None:
    # Cap the number of processed requests; drop or raise the limit to crawl everything.
    crawler = PlaywrightCrawler(
        max_requests_per_crawl=10,
    )

    # Default handler — invoked once per crawled request.
    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')
        await context.enqueue_links()

    # Pre-navigation hook — runs before the browser navigates to each URL.
    @crawler.pre_navigation_hook
    async def navigation_hook(context: PlaywrightPreNavCrawlingContext) -> None:
        context.log.info(f'Navigating to {context.request.url} ...')
        # On top of the default block list, also block any URL containing `adsbygoogle.js`.
        await context.block_requests(extra_url_patterns=['adsbygoogle.js'])

    # Start the crawl from the initial URL list.
    await crawler.run(['https://crawlee.dev/'])


if __name__ == '__main__':
    asyncio.run(main())
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
---
2+
id: playwright-crawler-with-block-requests
3+
title: Playwright crawler with block requests
4+
---
5+
6+
import ApiLink from '@site/src/components/ApiLink';
7+
import CodeBlock from '@theme/CodeBlock';
8+
9+
import PlaywrightBlockRequests from '!!raw-loader!./code/playwright_block_requests.py';
10+
11+
This example demonstrates how to optimize your <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> performance by blocking unnecessary network requests.
12+
13+
The primary use case is when you need to scrape or interact with web pages without loading non-essential resources like images, styles, or analytics scripts. This can significantly reduce bandwidth usage and improve crawling speed.
14+
15+
The <ApiLink to="class/BlockRequestsFunction">`block_requests`</ApiLink> helper provides the most efficient way to block requests as it operates directly in the browser.
16+
17+
By default, <ApiLink to="class/BlockRequestsFunction">`block_requests`</ApiLink> blocks all URLs that include any of the following patterns:
18+
19+
```python
20+
['.css', '.webp', '.jpg', '.jpeg', '.png', '.svg', '.gif', '.woff', '.pdf', '.zip']
21+
```
22+
23+
You can also replace the default patterns list with your own by providing `url_patterns`, or extend it by passing additional patterns in `extra_url_patterns`.
24+
25+
<CodeBlock className="language-python">
26+
{PlaywrightBlockRequests}
27+
</CodeBlock>

src/crawlee/crawlers/_playwright/_playwright_crawler.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from __future__ import annotations
22

33
import logging
4+
from functools import partial
45
from typing import TYPE_CHECKING, Any, Callable
56

67
from pydantic import ValidationError
@@ -16,7 +17,7 @@
1617

1718
from ._playwright_crawling_context import PlaywrightCrawlingContext
1819
from ._playwright_pre_nav_crawling_context import PlaywrightPreNavCrawlingContext
19-
from ._utils import infinite_scroll
20+
from ._utils import block_requests, infinite_scroll
2021

2122
if TYPE_CHECKING:
2223
from collections.abc import AsyncGenerator, Awaitable, Mapping
@@ -148,6 +149,7 @@ async def _open_page(
148149
get_key_value_store=context.get_key_value_store,
149150
log=context.log,
150151
page=crawlee_page.page,
152+
block_requests=partial(block_requests, page=crawlee_page.page),
151153
)
152154

153155
for hook in self._pre_navigation_hooks:
@@ -169,8 +171,8 @@ async def _navigate(
169171
SessionError: If the URL cannot be loaded by the browser.
170172
171173
Yields:
172-
The enhanced crawling context with the Playwright-specific features (page, response, enqueue_links, and
173-
infinite_scroll).
174+
The enhanced crawling context with the Playwright-specific features (page, response, enqueue_links,
175+
infinite_scroll and block_requests).
174176
"""
175177
async with context.page:
176178
if context.request.headers:
@@ -241,6 +243,7 @@ async def enqueue_links(
241243
infinite_scroll=lambda: infinite_scroll(context.page),
242244
response=response,
243245
enqueue_links=enqueue_links,
246+
block_requests=partial(block_requests, page=context.page),
244247
)
245248

246249
async def _handle_blocked_request(

src/crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99
if TYPE_CHECKING:
1010
from playwright.async_api import Page
1111

12+
from ._types import BlockRequestsFunction
13+
1214

1315
@dataclass(frozen=True)
1416
@docs_group('Data structures')
@@ -20,3 +22,6 @@ class PlaywrightPreNavCrawlingContext(BasicCrawlingContext):
2022

2123
page: Page
2224
"""The Playwright `Page` object for the current page."""
25+
26+
block_requests: BlockRequestsFunction
27+
"""Blocks network requests matching specified URL patterns. Works only for Chromium browser."""
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
from __future__ import annotations
2+
3+
from typing import Protocol
4+
5+
from crawlee._utils.docs import docs_group
6+
7+
8+
@docs_group('Functions')
9+
class BlockRequestsFunction(Protocol):
10+
"""A function for blocking unwanted HTTP requests during page loads in PlaywrightCrawler.
11+
12+
It simplifies the process of blocking specific HTTP requests during page navigation.
13+
The function allows blocking both default resource types (like images, fonts, stylesheets) and custom URL patterns.
14+
"""
15+
16+
async def __call__(
17+
self, url_patterns: list[str] | None = None, extra_url_patterns: list[str] | None = None
18+
) -> None:
19+
"""Call dunder method.
20+
21+
Args:
22+
url_patterns: List of URL patterns to block. If None, uses default patterns.
23+
extra_url_patterns: Additional URL patterns to append to the main patterns list.
24+
"""

src/crawlee/crawlers/_playwright/_utils.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,19 @@
88
from playwright.async_api import Page
99
from playwright.async_api import Request as PlaywrightRequest
1010

# URL substrings blocked by default: stylesheets, images, fonts and common
# binary downloads that are rarely needed when scraping page content.
_DEFAULT_BLOCK_REQUEST_URL_PATTERNS = [
    '.css',
    '.webp',
    '.jpg',
    '.jpeg',
    '.png',
    '.svg',
    '.gif',
    '.woff',
    '.pdf',
    '.zip',
]
23+
1124

1225
async def infinite_scroll(page: Page) -> None:
1326
"""Scroll to the bottom of a page, handling loading of additional items."""
@@ -63,3 +76,35 @@ async def check_finished() -> None:
6376
check_task.cancel()
6477
with suppress(asyncio.CancelledError):
6578
await check_task
79+
80+
async def block_requests(
    page: Page, url_patterns: list[str] | None = None, extra_url_patterns: list[str] | None = None
) -> None:
    """Blocks network requests matching specified URL patterns.

    Args:
        page: Playwright Page object to block requests on.
        url_patterns: List of URL patterns to block. If None, uses default patterns.
        extra_url_patterns: Additional URL patterns to append to the main patterns list.
    """
    # Build a fresh list: extending `url_patterns` in place would mutate either the
    # module-level default list or the caller's argument across invocations.
    patterns = list(url_patterns) if url_patterns else list(_DEFAULT_BLOCK_REQUEST_URL_PATTERNS)
    patterns.extend(extra_url_patterns or [])

    browser_type = page.context.browser.browser_type.name if page.context.browser else 'undefined'

    if browser_type == 'chromium':
        # Chromium exposes CDP, which blocks requests directly in the browser —
        # more efficient than Playwright routing.
        client = await page.context.new_cdp_session(page)

        await client.send('Network.enable')
        await client.send('Network.setBlockedURLs', {'urls': patterns})
    else:
        # Fallback for non-Chromium browsers: emulate blocking with Playwright routes.
        # Extension-like patterns ('.css' / '*.css') are folded into one glob;
        # everything else is matched as a file-name fragment.
        extensions = [pattern.strip('*.') for pattern in patterns if pattern.startswith(('*.', '.'))]
        specific_files = [pattern for pattern in patterns if not pattern.startswith(('*.', '.'))]

        if extensions:
            await page.route(f"**/*.{{{','.join(extensions)}}}*", lambda route, _: route.abort())

        if specific_files:
            await page.route(f"**/{{{','.join(specific_files)}}}*", lambda route, _: route.abort())

0 commit comments

Comments
 (0)