Skip to content

Commit 1030459

Browse files
Mantisus and vdusek authored
feat: add block_requests helper for PlaywrightCrawler (apify#919)
### Description - Add `block_requests` helper for `PlaywrightCrawler` - Add example for docs ### Issues - Closes: apify#848 --------- Co-authored-by: Vlada Dusek <[email protected]>
1 parent 1fa4f7b commit 1030459

File tree

6 files changed

+139
-3
lines changed

6 files changed

+139
-3
lines changed
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
import asyncio

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext, PlaywrightPreNavCrawlingContext


async def main() -> None:
    # Cap the number of processed requests; drop or raise the limit to crawl everything.
    crawler = PlaywrightCrawler(
        max_requests_per_crawl=10,
    )

    # Default handler — invoked once per crawled request.
    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')
        await context.enqueue_links()

    # Pre-navigation hook — runs before the browser navigates to each URL.
    @crawler.pre_navigation_hook
    async def navigation_hook(context: PlaywrightPreNavCrawlingContext) -> None:
        context.log.info(f'Navigating to {context.request.url} ...')
        # On top of the default block list, also block any URL containing `adsbygoogle.js`.
        await context.block_requests(extra_url_patterns=['adsbygoogle.js'])

    # Start the crawl from the initial URL list.
    await crawler.run(['https://crawlee.dev/'])


if __name__ == '__main__':
    asyncio.run(main())
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
---
2+
id: playwright-crawler-with-block-requests
3+
title: Playwright crawler with block requests
4+
---
5+
6+
import ApiLink from '@site/src/components/ApiLink';
7+
import CodeBlock from '@theme/CodeBlock';
8+
9+
import PlaywrightBlockRequests from '!!raw-loader!./code/playwright_block_requests.py';
10+
11+
This example demonstrates how to optimize your <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> performance by blocking unnecessary network requests.
12+
13+
The primary use case is when you need to scrape or interact with web pages without loading non-essential resources like images, styles, or analytics scripts. This can significantly reduce bandwidth usage and improve crawling speed.
14+
15+
The <ApiLink to="class/BlockRequestsFunction">`block_requests`</ApiLink> helper provides the most efficient way to block requests as it operates directly in the browser.
16+
17+
By default, <ApiLink to="class/BlockRequestsFunction">`block_requests`</ApiLink> blocks all URLs that include any of the following patterns:
18+
19+
```python
20+
['.css', '.webp', '.jpg', '.jpeg', '.png', '.svg', '.gif', '.woff', '.pdf', '.zip']
21+
```
22+
23+
You can also replace the default patterns list with your own by providing `url_patterns`, or extend it by passing additional patterns in `extra_url_patterns`.
24+
25+
<CodeBlock className="language-python">
26+
{PlaywrightBlockRequests}
27+
</CodeBlock>

src/crawlee/crawlers/_playwright/_playwright_crawler.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from __future__ import annotations
22

33
import logging
4+
from functools import partial
45
from typing import TYPE_CHECKING, Any, Callable
56

67
from pydantic import ValidationError
@@ -16,7 +17,7 @@
1617

1718
from ._playwright_crawling_context import PlaywrightCrawlingContext
1819
from ._playwright_pre_nav_crawling_context import PlaywrightPreNavCrawlingContext
19-
from ._utils import infinite_scroll
20+
from ._utils import block_requests, infinite_scroll
2021

2122
if TYPE_CHECKING:
2223
from collections.abc import AsyncGenerator, Awaitable, Mapping
@@ -148,6 +149,7 @@ async def _open_page(
148149
get_key_value_store=context.get_key_value_store,
149150
log=context.log,
150151
page=crawlee_page.page,
152+
block_requests=partial(block_requests, page=crawlee_page.page),
151153
)
152154

153155
for hook in self._pre_navigation_hooks:
@@ -169,8 +171,8 @@ async def _navigate(
169171
SessionError: If the URL cannot be loaded by the browser.
170172
171173
Yields:
172-
The enhanced crawling context with the Playwright-specific features (page, response, enqueue_links, and
173-
infinite_scroll).
174+
The enhanced crawling context with the Playwright-specific features (page, response, enqueue_links,
175+
infinite_scroll and block_requests).
174176
"""
175177
async with context.page:
176178
if context.request.headers:
@@ -241,6 +243,7 @@ async def enqueue_links(
241243
infinite_scroll=lambda: infinite_scroll(context.page),
242244
response=response,
243245
enqueue_links=enqueue_links,
246+
block_requests=partial(block_requests, page=context.page),
244247
)
245248

246249
async def _handle_blocked_request(

src/crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99
if TYPE_CHECKING:
1010
from playwright.async_api import Page
1111

12+
from ._types import BlockRequestsFunction
13+
1214

1315
@dataclass(frozen=True)
1416
@docs_group('Data structures')
@@ -20,3 +22,6 @@ class PlaywrightPreNavCrawlingContext(BasicCrawlingContext):
2022

2123
page: Page
2224
"""The Playwright `Page` object for the current page."""
25+
26+
block_requests: BlockRequestsFunction
27+
"""Blocks network requests matching specified URL patterns. Works only for Chromium browser."""
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
from __future__ import annotations
2+
3+
from typing import Protocol
4+
5+
from crawlee._utils.docs import docs_group
6+
7+
8+
@docs_group('Functions')
9+
class BlockRequestsFunction(Protocol):
10+
"""A function for blocking unwanted HTTP requests during page loads in PlaywrightCrawler.
11+
12+
It simplifies the process of blocking specific HTTP requests during page navigation.
13+
The function allows blocking both default resource types (like images, fonts, stylesheets) and custom URL patterns.
14+
"""
15+
16+
async def __call__(
17+
self, url_patterns: list[str] | None = None, extra_url_patterns: list[str] | None = None
18+
) -> None:
19+
"""Call dunder method.
20+
21+
Args:
22+
url_patterns: List of URL patterns to block. If None, uses default patterns.
23+
extra_url_patterns: Additional URL patterns to append to the main patterns list.
24+
"""

src/crawlee/crawlers/_playwright/_utils.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,19 @@
88
from playwright.async_api import Page
99
from playwright.async_api import Request as PlaywrightRequest
1010

# URL substrings blocked by default: stylesheets, images, fonts and common
# binary downloads that are rarely needed when scraping page content.
_DEFAULT_BLOCK_REQUEST_URL_PATTERNS = [
    '.css',
    '.webp',
    '.jpg',
    '.jpeg',
    '.png',
    '.svg',
    '.gif',
    '.woff',
    '.pdf',
    '.zip',
]
23+
1124

1225
async def infinite_scroll(page: Page) -> None:
1326
"""Scroll to the bottom of a page, handling loading of additional items."""
@@ -63,3 +76,35 @@ async def check_finished() -> None:
6376
check_task.cancel()
6477
with suppress(asyncio.CancelledError):
6578
await check_task
79+
80+
async def block_requests(
    page: Page, url_patterns: list[str] | None = None, extra_url_patterns: list[str] | None = None
) -> None:
    """Blocks network requests matching specified URL patterns.

    Args:
        page: Playwright Page object to block requests on.
        url_patterns: List of URL patterns to block. If None, uses default patterns.
        extra_url_patterns: Additional URL patterns to append to the main patterns list.
    """
    # Build a fresh list: extending `url_patterns` in place would mutate either the
    # module-level default list or the caller's argument across invocations.
    patterns = list(url_patterns) if url_patterns else list(_DEFAULT_BLOCK_REQUEST_URL_PATTERNS)
    patterns.extend(extra_url_patterns or [])

    browser_type = page.context.browser.browser_type.name if page.context.browser else 'undefined'

    if browser_type == 'chromium':
        # Chromium exposes CDP, which blocks requests directly in the browser —
        # more efficient than Playwright routing.
        client = await page.context.new_cdp_session(page)

        await client.send('Network.enable')
        await client.send('Network.setBlockedURLs', {'urls': patterns})
    else:
        # Fallback for non-Chromium browsers: emulate blocking with Playwright routes.
        # Extension-like patterns ('.css' / '*.css') are folded into one glob;
        # everything else is matched as a file-name fragment.
        extensions = [pattern.strip('*.') for pattern in patterns if pattern.startswith(('*.', '.'))]
        specific_files = [pattern for pattern in patterns if not pattern.startswith(('*.', '.'))]

        if extensions:
            await page.route(f"**/*.{{{','.join(extensions)}}}*", lambda route, _: route.abort())

        if specific_files:
            await page.route(f"**/{{{','.join(specific_files)}}}*", lambda route, _: route.abort())

0 commit comments

Comments
 (0)