docs: Add a new guide on how to avoid getting blocked #576
docs: Add a new guide on how to avoid getting blocked #576 MostlyKIGuess wants to merge 12 commits into apify:master from
Conversation
|
@janbuchar @vdusek Can you help me figure out how to add the operating system change? I am not sure whether the current code works, and there's no way to test it. |
vdusek
left a comment
There was a problem hiding this comment.
Have you tried to execute the code samples?
| browser_pool = BrowserPool.with_default_plugin( | ||
| headless=True, | ||
| kwargs={ | ||
| 'use_fingerprints': False, | ||
| }, | ||
| ) |
There was a problem hiding this comment.
Additional kwargs can be provided directly.
| browser_pool = BrowserPool.with_default_plugin( | |
| headless=True, | |
| kwargs={ | |
| 'use_fingerprints': False, | |
| }, | |
| ) | |
| browser_pool = BrowserPool.with_default_plugin( | |
| headless=True, | |
| use_fingerprints=False, | |
| ) |
There was a problem hiding this comment.
This doesn't work when passed directly, so I added the extra code in the conversation as well. That leads to a new issue; I have attached screenshots for it.
There was a problem hiding this comment.
Yes, this is not going to work, since use_fingerprints is the parameter of the Plugin and not the BrowserPool.
| kwargs={ | ||
| 'use_fingerprints': True, | ||
| 'fingerprint_options': { | ||
| 'fingerprint_generator_options': { | ||
| 'browsers': [ | ||
| { | ||
| 'name': 'chromium', # Or 'firefox', or 'webkit' | ||
| 'min_version': 96, | ||
| }, | ||
| ], | ||
| 'devices': ['desktop'], # Specify device types directly | ||
| 'operating_systems': ['windows'], # Specify OS types directly | ||
| }, | ||
| }, | ||
| }, | ||
| ) |
There was a problem hiding this comment.
As I wrote below, additional kwargs can be provided directly. But in this case, I'm not sure whether this is correct. Have you tried to execute it?
|
@vdusek, hey, I was experimenting by changing the source code, and this is what I found:
from __future__ import annotations
from logging import getLogger
from typing import TYPE_CHECKING, Any
from playwright.async_api import Playwright, async_playwright
from typing_extensions import override
from crawlee.browsers._base_browser_plugin import BaseBrowserPlugin
from crawlee.browsers._playwright_browser_controller import PlaywrightBrowserController
if TYPE_CHECKING:
from collections.abc import Mapping
from types import TracebackType
from crawlee.browsers._types import BrowserType
logger = getLogger(__name__)
class PlaywrightBrowserPlugin(BaseBrowserPlugin):
    """Browser plugin backed by the Playwright automation library.

    The plugin acts as a factory: once entered as an async context manager,
    it launches new browser instances on request.
    """

    AUTOMATION_LIBRARY = 'playwright'

    def __init__(
        self,
        *,
        browser_type: BrowserType = 'chromium',
        browser_options: Mapping[str, Any] | None = None,
        page_options: Mapping[str, Any] | None = None,
        max_open_pages_per_browser: int = 20,
        fingerprint_generator_options: Mapping[str, Any] | None = None,
        use_fingerprints: bool = False,
    ) -> None:
        """Create a new instance.

        Args:
            browser_type: Which browser to launch ('chromium', 'firefox' or 'webkit').
            browser_options: Keyword options forwarded to the browser launch call.
            page_options: Options to configure a new page instance.
            max_open_pages_per_browser: Upper bound of pages per browser instance; passed on to the
                browser controller created in `new_browser`.
            fingerprint_generator_options: Options for generating browser fingerprints. Stored and exposed
                as a property; not read anywhere else inside this class.
            use_fingerprints: Whether to use browser fingerprints. Stored and exposed as a property; not
                read anywhere else inside this class.
        """
        # The Playwright context manager is created eagerly but only entered in `__aenter__`.
        self._playwright_context_manager = async_playwright()
        self._playwright: Playwright | None = None

        self._browser_type = browser_type
        self._max_open_pages_per_browser = max_open_pages_per_browser
        self._use_fingerprints = use_fingerprints

        # Missing (or empty) option mappings are normalised to a fresh empty dict.
        self._browser_options = browser_options if browser_options else {}
        self._page_options = page_options if page_options else {}
        self._fingerprint_generator_options = (
            fingerprint_generator_options if fingerprint_generator_options else {}
        )

    @property
    @override
    def browser_type(self) -> BrowserType:
        return self._browser_type

    @property
    @override
    def browser_options(self) -> Mapping[str, Any]:
        return self._browser_options

    @property
    @override
    def page_options(self) -> Mapping[str, Any]:
        return self._page_options

    @property
    @override
    def max_open_pages_per_browser(self) -> int:
        return self._max_open_pages_per_browser

    @property
    def fingerprint_generator_options(self) -> Mapping[str, Any]:
        return self._fingerprint_generator_options

    @property
    def use_fingerprints(self) -> bool:
        return self._use_fingerprints

    @override
    async def __aenter__(self) -> PlaywrightBrowserPlugin:
        logger.debug('Initializing Playwright browser plugin.')
        started = await self._playwright_context_manager.__aenter__()
        self._playwright = started
        return self

    @override
    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        exc_traceback: TracebackType | None,
    ) -> None:
        logger.debug('Closing Playwright browser plugin.')
        await self._playwright_context_manager.__aexit__(exc_type, exc_value, exc_traceback)

    @override
    async def new_browser(self) -> PlaywrightBrowserController:
        """Launch a browser of the configured type and wrap it in a controller.

        Raises:
            RuntimeError: If the plugin has not been entered via `async with` yet.
            ValueError: If the configured browser type is not one of the supported ones.
        """
        playwright = self._playwright
        if not playwright:
            raise RuntimeError('Playwright browser plugin is not initialized.')

        # Pick the launcher first, then launch once with the shared options.
        kind = self._browser_type
        if kind == 'chromium':
            launcher = playwright.chromium
        elif kind == 'firefox':
            launcher = playwright.firefox
        elif kind == 'webkit':
            launcher = playwright.webkit
        else:
            raise ValueError(f'Invalid browser type: {self._browser_type}')

        launched = await launcher.launch(**self._browser_options)
        return PlaywrightBrowserController(
            launched,
            max_open_pages_per_browser=self._max_open_pages_per_browser,
        )
# Updated avoid_blocking_playwright.py
from crawlee.browsers import BrowserPool
from crawlee.playwright_crawler import PlaywrightCrawler
from crawlee.browsers._playwright_browser_plugin import PlaywrightBrowserPlugin
import asyncio
# Create the PlaywrightBrowserPlugin with customized options
plugin = PlaywrightBrowserPlugin(
    browser_type='chromium',  # One of 'chromium', 'firefox' or 'webkit'
    browser_options={
        'args': [
            '--no-sandbox',
            '--disable-setuid-sandbox',
        ],
    },
    fingerprint_generator_options={
        'devices': ['desktop'],
        'operating_systems': ['windows'],  # Specify OS types directly
    },
    use_fingerprints=True,
)

# Create the browser pool with the customized plugin
browser_pool = BrowserPool(plugins=[plugin])

# Instantiate the PlaywrightCrawler with the customized browser pool.
# NOTE: main() below drives the browser pool directly and does not use this crawler.
crawler = PlaywrightCrawler(
    browser_pool=browser_pool,
)


async def main() -> None:
    """Open one page through the browser pool and report the user agent it presents.

    Prints the value of `navigator.userAgent` and saves a screenshot of the
    page to `screenshot.png` in the current working directory.
    """
    async with browser_pool:
        crawlee_page = await browser_pool.new_page()
        page = crawlee_page.page

        await page.goto('https://www.whatismybrowser.com/')
        user_agent = await page.evaluate('navigator.userAgent')
        print(f'User-Agent: {user_agent}')
        await page.screenshot(path='screenshot.png')


# Only start the event loop when executed as a script, not when imported.
if __name__ == '__main__':
    asyncio.run(main())
|
@vdusek If you suggest that we keep only those 3 options in the documentation, let me know what should be added. I have tweaked the source a little because it wasn't accepting the plugin option in the code above. |
|
Hey @vdusek, can you please guide me: do we just keep the 3 options I mentioned above, or wait until the features get implemented? I think keeping what we have, along with additional tips, would be a better option than nothing anyway. |
vdusek
left a comment
There was a problem hiding this comment.
The fingerprinting in Crawlee for Python is currently very limited. We have implemented only the basics so far, see #401 and #402. The next step is #549. This means you cannot just copy content from the JS guide.
Next steps...
- Write the guide only with current (limited) feature set regarding the blocking.
- Or wait for the fingerprinting to be completely implemented.
| browser_pool = BrowserPool.with_default_plugin( | ||
| headless=True, | ||
| kwargs={ | ||
| 'use_fingerprints': False, | ||
| }, | ||
| ) |
There was a problem hiding this comment.
Yes, this is not going to work, since use_fingerprints is the parameter of the Plugin and not the BrowserPool.
|
Closing due to author inactivity. |




Description
Added 5 files, 2 of which aren't currently being used; they can be used once crawlee-python completes its Puppeteer crawler.
Added additional information apart from the reference.
Closes: Create a new guide about how to not get blocked #481
Testing