Description
I am trying to scrape my company's website. To speed things up, I am using Arsenic, an asynchronous web-scraping library. When I run this code I see that multiple ChromeDriver instances are spawned on different local ports:
Starting ChromeDriver 109.0.5414.74 (e7c5703604daa9cc128ccf5a5d3e993513758913-refs/branch-heads/5414@{#1172}) on port 59479
Only local connections are allowed.
Please see https://chromedriver.chromium.org/security-considerations for suggestions on keeping ChromeDriver safe.
[1674821791.415][SEVERE]: bind() failed: Cannot assign requested address (99)
ChromeDriver was started successfully.
Starting ChromeDriver 109.0.5414.74 (e7c5703604daa9cc128ccf5a5d3e993513758913-refs/branch-heads/5414@{#1172}) on port 40633
Only local connections are allowed.
Please see https://chromedriver.chromium.org/security-considerations for suggestions on keeping ChromeDriver safe.
[1674821791.853][SEVERE]: bind() failed: Cannot assign requested address (99)
ChromeDriver was started successfully.
After scraping some URLs, it fails with an error that I am not able to understand:
2023-01-27 12:16.44 [error ] error
data={'error': 'unknown error', 'message': 'unknown error: net::ERR_CONNECTION_CLOSED\n (Session info: headless chrome=109.0.5414.119)', 'stacktrace': '#0 0x55e6edd7e303 <unknown>\n#1 0x55e6edb52d37 <unknown>\n#2 0x55e6edb4ad85 <unknown>\n#3 0x55e6edb3df87 <unknown>\n#4 0x55e6edb3f4e9 <unknown>\n#5 0x55e6edb3e2fe <unknown>\n#6 0x55e6edb3d432 <unknown>\n#7 0x55e6edb3d285 <unknown>\n#8 0x55e6edb3bc77 <unknown>\n#9 0x55e6edb3c2a4 <unknown>\n#10 0x55e6edb54c48 <unknown>\n#11 0x55e6edbc7f15 <unknown>\n#12 0x55e6edbaf982 <unknown>\n#13 0x55e6edbc788c <unknown>\n#14 0x55e6edbaf753 <unknown>\n#15 0x55e6edb82a14 <unknown>\n#16 0x55e6edb83b7e <unknown>\n#17 0x55e6eddcd32e <unknown>\n#18 0x55e6eddd0c0e <unknown>\n#19 0x55e6eddb3610 <unknown>\n#20 0x55e6eddd1c23 <unknown>\n#21 0x55e6edda5545 <unknown>\n#22 0x55e6eddf26a8 <unknown>\n#23 0x55e6eddf2836 <unknown>\n#24 0x55e6ede0dd13 <unknown>\n#25 0x7fae53b0fea5 start_thread\n'}
message=unknown error: net::ERR_CONNECTION_CLOSED (Session info: headless chrome=109.0.5414.119)
stacktrace=#0 0x55e6edd7e303 <unknown> #1 0x55e6edb52d37 <unknown> #2 0x55e6edb4ad85 <unknown> #3 0x55e6edb3df87 <unknown> #4 0x55e6edb3f4e9 <unknown> #5 0x55e6edb3e2fe <unknown> #6 0x55e6edb3d432 <unknown> #7 0x55e6edb3d285 <unknown> #8 0x55e6edb3bc77 <unknown> #9 0x55e6edb3c2a4 <unknown> #10 0x55e6edb54c48 <unknown> #11 0x55e6edbc7f15 <unknown> #12 0x55e6edbaf982 <unknown> #13 0x55e6edbc788c <unknown> #14 0x55e6edbaf753 <unknown> #15 0x55e6edb82a14 <unknown> #16 0x55e6edb83b7e <unknown> #17 0x55e6eddcd32e <unknown> #18 0x55e6eddd0c0e <unknown> #19 0x55e6eddb3610 <unknown> #20 0x55e6eddd1c23 <unknown> #21 0x55e6edda5545 <unknown> #22 0x55e6eddf26a8 <unknown> #23 0x55e6eddf2836 <unknown> #24 0x55e6ede0dd13 <unknown> #25 0x7fae53b0fea5 start_thread
status=500 type=<class 'arsenic.errors.UnknownError'>
failed getting session
I am running this in Docker using a Linux RHEL 7 image, with Python 3.8, Arsenic 21.8, Chrome v109, and ChromeDriver v109.
code:
import asyncio
import os

import arsenic
from arsenic import get_session, stop_session, browsers, services


def initialize_webdriver():
    service = services.Chromedriver(binary=os.environ.get('CHROMEDRIVER_PATH'))
    browser = browsers.Chrome()
    browser.capabilities = {
        "goog:chromeOptions": {
            "args": ["--no-sandbox", "--headless", "--verbose",
                     "--disable-gpu", "--disable-web-security", "--allow_insecure_localhost",
                     "--disable-dev-shm-usage", "--enable-javascript"]
        }
    }
    return service, browser
async def scraper(limit, service, browser, url):
    async with limit:
        try:
            async with get_session(service, browser) as session:
                # print("inside scraper")
                await session.get(url)
                try:
                    <code to get web elements>
                    return results
                except asyncio.TimeoutError as msg:
                    print("failed scraping url ", url)
                    await stop_session(session)
                    print(msg)
                    return []
        except (arsenic.errors.UnknownArsenicError, arsenic.errors.UnknownError, arsenic.errors.ArsenicError) as msg:
            print("failed getting session")
            global failed_urls
            failed_urls.append(url)
            limit.release()
            return []
async def run(service, browser, urls):
    limit = asyncio.Semaphore(30)
    results = await asyncio.gather(
        *[scraper(limit, service, browser, url) for url in urls]
    )
    print(results)
if __name__ == "__main__":
    failed_urls = []
    urls = extract_urls()  # it collects urls from the website's sitemap url
    service, browser = initialize_webdriver()
    asyncio.run(run(service, browser, urls))
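extract_urls is not shown above. A minimal sketch of that kind of helper, assuming a standard sitemap.xml (the URL and parsing below are placeholders, not my actual implementation), would be:

import requests
import xml.etree.ElementTree as ET


def extract_urls():
    # Placeholder sitemap URL -- the real helper points at the company site's sitemap.
    sitemap_url = "https://example.com/sitemap.xml"
    response = requests.get(sitemap_url, timeout=30)
    response.raise_for_status()
    root = ET.fromstring(response.content)
    # Standard sitemaps list each page URL in <url><loc>...</loc></url> elements.
    ns = {"sm": "http://www.sitemaps.org/schemas/sitemap/0.9"}
    return [loc.text for loc in root.findall(".//sm:loc", ns)]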
Even after reducing the semaphore to 20, I get the same issue. I need to understand why this error occurs and how to resolve it.
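For completeness, failed_urls is only collected at the moment; a retry pass over it at a lower concurrency limit would look roughly like the sketch below (illustrative only, the limit of 5 is an arbitrary assumption, and I have not confirmed this avoids the error):

async def retry_failed(service, browser):
    # Illustrative sketch: re-run the URLs that failed to get a session,
    # using a smaller, arbitrarily chosen concurrency limit.
    urls_to_retry = list(failed_urls)
    failed_urls.clear()
    limit = asyncio.Semaphore(5)
    return await asyncio.gather(
        *[scraper(limit, service, browser, url) for url in urls_to_retry]
    )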