99import respx
1010from httpx import Response
1111
12- from crawlee . _request import Request
12+ from crawlee import ConcurrencySettings , Request
1313from crawlee .crawlers import HttpCrawler
1414from crawlee .http_clients import CurlImpersonateHttpClient , HttpxHttpClient
1515from crawlee .sessions import SessionPool
@@ -183,7 +183,15 @@ async def test_handles_server_error(
183183 assert server ['500_endpoint' ].called
184184
185185
186- async def test_stores_cookies (httpbin : URL ) -> None :
186+ @pytest .mark .parametrize (
187+ 'http_client_class' ,
188+ [
189+ pytest .param (CurlImpersonateHttpClient , id = 'curl' ),
190+ pytest .param (HttpxHttpClient , id = 'httpx' ),
191+ ],
192+ )
193+ async def test_stores_cookies (http_client_class : type [BaseHttpClient ], httpbin : URL ) -> None :
194+ http_client = http_client_class ()
187195 visit = Mock ()
188196 track_session_usage = Mock ()
189197
@@ -192,6 +200,7 @@ async def test_stores_cookies(httpbin: URL) -> None:
192200 # /cookies/set might redirect us to a page that we can't access - no problem, we only care about cookies
193201 ignore_http_error_status_codes = [401 ],
194202 session_pool = session_pool ,
203+ http_client = http_client ,
195204 )
196205
197206 @crawler .router .default_handler
@@ -410,3 +419,68 @@ def mark_request_execution(request: Request) -> Response: # noqa: ARG001 # Unus
410419 await crawler .run ([Request .from_url (url = test_url )])
411420
412421 assert execution_order == ['pre-navigation-hook 1' , 'pre-navigation-hook 2' , 'request' , 'final handler' ]
422+
423+
@pytest.mark.parametrize(
    'http_client_class',
    [
        pytest.param(CurlImpersonateHttpClient, id='curl'),
        pytest.param(HttpxHttpClient, id='httpx'),
    ],
)
async def test_isolation_cookies(http_client_class: type[BaseHttpClient], httpbin: URL) -> None:
    """Cookies set in one session must not leak to a fresh session through the shared HTTP client."""
    http_client = http_client_class()
    sessions_ids: list[str] = []
    sessions_cookies: dict[str, dict[str, str]] = {}
    response_cookies: dict[str, dict[str, str]] = {}

    crawler = HttpCrawler(
        # A pool of size one guarantees both cookie-checking requests reuse the same
        # session until it is explicitly retired.
        session_pool=SessionPool(max_pool_size=1),
        http_client=http_client,
        # Sequential processing keeps the request/session pairing deterministic.
        concurrency_settings=ConcurrencySettings(max_concurrency=1),
    )

    @crawler.router.default_handler
    async def handler(context: HttpCrawlingContext) -> None:
        if not context.session:
            return

        sessions_ids.append(context.session.id)

        if context.request.unique_key not in {'1', '2'}:
            return

        # Snapshot the cookies: `context.session.cookies` may be a live mapping owned by
        # the session (TODO confirm), so store a copy rather than a mutable reference.
        sessions_cookies[context.session.id] = dict(context.session.cookies)
        response_data = json.loads(context.http_response.read())
        response_cookies[context.session.id] = response_data.get('cookies')

        if context.request.user_data.get('retire_session'):
            # Retire the session so the next request is forced onto a brand-new one.
            context.session.retire()

    await crawler.run(
        [
            # The first request sets the cookie in the session
            str(httpbin.with_path('/cookies/set').extend_query(a=1)),
            # With the second request, we check the cookies in the session and set retire
            Request.from_url(str(httpbin.with_path('/cookies')), unique_key='1', user_data={'retire_session': True}),
            # The third request is made with a new session to make sure it does not use another session's cookies
            Request.from_url(str(httpbin.with_path('/cookies')), unique_key='2'),
        ]
    )

    # Exactly three handler invocations are expected; fail with a clear assertion
    # here rather than an opaque IndexError on `sessions_ids[2]` below.
    assert len(sessions_ids) == 3

    assert len(sessions_cookies) == 2
    assert len(response_cookies) == 2

    # Requests 1 and 2 ran on the single pooled session.
    assert sessions_ids[0] == sessions_ids[1]

    cookie_session_id = sessions_ids[0]
    clean_session_id = sessions_ids[2]

    # Retiring the session must have produced a different session for request 3.
    assert cookie_session_id != clean_session_id

    # The initiated cookies must match in both the response and the session store
    assert sessions_cookies[cookie_session_id] == response_cookies[cookie_session_id] == {'a': '1'}

    # For a clean session, the cookie should not be in the session store or in the response
    # This way we can be sure that no cookies are being leaked through the http client
    assert sessions_cookies[clean_session_id] == response_cookies[clean_session_id] == {}
0 commit comments