Skip to content

Commit 4c3d891

Browse files
committed
request headers derive from dict
1 parent 414ac66 commit 4c3d891

8 files changed

Lines changed: 57 additions & 93 deletions

File tree

src/crawlee/_request.py

Lines changed: 3 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,11 @@
22

33
from __future__ import annotations
44

5-
from _collections_abc import dict_items, dict_keys
6-
from collections.abc import Iterator, Mapping, MutableMapping
5+
from collections.abc import Iterator, MutableMapping
76
from datetime import datetime
87
from decimal import Decimal
98
from enum import Enum
10-
from typing import Annotated, Any, cast, overload
9+
from typing import Annotated, Any, cast
1110

1211
from pydantic import (
1312
BaseModel,
@@ -17,12 +16,11 @@
1716
JsonValue,
1817
PlainSerializer,
1918
PlainValidator,
20-
RootModel,
2119
TypeAdapter,
2220
)
2321
from typing_extensions import Self
2422

25-
from crawlee._types import EnqueueStrategy, HttpMethod, HttpPayload, HttpQueryParams
23+
from crawlee._types import EnqueueStrategy, HttpHeaders, HttpMethod, HttpPayload, HttpQueryParams
2624
from crawlee._utils.requests import compute_unique_key, unique_key_to_request_id
2725
from crawlee._utils.urls import extract_query_params, validate_http_url
2826

@@ -98,73 +96,6 @@ def __len__(self) -> int:
9896
user_data_adapter = TypeAdapter(UserData)
9997

10098

101-
class HttpHeaders(RootModel):
102-
"""An immutable mapping for HTTP headers that ensures case-insensitivity for header names."""
103-
104-
def __init__(self, headers: Mapping[str, str] | None = None) -> None:
105-
"""Create a new instance.
106-
107-
Args:
108-
headers: A mapping of header names to values.
109-
"""
110-
# Ensure immutability by sorting and fixing the order.
111-
headers = headers or {}
112-
headers = {k.lower(): v for k, v in headers.items()}
113-
self._headers = dict(sorted(headers.items()))
114-
115-
@property
116-
def __dict__(self) -> dict[str, str]:
117-
"""Return the headers as a dictionary."""
118-
# We have to implement this because of `BaseModel.__iter__` implementation.
119-
return dict(self._headers)
120-
121-
@__dict__.setter
122-
def __dict__(self, value: dict[str, str]) -> None:
123-
"""Set the headers from a dictionary."""
124-
self._headers = {k.lower(): v for k, v in value.items()}
125-
126-
def __len__(self) -> int:
127-
"""Return the number of headers."""
128-
return len(self._headers)
129-
130-
def __repr__(self) -> str:
131-
"""Return a string representation of the object."""
132-
return f'{self.__class__.__name__}({self._headers})'
133-
134-
def __getitem__(self, key: str) -> str:
135-
"""Get the value of a header by its name, case-insensitive."""
136-
return self._headers[key.lower()]
137-
138-
def __setitem__(self, key: str, value: str) -> None:
139-
"""Prevent setting a header, as the object is immutable."""
140-
raise TypeError(f'{self.__class__.__name__} is immutable')
141-
142-
def __delitem__(self, key: str) -> None:
143-
"""Prevent deleting a header, as the object is immutable."""
144-
raise TypeError(f'{self.__class__.__name__} is immutable')
145-
146-
def keys(self) -> dict_keys[str, str]:
147-
"""Return an iterator over the header names."""
148-
return self._headers.keys()
149-
150-
def items(self) -> dict_items[str, str]:
151-
"""Return an iterator over the header names and values."""
152-
return self._headers.items()
153-
154-
@overload
155-
def get(self, key: str) -> str | None: ...
156-
157-
@overload
158-
def get(self, key: str, default: str) -> str: ...
159-
160-
@overload
161-
def get(self, key: str, default: None) -> None: ...
162-
163-
def get(self, key: str, default: str | None = None) -> str | None:
164-
"""Returns the value of the header if it exists, otherwise returns the default."""
165-
return self._headers.get(key, default)
166-
167-
16899
class BaseRequestData(BaseModel):
169100
"""Data needed to create a new crawling request."""
170101

src/crawlee/_types.py

Lines changed: 41 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,10 @@
99
if TYPE_CHECKING:
1010
import logging
1111
import re
12-
from collections.abc import Coroutine, Sequence
12+
from collections.abc import Coroutine, Iterator, Sequence
1313

1414
from crawlee import Glob
15-
from crawlee._request import BaseRequestData, HttpHeaders, Request
15+
from crawlee._request import BaseRequestData, Request
1616
from crawlee.base_storage_client._models import DatasetItemsListPage
1717
from crawlee.http_clients import HttpResponse
1818
from crawlee.proxy_configuration import ProxyInfo
@@ -221,3 +221,42 @@ async def add_requests(
221221
) -> None:
222222
"""Track a call to the `add_requests` context helper."""
223223
self.add_requests_calls.append(AddRequestsFunctionCall(requests=requests, **kwargs))
224+
225+
226+
class HttpHeaders(dict[str, str]):
227+
"""An immutable mapping for HTTP headers that ensures case-insensitivity for header names."""
228+
229+
def __init__(self, headers: dict[str, str] | None = None) -> None:
230+
"""Create a new instance.
231+
232+
Args:
233+
headers: A mapping of header names to values.
234+
"""
235+
# Ensure immutability by sorting and fixing the order.
236+
headers = headers or {}
237+
headers = {k.lower(): v for k, v in headers.items()}
238+
self._headers = dict(sorted(headers.items()))
239+
240+
def __iter__(self) -> Iterator[str]:
241+
"""Return an iterator over the header names."""
242+
return iter(self._headers)
243+
244+
def __len__(self) -> int:
245+
"""Return the number of headers."""
246+
return len(self._headers)
247+
248+
def __repr__(self) -> str:
249+
"""Return a string representation of the object."""
250+
return f'{self._headers}'
251+
252+
def __getitem__(self, key: str) -> str:
253+
"""Get the value of a header by its name, case-insensitive."""
254+
return self._headers[key.lower()]
255+
256+
def __setitem__(self, key: str, value: str) -> None:
257+
"""Prevent setting a header, as the object is immutable."""
258+
raise TypeError(f'{self.__class__.__name__} is immutable')
259+
260+
def __delitem__(self, key: str) -> None:
261+
"""Prevent deleting a header, as the object is immutable."""
262+
raise TypeError(f'{self.__class__.__name__} is immutable')

src/crawlee/basic_crawler/_basic_crawler.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,8 @@
2323
from crawlee._autoscaling.snapshotter import Snapshotter
2424
from crawlee._autoscaling.system_status import SystemStatus
2525
from crawlee._log_config import configure_logger, get_configured_log_level
26-
from crawlee._request import BaseRequestData, HttpHeaders, Request, RequestState
27-
from crawlee._types import BasicCrawlingContext, RequestHandlerRunResult, SendRequestFunction
26+
from crawlee._request import BaseRequestData, Request, RequestState
27+
from crawlee._types import BasicCrawlingContext, HttpHeaders, RequestHandlerRunResult, SendRequestFunction
2828
from crawlee._utils.byte_size import ByteSize
2929
from crawlee._utils.http import is_status_code_client_error
3030
from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute

src/crawlee/fingerprint_suite/_header_generator.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,14 @@
11
from __future__ import annotations
22

33
import random
4-
from typing import TYPE_CHECKING
54

65
from crawlee.fingerprint_suite._consts import COMMON_ACCEPT, COMMON_ACCEPT_LANGUAGE, USER_AGENT_POOL
76

8-
if TYPE_CHECKING:
9-
from collections.abc import Mapping
10-
117

128
class HeaderGenerator:
139
"""Generates common headers for HTTP requests."""
1410

15-
def get_common_headers(self) -> Mapping[str, str]:
11+
def get_common_headers(self) -> dict[str, str]:
1612
"""Get common headers for HTTP requests.
1713
1814
We do not modify the 'Accept-Encoding', 'Connection' and other headers. They should be included and handled

src/crawlee/http_clients/_base.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,7 @@
1010
if TYPE_CHECKING:
1111
from collections.abc import Iterable
1212

13-
from crawlee._request import HttpHeaders
14-
from crawlee._types import HttpMethod, HttpPayload, HttpQueryParams
13+
from crawlee._types import HttpHeaders, HttpMethod, HttpPayload, HttpQueryParams
1514
from crawlee.base_storage_client._models import Request
1615
from crawlee.proxy_configuration import ProxyInfo
1716
from crawlee.sessions import Session

src/crawlee/http_clients/_httpx.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
import httpx
77
from typing_extensions import override
88

9-
from crawlee._request import HttpHeaders
9+
from crawlee._types import HttpHeaders
1010
from crawlee._utils.blocked import ROTATE_PROXY_ERRORS
1111
from crawlee.errors import ProxyError
1212
from crawlee.fingerprint_suite import HeaderGenerator
@@ -130,7 +130,7 @@ async def crawl(
130130
http_request = client.build_request(
131131
url=request.url,
132132
method=request.method,
133-
headers=dict(headers) if headers else None,
133+
headers=headers,
134134
params=request.query_params,
135135
data=request.payload,
136136
cookies=session.cookies if session else None,
@@ -177,7 +177,7 @@ async def send_request(
177177
http_request = client.build_request(
178178
url=url,
179179
method=method,
180-
headers=dict(headers) if headers else None,
180+
headers=headers,
181181
params=query_params,
182182
data=payload,
183183
extensions={'crawlee_session': session if self._persist_cookies_per_session else None},
@@ -230,7 +230,7 @@ def _combine_headers(self, explicit_headers: HttpHeaders | None) -> HttpHeaders
230230
headers = HttpHeaders(common_headers)
231231

232232
if explicit_headers:
233-
headers = HttpHeaders({**dict(headers), **dict(headers)})
233+
headers = HttpHeaders({**headers, **explicit_headers})
234234

235235
return headers if headers else None
236236

src/crawlee/http_clients/curl_impersonate.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,7 @@
2525

2626
from curl_cffi.requests import Response
2727

28-
from crawlee._request import HttpHeaders
29-
from crawlee._types import HttpMethod, HttpPayload, HttpQueryParams
28+
from crawlee._types import HttpHeaders, HttpMethod, HttpPayload, HttpQueryParams
3029
from crawlee.base_storage_client._models import Request
3130
from crawlee.proxy_configuration import ProxyInfo
3231
from crawlee.sessions import Session
@@ -119,7 +118,7 @@ async def crawl(
119118
response = await client.request(
120119
url=request.url,
121120
method=request.method.upper(), # type: ignore # curl-cffi requires uppercase method
122-
headers=dict(request.headers) if request.headers else None,
121+
headers=request.headers,
123122
params=request.query_params,
124123
data=request.payload,
125124
cookies=session.cookies if session else None,
@@ -164,7 +163,7 @@ async def send_request(
164163
response = await client.request(
165164
url=url,
166165
method=method.upper(), # type: ignore # curl-cffi requires uppercase method
167-
headers=dict(headers) if headers else None,
166+
headers=headers,
168167
params=query_params,
169168
data=payload,
170169
cookies=session.cookies if session else None,

tests/unit/basic_crawler/test_basic_crawler.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,8 @@
1515
import pytest
1616

1717
from crawlee import ConcurrencySettings, EnqueueStrategy, Glob
18-
from crawlee._request import BaseRequestData, HttpHeaders, Request
19-
from crawlee._types import AddRequestsKwargs, BasicCrawlingContext
18+
from crawlee._request import BaseRequestData, Request
19+
from crawlee._types import AddRequestsKwargs, BasicCrawlingContext, HttpHeaders
2020
from crawlee.basic_crawler import BasicCrawler
2121
from crawlee.configuration import Configuration
2222
from crawlee.errors import SessionError, UserDefinedErrorHandlerError

0 commit comments

Comments (0)