Skip to content

Commit f0ea79c

Browse files
committed
request headers derives from dict
1 parent a6280a4 commit f0ea79c

7 files changed

Lines changed: 54 additions & 88 deletions

File tree

src/crawlee/_request.py

Lines changed: 3 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,11 @@
22

33
from __future__ import annotations
44

5-
from _collections_abc import dict_items, dict_keys
6-
from collections.abc import Iterator, Mapping, MutableMapping
5+
from collections.abc import Iterator, MutableMapping
76
from datetime import datetime
87
from decimal import Decimal
98
from enum import Enum
10-
from typing import Annotated, Any, cast, overload
9+
from typing import Annotated, Any, cast
1110

1211
from pydantic import (
1312
BaseModel,
@@ -17,12 +16,11 @@
1716
JsonValue,
1817
PlainSerializer,
1918
PlainValidator,
20-
RootModel,
2119
TypeAdapter,
2220
)
2321
from typing_extensions import Self
2422

25-
from crawlee._types import EnqueueStrategy, HttpMethod, HttpPayload, HttpQueryParams
23+
from crawlee._types import EnqueueStrategy, HttpHeaders, HttpMethod, HttpPayload, HttpQueryParams
2624
from crawlee._utils.requests import compute_unique_key, unique_key_to_request_id
2725
from crawlee._utils.urls import extract_query_params, validate_http_url
2826

@@ -98,73 +96,6 @@ def __len__(self) -> int:
9896
user_data_adapter = TypeAdapter(UserData)
9997

10098

101-
class HttpHeaders(RootModel):
102-
"""An immutable mapping for HTTP headers that ensures case-insensitivity for header names."""
103-
104-
def __init__(self, headers: Mapping[str, str] | None = None) -> None:
105-
"""Create a new instance.
106-
107-
Args:
108-
headers: A mapping of header names to values.
109-
"""
110-
# Ensure immutability by sorting and fixing the order.
111-
headers = headers or {}
112-
headers = {k.lower(): v for k, v in headers.items()}
113-
self._headers = dict(sorted(headers.items()))
114-
115-
@property
116-
def __dict__(self) -> dict[str, str]:
117-
"""Return the headers as a dictionary."""
118-
# We have to implement this because of `BaseModel.__iter__` implementation.
119-
return dict(self._headers)
120-
121-
@__dict__.setter
122-
def __dict__(self, value: dict[str, str]) -> None:
123-
"""Set the headers from a dictionary."""
124-
self._headers = {k.lower(): v for k, v in value.items()}
125-
126-
def __len__(self) -> int:
127-
"""Return the number of headers."""
128-
return len(self._headers)
129-
130-
def __repr__(self) -> str:
131-
"""Return a string representation of the object."""
132-
return f'{self.__class__.__name__}({self._headers})'
133-
134-
def __getitem__(self, key: str) -> str:
135-
"""Get the value of a header by its name, case-insensitive."""
136-
return self._headers[key.lower()]
137-
138-
def __setitem__(self, key: str, value: str) -> None:
139-
"""Prevent setting a header, as the object is immutable."""
140-
raise TypeError(f'{self.__class__.__name__} is immutable')
141-
142-
def __delitem__(self, key: str) -> None:
143-
"""Prevent deleting a header, as the object is immutable."""
144-
raise TypeError(f'{self.__class__.__name__} is immutable')
145-
146-
def keys(self) -> dict_keys[str, str]:
147-
"""Return an iterator over the header names."""
148-
return self._headers.keys()
149-
150-
def items(self) -> dict_items[str, str]:
151-
"""Return an iterator over the header names and values."""
152-
return self._headers.items()
153-
154-
@overload
155-
def get(self, key: str) -> str | None: ...
156-
157-
@overload
158-
def get(self, key: str, default: str) -> str: ...
159-
160-
@overload
161-
def get(self, key: str, default: None) -> None: ...
162-
163-
def get(self, key: str, default: str | None = None) -> str | None:
164-
"""Returns the value of the header if it exists, otherwise returns the default."""
165-
return self._headers.get(key, default)
166-
167-
16899
class BaseRequestData(BaseModel):
169100
"""Data needed to create a new crawling request."""
170101

src/crawlee/_types.py

Lines changed: 40 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
from collections.abc import Coroutine, Iterator, Sequence
1414

1515
from crawlee import Glob
16-
from crawlee._request import BaseRequestData, HttpHeaders, Request
16+
from crawlee._request import BaseRequestData, Request
1717
from crawlee.base_storage_client._models import DatasetItemsListPage
1818
from crawlee.http_clients import HttpResponse
1919
from crawlee.proxy_configuration import ProxyInfo
@@ -222,3 +222,42 @@ async def add_requests(
222222
) -> None:
223223
"""Track a call to the `add_requests` context helper."""
224224
self.add_requests_calls.append(AddRequestsFunctionCall(requests=requests, **kwargs))
225+
226+
227+
class HttpHeaders(dict[str, str]):
    """An immutable, case-insensitive mapping for HTTP headers.

    Header names are lowercased on construction and stored in sorted order so
    that two instances built from equivalent mappings compare equal. Because the
    class derives from `dict`, instances can be passed directly anywhere a plain
    mapping of headers is expected (e.g. `httpx`, `curl-cffi`, `{**headers}`).

    All mutating operations raise `TypeError` to enforce immutability.
    """

    def __init__(self, headers: dict[str, str] | None = None) -> None:
        """Create a new instance.

        Args:
            headers: A mapping of header names to values.
        """
        # Normalize names to lowercase and fix the order by sorting, so the
        # instance is deterministic regardless of the input ordering.
        normalized = dict(sorted((k.lower(), v) for k, v in (headers or {}).items()))
        # Populate the underlying dict: inherited operations (`dict(h)`,
        # `{**h}`, `items()`, `keys()`, `==`) read it directly, so leaving it
        # empty would make the instance behave as an empty mapping.
        super().__init__(normalized)
        self._headers = normalized

    def __iter__(self) -> Iterator[str]:
        """Return an iterator over the (lowercased, sorted) header names."""
        return iter(self._headers)

    def __len__(self) -> int:
        """Return the number of headers."""
        return len(self._headers)

    def __repr__(self) -> str:
        """Return a string representation of the object."""
        return f'{self._headers}'

    def __getitem__(self, key: str) -> str:
        """Get the value of a header by its name, case-insensitive."""
        return self._headers[key.lower()]

    def __contains__(self, key: object) -> bool:
        """Report whether a header is present, case-insensitive."""
        return isinstance(key, str) and key.lower() in self._headers

    def get(self, key: str, default: str | None = None) -> str | None:
        """Return the value of the header if it exists, otherwise the default (case-insensitive)."""
        return self._headers.get(key.lower(), default)

    def __setitem__(self, key: str, value: str) -> None:
        """Prevent setting a header, as the object is immutable."""
        raise TypeError(f'{self.__class__.__name__} is immutable')

    def __delitem__(self, key: str) -> None:
        """Prevent deleting a header, as the object is immutable."""
        raise TypeError(f'{self.__class__.__name__} is immutable')

    # Block the remaining inherited mutators so immutability cannot be
    # bypassed via the `dict` base class.
    def update(self, *args: Any, **kwargs: Any) -> None:
        """Prevent updating headers, as the object is immutable."""
        raise TypeError(f'{self.__class__.__name__} is immutable')

    def setdefault(self, *args: Any, **kwargs: Any) -> str:
        """Prevent inserting a header, as the object is immutable."""
        raise TypeError(f'{self.__class__.__name__} is immutable')

    def pop(self, *args: Any, **kwargs: Any) -> str:
        """Prevent removing a header, as the object is immutable."""
        raise TypeError(f'{self.__class__.__name__} is immutable')

    def popitem(self) -> tuple[str, str]:
        """Prevent removing a header, as the object is immutable."""
        raise TypeError(f'{self.__class__.__name__} is immutable')

    def clear(self) -> None:
        """Prevent clearing headers, as the object is immutable."""
        raise TypeError(f'{self.__class__.__name__} is immutable')

src/crawlee/basic_crawler/_basic_crawler.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,8 @@
2323
from crawlee._autoscaling.snapshotter import Snapshotter
2424
from crawlee._autoscaling.system_status import SystemStatus
2525
from crawlee._log_config import configure_logger, get_configured_log_level
26-
from crawlee._request import BaseRequestData, HttpHeaders, Request, RequestState
27-
from crawlee._types import BasicCrawlingContext, RequestHandlerRunResult, SendRequestFunction
26+
from crawlee._request import BaseRequestData, Request, RequestState
27+
from crawlee._types import BasicCrawlingContext, HttpHeaders, RequestHandlerRunResult, SendRequestFunction
2828
from crawlee._utils.byte_size import ByteSize
2929
from crawlee._utils.http import is_status_code_client_error
3030
from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute

src/crawlee/fingerprint_suite/_header_generator.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,14 @@
11
from __future__ import annotations
22

33
import random
4-
from typing import TYPE_CHECKING
54

65
from crawlee.fingerprint_suite._consts import COMMON_ACCEPT, COMMON_ACCEPT_LANGUAGE, USER_AGENT_POOL
76

8-
if TYPE_CHECKING:
9-
from collections.abc import Mapping
10-
117

128
class HeaderGenerator:
139
"""Generates common headers for HTTP requests."""
1410

15-
def get_common_headers(self) -> Mapping[str, str]:
11+
def get_common_headers(self) -> dict[str, str]:
1612
"""Get common headers for HTTP requests.
1713
1814
We do not modify the 'Accept-Encoding', 'Connection' and other headers. They should be included and handled

src/crawlee/http_clients/_httpx.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
import httpx
77
from typing_extensions import override
88

9-
from crawlee._request import HttpHeaders
9+
from crawlee._types import HttpHeaders
1010
from crawlee._utils.blocked import ROTATE_PROXY_ERRORS
1111
from crawlee.errors import ProxyError
1212
from crawlee.fingerprint_suite import HeaderGenerator
@@ -130,7 +130,7 @@ async def crawl(
130130
http_request = client.build_request(
131131
url=request.url,
132132
method=request.method,
133-
headers=dict(headers) if headers else None,
133+
headers=headers,
134134
params=request.query_params,
135135
data=request.payload,
136136
cookies=session.cookies if session else None,
@@ -177,7 +177,7 @@ async def send_request(
177177
http_request = client.build_request(
178178
url=url,
179179
method=method,
180-
headers=dict(headers) if headers else None,
180+
headers=headers,
181181
params=query_params,
182182
data=payload,
183183
extensions={'crawlee_session': session if self._persist_cookies_per_session else None},
@@ -230,7 +230,7 @@ def _combine_headers(self, explicit_headers: HttpHeaders | None) -> HttpHeaders
230230
headers = HttpHeaders(common_headers)
231231

232232
if explicit_headers:
233-
headers = HttpHeaders({**dict(headers), **dict(headers)})
233+
headers = HttpHeaders({**headers, **explicit_headers})
234234

235235
return headers if headers else None
236236

src/crawlee/http_clients/curl_impersonate.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,7 @@ async def crawl(
118118
response = await client.request(
119119
url=request.url,
120120
method=request.method.upper(), # type: ignore # curl-cffi requires uppercase method
121-
headers=dict(request.headers) if request.headers else None,
121+
headers=request.headers,
122122
params=request.query_params,
123123
data=request.payload,
124124
cookies=session.cookies if session else None,
@@ -163,7 +163,7 @@ async def send_request(
163163
response = await client.request(
164164
url=url,
165165
method=method.upper(), # type: ignore # curl-cffi requires uppercase method
166-
headers=dict(headers) if headers else None,
166+
headers=headers,
167167
params=query_params,
168168
data=payload,
169169
cookies=session.cookies if session else None,

tests/unit/basic_crawler/test_basic_crawler.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,8 @@
1515
import pytest
1616

1717
from crawlee import ConcurrencySettings, EnqueueStrategy, Glob
18-
from crawlee._request import BaseRequestData, HttpHeaders, Request
19-
from crawlee._types import AddRequestsKwargs, BasicCrawlingContext
18+
from crawlee._request import BaseRequestData, Request
19+
from crawlee._types import AddRequestsKwargs, BasicCrawlingContext, HttpHeaders
2020
from crawlee.basic_crawler import BasicCrawler
2121
from crawlee.configuration import Configuration
2222
from crawlee.errors import SessionError, UserDefinedErrorHandlerError

0 commit comments

Comments
 (0)