Skip to content

Commit 3d7489c

Browse files
committed
Add security tests and enhance input sanitization mechanisms
- Introduced comprehensive security tests, including CORS configuration, XML injection prevention, path traversal prevention, and information disclosure protection. - Improved input sanitization for filenames and XML content to mitigate injection attacks and ensure safe handling. - Updated CORS middleware to restrict HTTP methods and block credentials with wildcard origins. - Removed traceback exposure in API error responses, even in debug mode. - Verified rate limiting and input validation for sensitive API endpoints. - Ensured no hardcoded secrets are present in the main codebase.
1 parent e72876f commit 3d7489c

File tree

3 files changed

+449
-23
lines changed

3 files changed

+449
-23
lines changed

tests/test_security.py

Lines changed: 395 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,395 @@
1+
import os
2+
import re
3+
import tempfile
4+
from pathlib import Path
5+
6+
import pytest
7+
from fastapi.testclient import TestClient
8+
9+
from theHarvester.__main__ import sanitize_filename, sanitize_for_xml
10+
11+
12+
class TestCORSConfiguration:
    """Test CORS security configuration."""

    @staticmethod
    def _cors_options():
        """
        Return the keyword options the app's CORS middleware was registered with.

        Fails the calling test when no CORS middleware is registered at all.
        """
        from theHarvester.lib.api.api import app

        # Match on the class name so the test does not have to import the middleware class itself.
        cors_middleware = next(
            (middleware for middleware in app.user_middleware if 'CORSMiddleware' in str(middleware.cls)),
            None,
        )
        assert cors_middleware is not None, 'CORS middleware should be configured'
        return cors_middleware.kwargs

    def test_cors_does_not_allow_credentials_with_wildcard_origins(self):
        """
        Security Test: CORS should not allow credentials with wildcard origins.

        This prevents credential theft attacks where any origin can make
        authenticated requests to the API.
        """
        options = self._cors_options()
        allow_origins = options.get('allow_origins', [])
        allow_credentials = options.get('allow_credentials', False)

        # The combination is only dangerous when the origin list is a wildcard.
        if '*' in allow_origins:
            assert (
                allow_credentials is False
            ), 'CRITICAL: CORS must not allow credentials with wildcard origins (CVE risk)'

    def test_cors_restricts_http_methods(self):
        """
        Security Test: CORS should restrict HTTP methods to only what's needed.

        Reduces attack surface by limiting available methods.
        """
        allow_methods = self._cors_options().get('allow_methods', [])

        # A bare string would otherwise be iterated character by character.
        if isinstance(allow_methods, str):
            allow_methods = [allow_methods]
        allowed_set = {method.upper() for method in allow_methods}

        # Reject the wildcard wherever it appears, for lists and tuples alike.
        assert '*' not in allowed_set, 'CORS should restrict HTTP methods, not allow all (*)'

        # Should only allow necessary methods (GET, POST for this API)
        dangerous_methods = {'DELETE', 'PUT', 'PATCH', 'TRACE', 'CONNECT'}
        unexpected = allowed_set & dangerous_methods
        assert not unexpected, f'Unnecessary HTTP methods detected: {unexpected}'
73+
74+
75+
class TestXMLInjectionPrevention:
76+
"""Test XML injection prevention."""
77+
78+
def test_sanitize_for_xml_escapes_special_characters(self):
79+
"""
80+
Security Test: Verify XML special characters are properly escaped.
81+
82+
Prevents XML injection attacks.
83+
"""
84+
# Test all XML special characters
85+
test_cases = [
86+
('&', '&'),
87+
('<', '&lt;'),
88+
('>', '&gt;'),
89+
('"', '&quot;'),
90+
("'", '&apos;'),
91+
('<script>alert("XSS")</script>', '&lt;script&gt;alert(&quot;XSS&quot;)&lt;/script&gt;'),
92+
('[email protected] & <test>', '[email protected] &amp; &lt;test&gt;'),
93+
('Normal text', 'Normal text'),
94+
]
95+
96+
for input_text, expected_output in test_cases:
97+
result = sanitize_for_xml(input_text)
98+
assert result == expected_output, f'Failed to properly escape: {input_text}'
99+
100+
def test_sanitize_for_xml_prevents_xml_entity_injection(self):
101+
"""
102+
Security Test: Prevent XML entity injection attempts.
103+
"""
104+
malicious_inputs = [
105+
'<?xml version="1.0"?><!DOCTYPE foo [<!ENTITY xxe SYSTEM "file:///etc/passwd">]>',
106+
'<!ENTITY xxe SYSTEM "file:///dev/random">',
107+
'<![CDATA[malicious]]>',
108+
'&#x3C;script&#x3E;',
109+
]
110+
111+
for malicious_input in malicious_inputs:
112+
result = sanitize_for_xml(malicious_input)
113+
# Ensure dangerous characters are escaped
114+
assert '&lt;' in result or '&amp;' in result, f'Failed to sanitize: {malicious_input}'
115+
assert '<' not in result or result == malicious_input.replace('<', '&lt;'), f'XML tags not escaped: {malicious_input}'
116+
117+
def test_command_line_args_are_sanitized_in_xml_output(self):
118+
"""
119+
Security Test: Command line arguments must be sanitized before XML output.
120+
121+
This test is a conceptual check - in real usage, ensure the XML writing
122+
code uses sanitize_for_xml() on all user-controlled data.
123+
"""
124+
# Simulate dangerous command line arguments
125+
dangerous_args = [
126+
'--domain=test.com',
127+
"--source='<script>alert(1)</script>'",
128+
'--output="; rm -rf /',
129+
'--domain=example.com&param=<injection>',
130+
]
131+
132+
for arg in dangerous_args:
133+
sanitized = sanitize_for_xml(arg)
134+
# Verify no unescaped XML special characters remain
135+
assert '<script>' not in sanitized, f'Script tag not escaped in: {arg}'
136+
assert '&param=' not in sanitized or '&amp;' in sanitized, f'Ampersand not escaped in: {arg}'
137+
138+
139+
class TestInformationDisclosure:
    """Test information disclosure prevention."""

    # Regexes for filesystem locations that should never appear in a client-facing error body.
    _PATH_PATTERNS = (
        r'/home/\w+/',
        r'/usr/local/',
        r'C:\\Users\\',
        r'/var/www/',
        r'site-packages/',
        r'\.py:\d+',  # filename.py:123
    )

    @pytest.fixture
    def client(self):
        """Create a test client for API testing."""
        from theHarvester.lib.api.api import app

        # Do not re-raise server-side exceptions inside the test process: these tests
        # need to inspect the error response exactly as a remote client would receive it.
        return TestClient(app, raise_server_exceptions=False)

    @staticmethod
    def _error_body(response):
        """Return the decoded JSON body of *response*, or its raw text when the body is not JSON."""
        try:
            return response.json()
        except ValueError:
            # Plain-text error pages (e.g. a bare 500) are not JSON; inspect them verbatim.
            return response.text

    def test_api_does_not_expose_traceback_in_error_responses(self, client):
        """
        Security Test: API should never expose stack traces to clients.

        Stack traces can reveal sensitive information about the system.
        """
        # Test the /sources endpoint with a simulated error condition
        response = client.get('/sources')

        # Even if there's an error, traceback should not be in response
        if response.status_code >= 400:
            response_data = self._error_body(response)
            assert 'traceback' not in response_data, 'Traceback exposed in error response'
            assert 'Traceback' not in str(response_data), 'Traceback text found in response'
            assert 'File "' not in str(response_data), 'File paths exposed in response'

    def test_error_responses_do_not_leak_internal_paths(self, client):
        """
        Security Test: Error messages should not reveal internal file paths.
        """
        # Try various endpoints
        endpoints = ['/sources', '/dnsbrute?domain=test', '/query?domain=test&source=baidu']

        for endpoint in endpoints:
            response = client.get(endpoint)
            # Only non-200 bodies are inspected for leaked paths.
            if response.status_code == 200:
                continue
            response_text = str(self._error_body(response))

            # Check for common path leakage patterns
            for pattern in self._PATH_PATTERNS:
                matches = re.findall(pattern, response_text)
                assert not matches, f'Internal path leaked in {endpoint}: {matches}'

    def test_debug_mode_does_not_expose_sensitive_info(self, client, monkeypatch):
        """
        Security Test: Even with DEBUG=1, sensitive info should not be exposed to clients.
        """
        # NOTE(review): the app module is already imported when this runs, so this only
        # has an effect if the API reads DEBUG at request time - confirm against api.py.
        monkeypatch.setenv('DEBUG', '1')

        # Make request that might trigger an error
        response = client.get('/dnsbrute?domain=')  # Invalid request

        if response.status_code >= 400:
            response_data = self._error_body(response)
            # Even with DEBUG=1, traceback should NOT be sent to client
            assert 'traceback' not in response_data, 'DEBUG mode exposes tracebacks to clients'
204+
205+
206+
class TestPathTraversalPrevention:
    """Test path traversal prevention."""

    def test_sanitize_filename_removes_path_components(self):
        """
        Security Test: Filenames should not contain path traversal sequences.
        """
        dangerous_filenames = [
            '../../../etc/passwd',
            '..\\..\\..\\windows\\system32\\config\\sam',
            '/etc/passwd',
            'C:\\Windows\\System32\\config\\sam',
            '../../sensitive_file.txt',
            './../hidden_file',
            'subdir/../../../etc/passwd',
        ]

        for dangerous_filename in dangerous_filenames:
            result = sanitize_filename(dangerous_filename)

            # Should not contain any path separators
            assert '/' not in result, f'Path separator found in sanitized filename: {result}'
            assert '\\' not in result, f'Windows path separator found: {result}'

            # Should not start with .. (parent directory reference at the beginning is most dangerous)
            assert not result.startswith('..'), f'Parent directory reference at start: {result}'

            # Should only be the basename
            assert os.path.dirname(result) == '', f'Path component remains: {result}'

    def test_sanitize_filename_removes_dangerous_characters(self):
        """
        Security Test: Filenames should only contain safe characters.
        """
        test_cases = [
            'file; rm -rf /',
            'file`whoami`.txt',
            'file$(malicious).txt',
            'file|cmd.txt',
            'file&background.txt',
            'normal-file_123.txt',
        ]
        # Shell metacharacters that must never survive sanitization.
        dangerous_chars = [';', '|', '&', '$', '`', '(', ')', '{', '}', '[', ']', '<', '>']

        for input_filename in test_cases:
            result = sanitize_filename(input_filename)

            # Should not be empty
            assert len(result) > 0, f'Sanitized filename is empty for: {input_filename}'

            # Should not contain shell special characters
            for char in dangerous_chars:
                assert char not in result, f'Dangerous character {char} found in: {result}'

            # Should only contain alphanumeric, dash, underscore, and dot
            assert re.match(r'^[a-zA-Z0-9._-]+$', result), f'Invalid characters in sanitized filename: {result}'

    def test_sanitize_filename_prevents_hidden_files(self):
        """
        Security Test: Prevent creation of hidden files.
        """
        hidden_files = ['.bashrc', '.ssh_config', '.env', '..hidden', '.']

        for hidden_file in hidden_files:
            result = sanitize_filename(hidden_file)

            # Should not start with a dot (except for allowed extensions)
            if result:  # If not empty
                assert not result.startswith('.'), f'Hidden file not prevented: {result}'

    def test_filename_sanitization_preserves_safe_filenames(self):
        """
        Security Test: Safe filenames should remain mostly unchanged.
        """
        safe_filenames = [
            'report.json',
            'results_2024-01-17.xml',
            'scan-output.txt',
            'data_file_v2.csv',
        ]

        for safe_filename in safe_filenames:
            result = sanitize_filename(safe_filename)

            # Safe filenames should be preserved (possibly with minor changes)
            assert len(result) > 0, 'Safe filename was completely removed'
            if '.' in safe_filename:
                assert '.' in result, 'File extension removed incorrectly'

    def test_path_traversal_in_file_operations(self):
        """
        Integration Test: Verify file operations don't allow path traversal.
        """
        # Simulate user input
        user_input = '../../../etc/passwd'
        sanitized = sanitize_filename(user_input)

        # Try to create a file with sanitized name
        with tempfile.TemporaryDirectory() as tmpdir:
            # Resolve both sides: commonpath() does not collapse '..' segments, and the
            # temporary directory itself may sit behind a symlink on some platforms.
            base_dir = os.path.realpath(tmpdir)
            safe_path = os.path.realpath(os.path.join(base_dir, sanitized))

            # Ensure the resolved path is still within tmpdir
            assert os.path.commonpath([base_dir, safe_path]) == base_dir, 'Path traversal detected!'

            # Require a real child of the directory. A plain substring test would also
            # accept a sibling such as '<tmpdir>-evil'.
            assert safe_path.startswith(base_dir + os.sep), 'File path escaped temporary directory'
314+
315+
316+
class TestSecurityBestPractices:
    """Additional security best practices tests."""

    def test_no_hardcoded_secrets_in_code(self):
        """
        Security Test: Ensure no hardcoded secrets in main code files.

        The files are located relative to the repository root (the parent of this
        tests directory), so the scan does not depend on the current working directory.
        The test is skipped, not silently passed, when none of the files can be found.
        """
        repo_root = Path(__file__).resolve().parent.parent

        # Check main application files for common secret patterns
        files_to_check = [
            'theHarvester/__main__.py',
            'theHarvester/lib/api/api.py',
            'theHarvester/lib/core.py',
        ]

        # Patterns that might indicate hardcoded secrets
        secret_patterns = [
            re.compile(pattern, re.IGNORECASE)
            for pattern in (
                r'password\s*=\s*["\'][^"\']+["\']',
                r'api_key\s*=\s*["\'][a-zA-Z0-9]{20,}["\']',
                r'secret\s*=\s*["\'][^"\']+["\']',
                r'token\s*=\s*["\'][a-zA-Z0-9]{20,}["\']',
            )
        ]

        scanned = 0
        for file_path in files_to_check:
            source_file = repo_root / file_path
            if not source_file.is_file():
                continue
            scanned += 1
            content = source_file.read_text(encoding='utf-8')

            for pattern in secret_patterns:
                # Filter out obvious non-secrets (like example values, empty strings, variable names)
                real_matches = [
                    m
                    for m in pattern.findall(content)
                    if 'example' not in m.lower() and 'your_' not in m.lower() and '""' not in m and "''" not in m
                ]
                assert not real_matches, f'Potential hardcoded secret in {file_path}: {real_matches}'

        if not scanned:
            pytest.skip(f'None of the source files to scan were found under {repo_root}')

    def test_api_has_rate_limiting(self):
        """
        Security Test: Verify API endpoints have rate limiting enabled.
        """
        from theHarvester.lib.api.api import app

        # Check that rate limiting is configured
        assert hasattr(app.state, 'limiter'), 'Rate limiter not configured'
        assert app.state.limiter is not None, 'Rate limiter is None'

    def test_sensitive_endpoints_require_validation(self):
        """
        Security Test: Ensure sensitive endpoints validate input.
        """
        from theHarvester.lib.api.api import app

        client = TestClient(app)

        # Endpoints that must be rejected with some 4xx/5xx status.
        # Note: The /query endpoint requires 'source' as a list parameter
        invalid_requests = [
            '/dnsbrute?domain=',  # Empty domain should be rejected
        ]

        for endpoint in invalid_requests:
            response = client.get(endpoint)
            assert (
                response.status_code >= 400
            ), f'Endpoint {endpoint} should reject invalid input (got {response.status_code})'

        # Test query endpoint with proper parameter format but invalid domain
        response = client.get('/query?domain=a&source=baidu')  # Too short domain
        # This may or may not fail depending on validation, but we check it doesn't crash
        assert response.status_code in [200, 400, 422, 500], 'Unexpected status code'
392+
393+
394+
if __name__ == '__main__':
    # Propagate pytest's exit code so running this file directly fails when a test fails.
    raise SystemExit(pytest.main([__file__, '-v']))

0 commit comments

Comments
 (0)