
Commit fa2dedd

Refactor and standardize output handling with new utilities in theHarvester.lib.output, add tests for print_linkedin_sections and sorted_unique. Fix regex inaccuracies and enhance CORS validation.
1 parent 4686533 commit fa2dedd

File tree

tests/lib/test_output.py
tests/test_security.py
theHarvester/__main__.py
theHarvester/lib/api/api_example.py
theHarvester/lib/output.py
theHarvester/parsers/myparser.py

6 files changed, +118 -70 lines changed
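
The commit replaces a repeated print/sort/de-duplicate idiom with shared helpers in theHarvester.lib.output. A minimal sketch of the before/after pattern, using illustrative data rather than anything from the commit:

```python
from theHarvester.lib.output import print_section, sorted_unique

results = ['b.example', 'a.example', 'b.example']  # illustrative data

# Before: each results section hand-rolled the header, separator, dedup, and loop.
print(f'\n[*] Hosts found: {len(results)}')
print('--------------------')
for item in sorted(set(results)):
    print(item)

# After: one call prints the header, separator, and sorted unique items,
# and callers keep a de-duplicated copy for later use.
print_section(f'\n[*] Hosts found: {len(results)}', results, '--------------------')
results = sorted_unique(results)
```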

tests/lib/test_output.py

Lines changed: 38 additions & 0 deletions
@@ -0,0 +1,38 @@
+from __future__ import annotations
+
+
+from theHarvester.lib.output import print_linkedin_sections, sorted_unique
+
+
+def test_sorted_unique_sorts_and_deduplicates() -> None:
+    assert sorted_unique(["b", "a", "b"]) == ["a", "b"]
+
+
+def test_print_linkedin_sections_prints_links_when_present(capsys) -> None:
+    # Regression coverage: the CLI previously never printed LinkedIn links when the list was non-empty.
+    print_linkedin_sections(
+        engines=["linkedin"],
+        people=[],
+        links=["https://b.example", "https://a.example", "https://a.example"],
+    )
+
+    out = capsys.readouterr().out
+    assert "No LinkedIn users found" in out
+    assert "LinkedIn Links found: 3" in out
+    assert "https://a.example" in out
+    assert "https://b.example" in out
+
+
+def test_print_linkedin_sections_prints_people_and_links(capsys) -> None:
+    print_linkedin_sections(
+        engines=["rocketreach"],
+        people=["bob", "alice", "bob"],
+        links=["https://z.example", "https://z.example"],
+    )
+
+    out = capsys.readouterr().out
+    assert "LinkedIn Users found: 3" in out
+    assert "alice" in out
+    assert "bob" in out
+    assert "LinkedIn Links found: 2" in out
+    assert "https://z.example" in out

tests/test_security.py

Lines changed: 1 addition & 1 deletion
@@ -36,7 +36,7 @@ def test_cors_does_not_allow_credentials_with_wildcard_origins(self):
         allow_origins = options.get('allow_origins', [])
         allow_credentials = options.get('allow_credentials', False)
 
-        if '*' in allow_origins:
+        if isinstance(allow_origins, (list, tuple, set)) and '*' in allow_origins:
            assert (
                allow_credentials is False
            ), 'CRITICAL: CORS must not allow credentials with wildcard origins (CVE risk)'
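
The tightened check enforces the same invariant as before: wildcard origins and credentialed requests must not be combined. A configuration that satisfies it, sketched with Starlette's CORSMiddleware (illustrative settings, not the project's actual API wiring):

```python
from starlette.applications import Starlette
from starlette.middleware.cors import CORSMiddleware

app = Starlette()
# Wildcard origins are only acceptable with credentials disabled,
# which is exactly what the test above asserts.
app.add_middleware(
    CORSMiddleware,
    allow_origins=['*'],
    allow_credentials=False,
    allow_methods=['GET'],
    allow_headers=['*'],
)
```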

theHarvester/__main__.py

Lines changed: 17 additions & 37 deletions
@@ -72,6 +72,7 @@
 from theHarvester.discovery.constants import MissingKey
 from theHarvester.lib import hostchecker, stash
 from theHarvester.lib.core import DATA_DIR, Core, show_default_error_message
+from theHarvester.lib.output import print_linkedin_sections, print_section, sorted_unique
 from theHarvester.screenshot.screenshot import ScreenShotter
 
 if TYPE_CHECKING:

@@ -1330,55 +1331,35 @@ async def handler(lst):
 
     # Results
     if len(total_asns) > 0:
-        print(f'\n[*] ASNS found: {len(total_asns)}')
-        print('--------------------')
-        total_asns = list(sorted(set(total_asns)))
-        for asn in total_asns:
-            print(asn)
+        print_section(f'\n[*] ASNS found: {len(total_asns)}', total_asns, '--------------------')
+        total_asns = sorted_unique(total_asns)
 
     if len(interesting_urls) > 0:
-        print(f'\n[*] Interesting Urls found: {len(interesting_urls)}')
-        print('--------------------')
-        interesting_urls = list(sorted(set(interesting_urls)))
-        for iurl in interesting_urls:
-            print(iurl)
+        print_section(f'\n[*] Interesting Urls found: {len(interesting_urls)}', interesting_urls, '--------------------')
+        interesting_urls = sorted_unique(interesting_urls)
 
     if len(twitter_people_list_tracker) == 0 and 'twitter' in engines:
         print('\n[*] No Twitter users found.\n\n')
     elif len(twitter_people_list_tracker) >= 1:
-        print('\n[*] Twitter Users found: ' + str(len(twitter_people_list_tracker)))
-        print('---------------------')
-        twitter_people_list_tracker = list(sorted(set(twitter_people_list_tracker)))
-        for usr in twitter_people_list_tracker:
-            print(usr)
-
-    if len(linkedin_people_list_tracker) == 0 and 'linkedin' in engines:
-        print('\n[*] No LinkedIn users found.\n\n')
-    elif len(linkedin_people_list_tracker) >= 1:
-        print('\n[*] LinkedIn Users found: ' + str(len(linkedin_people_list_tracker)))
-        print('---------------------')
-        linkedin_people_list_tracker = list(sorted(set(linkedin_people_list_tracker)))
-        for usr in linkedin_people_list_tracker:
-            print(usr)
-
-    if len(linkedin_links_tracker) == 0 and ('linkedin' in engines or 'rocketreach' in engines):
-        print(f'\n[*] LinkedIn Links found: {len(linkedin_links_tracker)}')
-        linkedin_links_tracker = list(sorted(set(linkedin_links_tracker)))
-        print('---------------------')
-        for link in linkedin_people_list_tracker:
-            print(link)
+        print_section(
+            '\n[*] Twitter Users found: ' + str(len(twitter_people_list_tracker)),
+            twitter_people_list_tracker,
+            '---------------------',
+        )
+        twitter_people_list_tracker = sorted_unique(twitter_people_list_tracker)
+
+    print_linkedin_sections(engines, linkedin_people_list_tracker, linkedin_links_tracker)
+    linkedin_people_list_tracker = sorted_unique(linkedin_people_list_tracker)
+    linkedin_links_tracker = sorted_unique(linkedin_links_tracker)
 
     length_urls = len(all_urls)
     if length_urls == 0:
         if len(engines) >= 1 and 'trello' in engines:
             print('\n[*] No Trello URLs found.')
     else:
         total = length_urls
-        print('\n[*] Trello URLs found: ' + str(total))
-        print('--------------------')
-        all_urls = list(sorted(set(all_urls)))
-        for url in sorted(all_urls):
-            print(url)
+        print_section('\n[*] Trello URLs found: ' + str(total), all_urls, '--------------------')
+        all_urls = sorted_unique(all_urls)
 
     if len(all_ip) == 0:
         print('\n[*] No IPs found.')

@@ -1847,7 +1828,6 @@ async def handler(lst):
     try:
         print('\n[*] Performing BuiltWith scan...')
         builtwith_scanner = builtwith.SearchBuiltWith(word)
-        # Use the process method according to the original structure of this module
         await builtwith_scanner.process(use_proxy)
 
         hosts = await builtwith_scanner.get_hostnames()
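
Besides shortening the code, the second hunk fixes real behaviour: the old branch only ran when linkedin_links_tracker was empty and then looped over the people list, so non-empty link lists were never printed. With print_linkedin_sections the links are reported whenever the linkedin or rocketreach engines ran. A small example of the new behaviour, with illustrative data:

```python
from theHarvester.lib.output import print_linkedin_sections

print_linkedin_sections(
    engines=['rocketreach'],
    people=[],
    links=['https://www.linkedin.com/in/example'],  # illustrative link
)
# Prints (after a leading blank line):
# [*] LinkedIn Links found: 1
# ---------------------
# https://www.linkedin.com/in/example
```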

theHarvester/lib/api/api_example.py

Lines changed: 24 additions & 31 deletions
@@ -7,6 +7,8 @@
 import aiohttp
 import netaddr
 
+from theHarvester.lib.output import print_section, sorted_unique
+
 
 async def fetch_json(session, url):
     try:

@@ -50,55 +52,46 @@ async def main() -> None:
     hosts = fetched_json.get('hosts', [])
 
     if len(total_asns) > 0:
-        print(f'\n[*] ASNS found: {len(total_asns)}')
-        print('--------------------')
-        total_asns = list(sorted(set(total_asns)))
-        for asn in total_asns:
-            print(asn)
+        print_section(f'\n[*] ASNS found: {len(total_asns)}', total_asns, '--------------------')
+        total_asns = sorted_unique(total_asns)
 
     if len(interesting_urls) > 0:
-        print(f'\n[*] Interesting Urls found: {len(interesting_urls)}')
-        print('--------------------')
-        interesting_urls = list(sorted(set(interesting_urls)))
-        for iurl in interesting_urls:
-            print(iurl)
+        print_section(f'\n[*] Interesting Urls found: {len(interesting_urls)}', interesting_urls, '--------------------')
+        interesting_urls = sorted_unique(interesting_urls)
 
     if len(twitter_people_list_tracker) == 0:
         print('\n[*] No Twitter users found.')
     elif len(twitter_people_list_tracker) >= 1:
-        print('\n[*] Twitter Users found: ' + str(len(twitter_people_list_tracker)))
-        print('---------------------')
-        twitter_people_list_tracker = list(sorted(set(twitter_people_list_tracker)))
-        for usr in twitter_people_list_tracker:
-            print(usr)
+        print_section(
+            '\n[*] Twitter Users found: ' + str(len(twitter_people_list_tracker)),
+            twitter_people_list_tracker,
+            '---------------------',
+        )
+        twitter_people_list_tracker = sorted_unique(twitter_people_list_tracker)
 
     if len(linkedin_people_list_tracker) == 0:
         print('\n[*] No LinkedIn users found.')
     elif len(linkedin_people_list_tracker) >= 1:
-        print('\n[*] LinkedIn Users found: ' + str(len(linkedin_people_list_tracker)))
-        print('---------------------')
-        linkedin_people_list_tracker = list(sorted(set(linkedin_people_list_tracker)))
-        for usr in linkedin_people_list_tracker:
-            print(usr)
+        print_section(
+            '\n[*] LinkedIn Users found: ' + str(len(linkedin_people_list_tracker)),
+            linkedin_people_list_tracker,
+            '---------------------',
+        )
+        linkedin_people_list_tracker = sorted_unique(linkedin_people_list_tracker)
 
     if len(linkedin_links_tracker) == 0:
         print('\n[*] No LinkedIn links found.')
     else:
-        print(f'\n[*] LinkedIn Links found: {len(linkedin_links_tracker)}')
-        print('---------------------')
-        linkedin_links_tracker = list(sorted(set(linkedin_links_tracker)))
-        for link in linkedin_links_tracker:
-            print(link)
+        print_section(
+            f'\n[*] LinkedIn Links found: {len(linkedin_links_tracker)}', linkedin_links_tracker, '---------------------'
+        )
+        linkedin_links_tracker = sorted_unique(linkedin_links_tracker)
 
     length_urls = len(trello_urls)
     if length_urls == 0:
         print('\n[*] No Trello URLs found.')
     else:
-        print('\n[*] Trello URLs found: ' + str(length_urls))
-        print('--------------------')
-        all_urls = list(sorted(set(trello_urls)))
-        for url in sorted(all_urls):
-            print(url)
+        print_section('\n[*] Trello URLs found: ' + str(length_urls), trello_urls, '--------------------')
 
     if len(ips) == 0:
         print('\n[*] No IPs found.')

@@ -114,7 +107,7 @@ async def main() -> None:
     else:
         print('\n[*] Emails found: ' + str(len(emails)))
         print('----------------------')
-        all_emails = sorted(list(set(emails)))
+        all_emails = sorted_unique(emails)
         print('\n'.join(all_emails))
 
     if len(hosts) == 0:
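
For the email list the helper is a drop-in replacement: when the items are plain strings, ordering by key=str is identical to the default sort, so sorted_unique(emails) matches the old sorted(list(set(emails))). A small check, with illustrative addresses:

```python
from theHarvester.lib.output import sorted_unique

emails = ['b@example.com', 'a@example.com', 'a@example.com']  # illustrative addresses
assert sorted_unique(emails) == sorted(set(emails))
print(sorted_unique(emails))  # ['a@example.com', 'b@example.com']
```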

theHarvester/lib/output.py

Lines changed: 37 additions & 0 deletions
@@ -0,0 +1,37 @@
+from __future__ import annotations
+
+from collections.abc import Hashable, Iterable, Sequence
+from typing import TypeVar
+
+T = TypeVar('T', bound=Hashable)
+
+
+def sorted_unique[T: Hashable](items: Iterable[T]) -> list[T]:
+    # `T` is only required to be hashable, not orderable.
+    # Sorting by `str` keeps output deterministic without requiring rich comparison support.
+    return list(sorted(set(items), key=str))
+
+
+def print_section(header: str, items: Iterable[str], separator: str) -> None:
+    print(header)
+    print(separator)
+    for item in sorted_unique(items):
+        print(item)
+
+
+def print_linkedin_sections(
+    engines: Sequence[str], people: Sequence[str], links: Sequence[str], separator: str = '---------------------'
+) -> None:
+    if len(people) == 0 and 'linkedin' in engines:
+        print('\n[*] No LinkedIn users found.\n\n')
+    elif len(people) >= 1:
+        print('\n[*] LinkedIn Users found: ' + str(len(people)))
+        print(separator)
+        for usr in sorted_unique(people):
+            print(usr)
+
+    if 'linkedin' in engines or 'rocketreach' in engines:
+        print(f'\n[*] LinkedIn Links found: {len(links)}')
+        print(separator)
+        for link in sorted_unique(links):
+            print(link)
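
One subtlety of sorted_unique is the key=str ordering: items only need to be hashable, and everything is ordered by its string form, so non-string values sort lexicographically. A quick illustration with made-up values:

```python
from theHarvester.lib.output import sorted_unique

print(sorted_unique(['b', 'a', 'b']))  # ['a', 'b']
print(sorted_unique([10, 2, 10]))      # [10, 2] because '10' < '2' as strings
```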

theHarvester/parsers/myparser.py

Lines changed: 1 addition & 1 deletion
@@ -110,7 +110,7 @@ async def set(self):
         return sets
 
     async def urls(self) -> Set[str]:
-        found = re.finditer(r'(http|https)://(www\.)?trello.com/([a-zA-Z\d\-_\.]+/?)*', self.results)
+        found = re.finditer(r'(http|https)://(www\.)?trello.com/([a-zA-Z\d\-_.]+/?)*', self.results)
         urls = {match.group().strip() for match in found}
         return urls
 
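
The regex change only removes a redundant escape: inside a character class a dot is already literal, so [a-zA-Z\d\-_.] and [a-zA-Z\d\-_\.] accept exactly the same characters. A quick check with the updated pattern:

```python
import re

pattern = re.compile(r'(http|https)://(www\.)?trello.com/([a-zA-Z\d\-_.]+/?)*')
text = 'see https://trello.com/b/AbC123/board-name for details'
match = pattern.search(text)
print(match.group() if match else None)  # https://trello.com/b/AbC123/board-name
```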
