Skip to content

Commit 24346de

Browse files
author
Encore
committed
feat: add followers & location
1 parent 1272671 commit 24346de

File tree

1 file changed

+144
-124
lines changed

1 file changed

+144
-124
lines changed

scraper.ts

Lines changed: 144 additions & 124 deletions
Original file line numberDiff line numberDiff line change
@@ -1,66 +1,36 @@
11
import puppeteer, { Page } from "puppeteer";
22

3+
// Organization-level metadata scraped from a GitHub org profile page
// (https://github.com/<org>). Count fields are parsed from link text.
interface OrgInfo {
  name: string; // org display name, taken from the profile header <h1>
  description: string; // text of the element following the header <h1>
  topLanguages: string[]; // languages listed via itemprop="programmingLanguage"
  peopleCount: number; // integer parsed from the "/people" link text
  followers: number; // integer parsed from the "/followers" link text
  location: string; // itemprop="location" text, "" when absent
  website: string; // href of the itemprop="url" link, "" when absent
  socialLinks: string[]; // external http(s) links, github.com links excluded
  repositories?: RepoData[]; // populated later by scrapeAllRepos
  totalRepositoriesCount?: number; // length of `repositories` once populated
}
15+
16+
// One repository entry scraped from the org's /repositories list view.
// Numeric-looking fields are kept as display text, not parsed to numbers.
interface RepoData {
  name: string; // repo name from the list item's h4 anchor
  link: string; // raw href of that anchor, as returned by getAttribute("href")
  about?: string; // description text, undefined when the element is missing
  stars?: string; // stargazers link text (display form, e.g. may be abbreviated)
  forks?: string; // forks link text
  docsPulls?: string; // text of a link ending in "/docs/pulls", if present
}
24+
325
async function scrapeGitHubOrg(orgName: string) {
426
const browser = await puppeteer.launch({ headless: true });
527
const page = await browser.newPage();
628

729
try {
8-
// Navigate to the organization's page
9-
await page.goto(`https://github.com/${orgName}`, {
10-
waitUntil: "networkidle2",
11-
});
12-
let orgInfo: any = {};
13-
const cleanText = (el?: Element | null) =>
14-
el?.textContent?.replace(/\s+/g, " ").trim();
15-
16-
// Extract organization information
17-
const baseInfo = await page.evaluate(() => {
18-
const name =
19-
document
20-
.querySelector("div.application-main main header h1")
21-
?.textContent?.trim() || "";
22-
const description =
23-
document
24-
.querySelector("div.application-main main header h1 + div div")
25-
?.textContent?.trim() || "";
26-
const topLanguages = Array.from(
27-
document.querySelectorAll('a[href$="?language="]')
28-
).map((lang) => lang.textContent);
29-
const employeesCount =
30-
parseInt(
31-
document
32-
.querySelector('a[href$="/people"]')
33-
?.textContent?.trim()
34-
?.replace(/[^\d]/g, "") || "0"
35-
) || 0;
36-
const website =
37-
document.querySelector('a[href^="http"]')?.getAttribute("href") || "";
38-
const socialLinks = Array.from(
39-
document.querySelectorAll('a[href^="http"]')
40-
)
41-
.map((link) => link.getAttribute("href"))
42-
.filter((href) => href && !href.includes("github.com"));
43-
44-
return {
45-
name,
46-
description,
47-
topLanguages,
48-
employeesCount,
49-
website,
50-
socialLinks,
51-
};
52-
});
53-
54-
orgInfo = { ...baseInfo };
55-
56-
await scrapeAllRepos(orgName, page).then(
57-
(repos) => (orgInfo.repositories = repos)
58-
);
59-
30+
const orgInfo = await getOrgInfo(orgName, page);
31+
orgInfo.repositories = await scrapeAllRepos(orgName, page);
6032
orgInfo.totalRepositoriesCount = orgInfo.repositories.length;
6133

62-
await page.close();
63-
6434
console.log("Organization Info with Repos:", orgInfo);
6535
} catch (error) {
6636
console.error("Error scraping GitHub:", error);
@@ -69,85 +39,135 @@ async function scrapeGitHubOrg(orgName: string) {
6939
}
7040
}
7141

72-
interface RepoData {
73-
name: string;
74-
link: string;
75-
about?: string;
76-
stars?: string;
77-
forks?: string;
78-
docsPulls?: string;
42+
async function getOrgInfo(orgName: string, page: Page): Promise<OrgInfo> {
43+
await page.goto(`https://github.com/${orgName}`, {
44+
waitUntil: "networkidle2",
45+
});
46+
47+
return await page.evaluate(() => {
48+
const cleanText = (el?: Element | null) =>
49+
el?.textContent?.replace(/\s+/g, " ").trim();
50+
51+
const name =
52+
document
53+
.querySelector("div.application-main main header h1")
54+
?.textContent?.trim() || "";
55+
const description =
56+
document
57+
.querySelector("div.application-main main header h1 + div div")
58+
?.textContent?.trim() || "";
59+
const topLanguages = Array.from(
60+
document.querySelectorAll(
61+
"a > span > span[itemprop='programmingLanguage']"
62+
)
63+
)
64+
.map((lang) => lang.textContent)
65+
.filter((lang): lang is string => lang !== null);
66+
const followers =
67+
parseInt(
68+
document
69+
.querySelector('a[href$="/followers"]')
70+
?.textContent?.trim()
71+
?.replace(/[^\d]/g, "") || "0"
72+
) || 0;
73+
const peopleCount =
74+
parseInt(
75+
document
76+
.querySelector('a[href$="/people"]')
77+
?.textContent?.trim()
78+
?.replace(/[^\d]/g, "") || "0"
79+
) || 0;
80+
const location =
81+
document
82+
.querySelector('li span[itemprop="location"]')
83+
?.textContent?.trim() || "";
84+
const website =
85+
document
86+
.querySelector('li a[href^="http"][itemprop="url"]')
87+
?.getAttribute("href") || "";
88+
const socialLinks = Array.from(
89+
document.querySelectorAll(
90+
'a[href^="http"].Link--primary:not([itemprop="url"])'
91+
)
92+
)
93+
.map((link) => link.getAttribute("href"))
94+
.filter((href): href is string => href !== null)
95+
.filter((href) => href && !href.includes("github.com"));
96+
97+
return {
98+
name,
99+
description,
100+
topLanguages,
101+
followers,
102+
peopleCount,
103+
website,
104+
location,
105+
socialLinks,
106+
};
107+
});
79108
}
80109

81-
async function scrapeAllRepos(orgName: string, page: Page) {
110+
async function scrapeAllRepos(
111+
orgName: string,
112+
page: Page
113+
): Promise<RepoData[]> {
82114
const allRepos: RepoData[] = [];
83115
let pageNumber = 1;
84-
const MAX_PAGES = 50; // 安全防护防止无限循环
116+
const MAX_PAGES = 50;
117+
118+
while (pageNumber <= MAX_PAGES) {
119+
await page.goto(
120+
pageNumber === 1
121+
? `https://github.com/orgs/${orgName}/repositories`
122+
: `https://github.com/orgs/${orgName}/repositories?page=${pageNumber}`,
123+
{
124+
waitUntil: "networkidle2",
125+
}
126+
);
85127

86-
try {
87-
while (pageNumber <= MAX_PAGES) {
88-
// Navigate to the repositories page
89-
await page.goto(
90-
pageNumber == 1
91-
? `https://github.com/orgs/${orgName}/repositories`
92-
: `https://github.com/orgs/${orgName}/repositories?page=${pageNumber}`,
93-
{
94-
waitUntil: "networkidle2",
95-
}
96-
);
97-
98-
console.log(`正在抓取第 ${pageNumber} 页...`);
99-
100-
// 提取当前页数据
101-
const pageRepos = await page.evaluate((): RepoData[] => {
102-
const cleanText = (el?: Element | null) =>
103-
el?.textContent?.replace(/\s+/g, " ").trim();
104-
105-
return Array.from(
106-
document.querySelectorAll(
107-
"ul[data-listview-component='items-list'] li"
108-
)
109-
).map((repo) => ({
110-
name: cleanText(repo.querySelector("h4 a")) || "",
111-
link: repo.querySelector("h4 a")?.getAttribute("href") || "",
112-
about: cleanText(repo.querySelector('div[class^="Description-"]')),
113-
stars: cleanText(repo.querySelector('a[href$="/stargazers"]')),
114-
forks: cleanText(repo.querySelector('a[href$="/forks"]')),
115-
docsPulls: cleanText(repo.querySelector('a[href$="/docs/pulls"]')),
116-
}));
117-
});
118-
119-
allRepos.push(...pageRepos.filter((r: { name: string }) => r.name));
120-
121-
// 尝试翻页
122-
const nextPage = await page.$(
123-
'a[rel="next"]:not([aria-disabled="true"])'
124-
);
125-
if (!nextPage) break;
126-
127-
// 带重试机制的点击
128-
let retries = 3;
129-
while (retries--) {
130-
try {
131-
await Promise.all([
132-
nextPage.click(),
133-
page.waitForNavigation({
134-
waitUntil: "networkidle2",
135-
timeout: 15000,
136-
}),
137-
]);
138-
pageNumber++;
139-
break;
140-
} catch (err) {
141-
if (retries === 0) throw err;
142-
await new Promise((resolve) => setTimeout(resolve, 2000));
143-
}
128+
console.log(`Scraping page ${pageNumber}...`);
129+
130+
const pageRepos = await page.evaluate((): RepoData[] => {
131+
const cleanText = (el?: Element | null) =>
132+
el?.textContent?.replace(/\s+/g, " ").trim();
133+
134+
return Array.from(
135+
document.querySelectorAll("ul[data-listview-component='items-list'] li")
136+
).map((repo) => ({
137+
name: cleanText(repo.querySelector("h4 a")) || "",
138+
link: repo.querySelector("h4 a")?.getAttribute("href") || "",
139+
about: cleanText(repo.querySelector('div[class^="Description-"]')),
140+
stars: cleanText(repo.querySelector('a[href$="/stargazers"]')),
141+
forks: cleanText(repo.querySelector('a[href$="/forks"]')),
142+
docsPulls: cleanText(repo.querySelector('a[href$="/docs/pulls"]')),
143+
}));
144+
});
145+
146+
allRepos.push(...pageRepos.filter((r) => r.name));
147+
148+
const nextPage = await page.$('a[rel="next"]:not([aria-disabled="true"])');
149+
if (!nextPage) break;
150+
151+
let retries = 3;
152+
while (retries--) {
153+
try {
154+
await Promise.all([
155+
nextPage.click(),
156+
page.waitForNavigation({
157+
waitUntil: "networkidle2",
158+
timeout: 15000,
159+
}),
160+
]);
161+
pageNumber++;
162+
break;
163+
} catch (err) {
164+
if (retries === 0) throw err;
165+
await new Promise((resolve) => setTimeout(resolve, 2000));
144166
}
145167
}
146-
} catch (error) {
147-
console.error("抓取中断:", error);
148168
}
149169

150-
console.log(`共抓取 ${allRepos.length} 个仓库`);
170+
console.log(`Total repositories scraped: ${allRepos.length}`);
151171
return allRepos;
152172
}
153173

0 commit comments

Comments
 (0)