feat: add followers & location

Encore · Encore · commit 24346ded9ec8 · 2025-02-27T19:22:55.000+08:00
diff --git a/scraper.ts b/scraper.ts
@@ -1,66 +1,36 @@
 import puppeteer, { Page } from "puppeteer";
 
+interface OrgInfo {
+  name: string;
+  description: string;
+  topLanguages: string[];
+  peopleCount: number;
+  followers: number;
+  location: string;
+  website: string;
+  socialLinks: string[];
+  repositories?: RepoData[];
+  totalRepositoriesCount?: number;
+}
+
+interface RepoData {
+  name: string;
+  link: string;
+  about?: string;
+  stars?: string;
+  forks?: string;
+  docsPulls?: string;
+}
+
 async function scrapeGitHubOrg(orgName: string) {
   const browser = await puppeteer.launch({ headless: true });
   const page = await browser.newPage();
 
   try {
-    // Navigate to the organization's page
-    await page.goto(`https://github.com/${orgName}`, {
-      waitUntil: "networkidle2",
-    });
-    let orgInfo: any = {};
-    const cleanText = (el?: Element | null) =>
-      el?.textContent?.replace(/\s+/g, " ").trim();
-
-    // Extract organization information
-    const baseInfo = await page.evaluate(() => {
-      const name =
-        document
-          .querySelector("div.application-main main header h1")
-          ?.textContent?.trim() || "";
-      const description =
-        document
-          .querySelector("div.application-main main header h1 + div div")
-          ?.textContent?.trim() || "";
-      const topLanguages = Array.from(
-        document.querySelectorAll('a[href$="?language="]')
-      ).map((lang) => lang.textContent);
-      const employeesCount =
-        parseInt(
-          document
-            .querySelector('a[href$="/people"]')
-            ?.textContent?.trim()
-            ?.replace(/[^\d]/g, "") || "0"
-        ) || 0;
-      const website =
-        document.querySelector('a[href^="http"]')?.getAttribute("href") || "";
-      const socialLinks = Array.from(
-        document.querySelectorAll('a[href^="http"]')
-      )
-        .map((link) => link.getAttribute("href"))
-        .filter((href) => href && !href.includes("github.com"));
-
-      return {
-        name,
-        description,
-        topLanguages,
-        employeesCount,
-        website,
-        socialLinks,
-      };
-    });
-
-    orgInfo = { ...baseInfo };
-
-    await scrapeAllRepos(orgName, page).then(
-      (repos) => (orgInfo.repositories = repos)
-    );
-
+    const orgInfo = await getOrgInfo(orgName, page);
+    orgInfo.repositories = await scrapeAllRepos(orgName, page);
     orgInfo.totalRepositoriesCount = orgInfo.repositories.length;
 
-    await page.close();
-
     console.log("Organization Info with Repos:", orgInfo);
   } catch (error) {
     console.error("Error scraping GitHub:", error);
@@ -69,85 +39,135 @@ async function scrapeGitHubOrg(orgName: string) {
   }
 }
 
-interface RepoData {
-  name: string;
-  link: string;
-  about?: string;
-  stars?: string;
-  forks?: string;
-  docsPulls?: string;
+async function getOrgInfo(orgName: string, page: Page): Promise<OrgInfo> {
+  await page.goto(`https://github.com/${orgName}`, {
+    waitUntil: "networkidle2",
+  });
+
+  return await page.evaluate(() => {
+    const cleanText = (el?: Element | null) =>
+      el?.textContent?.replace(/\s+/g, " ").trim();
+
+    const name =
+      document
+        .querySelector("div.application-main main header h1")
+        ?.textContent?.trim() || "";
+    const description =
+      document
+        .querySelector("div.application-main main header h1 + div div")
+        ?.textContent?.trim() || "";
+    const topLanguages = Array.from(
+      document.querySelectorAll(
+        "a > span > span[itemprop='programmingLanguage']"
+      )
+    )
+      .map((lang) => lang.textContent)
+      .filter((lang): lang is string => lang !== null);
+    const followers =
+      parseInt(
+        document
+          .querySelector('a[href$="/followers"]')
+          ?.textContent?.trim()
+          ?.replace(/[^\d]/g, "") || "0"
+      ) || 0;
+    const peopleCount =
+      parseInt(
+        document
+          .querySelector('a[href$="/people"]')
+          ?.textContent?.trim()
+          ?.replace(/[^\d]/g, "") || "0"
+      ) || 0;
+    const location =
+      document
+        .querySelector('li span[itemprop="location"]')
+        ?.textContent?.trim() || "";
+    const website =
+      document
+        .querySelector('li a[href^="http"][itemprop="url"]')
+        ?.getAttribute("href") || "";
+    const socialLinks = Array.from(
+      document.querySelectorAll(
+        'a[href^="http"].Link--primary:not([itemprop="url"])'
+      )
+    )
+      .map((link) => link.getAttribute("href"))
+      .filter((href): href is string => href !== null)
+      .filter((href) => href && !href.includes("github.com"));
+
+    return {
+      name,
+      description,
+      topLanguages,
+      followers,
+      peopleCount,
+      website,
+      location,
+      socialLinks,
+    };
+  });
 }
 
-async function scrapeAllRepos(orgName: string, page: Page) {
+async function scrapeAllRepos(
+  orgName: string,
+  page: Page
+): Promise<RepoData[]> {
   const allRepos: RepoData[] = [];
   let pageNumber = 1;
-  const MAX_PAGES = 50; // 安全防护防止无限循环
+  const MAX_PAGES = 50;
+
+  while (pageNumber <= MAX_PAGES) {
+    await page.goto(
+      pageNumber === 1
+        ? `https://github.com/orgs/${orgName}/repositories`
+        : `https://github.com/orgs/${orgName}/repositories?page=${pageNumber}`,
+      {
+        waitUntil: "networkidle2",
+      }
+    );
 
-  try {
-    while (pageNumber <= MAX_PAGES) {
-      // Navigate to the repositories page
-      await page.goto(
-        pageNumber == 1
-          ? `https://github.com/orgs/${orgName}/repositories`
-          : `https://github.com/orgs/${orgName}/repositories?page=${pageNumber}`,
-        {
-          waitUntil: "networkidle2",
-        }
-      );
-
-      console.log(`正在抓取第 ${pageNumber} 页...`);
-
-      // 提取当前页数据
-      const pageRepos = await page.evaluate((): RepoData[] => {
-        const cleanText = (el?: Element | null) =>
-          el?.textContent?.replace(/\s+/g, " ").trim();
-
-        return Array.from(
-          document.querySelectorAll(
-            "ul[data-listview-component='items-list'] li"
-          )
-        ).map((repo) => ({
-          name: cleanText(repo.querySelector("h4 a")) || "",
-          link: repo.querySelector("h4 a")?.getAttribute("href") || "",
-          about: cleanText(repo.querySelector('div[class^="Description-"]')),
-          stars: cleanText(repo.querySelector('a[href$="/stargazers"]')),
-          forks: cleanText(repo.querySelector('a[href$="/forks"]')),
-          docsPulls: cleanText(repo.querySelector('a[href$="/docs/pulls"]')),
-        }));
-      });
-
-      allRepos.push(...pageRepos.filter((r: { name: string }) => r.name));
-
-      // 尝试翻页
-      const nextPage = await page.$(
-        'a[rel="next"]:not([aria-disabled="true"])'
-      );
-      if (!nextPage) break;
-
-      // 带重试机制的点击
-      let retries = 3;
-      while (retries--) {
-        try {
-          await Promise.all([
-            nextPage.click(),
-            page.waitForNavigation({
-              waitUntil: "networkidle2",
-              timeout: 15000,
-            }),
-          ]);
-          pageNumber++;
-          break;
-        } catch (err) {
-          if (retries === 0) throw err;
-          await new Promise((resolve) => setTimeout(resolve, 2000));
-        }
+    console.log(`Scraping page ${pageNumber}...`);
+
+    const pageRepos = await page.evaluate((): RepoData[] => {
+      const cleanText = (el?: Element | null) =>
+        el?.textContent?.replace(/\s+/g, " ").trim();
+
+      return Array.from(
+        document.querySelectorAll("ul[data-listview-component='items-list'] li")
+      ).map((repo) => ({
+        name: cleanText(repo.querySelector("h4 a")) || "",
+        link: repo.querySelector("h4 a")?.getAttribute("href") || "",
+        about: cleanText(repo.querySelector('div[class^="Description-"]')),
+        stars: cleanText(repo.querySelector('a[href$="/stargazers"]')),
+        forks: cleanText(repo.querySelector('a[href$="/forks"]')),
+        docsPulls: cleanText(repo.querySelector('a[href$="/docs/pulls"]')),
+      }));
+    });
+
+    allRepos.push(...pageRepos.filter((r) => r.name));
+
+    const nextPage = await page.$('a[rel="next"]:not([aria-disabled="true"])');
+    if (!nextPage) break;
+
+    let retries = 3;
+    while (retries--) {
+      try {
+        await Promise.all([
+          nextPage.click(),
+          page.waitForNavigation({
+            waitUntil: "networkidle2",
+            timeout: 15000,
+          }),
+        ]);
+        pageNumber++;
+        break;
+      } catch (err) {
+        if (retries === 0) throw err;
+        await new Promise((resolve) => setTimeout(resolve, 2000));
       }
     }
-  } catch (error) {
-    console.error("抓取中断:", error);
   }
 
-  console.log(`共抓取 ${allRepos.length} 个仓库`);
+  console.log(`Total repositories scraped: ${allRepos.length}`);
   return allRepos;
 }