11import puppeteer , { Page } from "puppeteer" ;
22
/**
 * Profile data scraped from a GitHub organization's landing page,
 * optionally enriched with the organization's repository list.
 */
interface OrgInfo {
  /** Trimmed text of the organization header's `h1`; empty string when absent. */
  name: string;
  /** Trimmed text of the block following the header `h1`; empty string when absent. */
  description: string;
  /** Text of every `programmingLanguage` element found on the page. */
  topLanguages: string[];
  /** Number parsed from the link ending in `/people`; 0 when missing or unparsable. */
  peopleCount: number;
  /** Number parsed from the link ending in `/followers`; 0 when missing or unparsable. */
  followers: number;
  /** Trimmed text of the `location` element; empty string when absent. */
  location: string;
  /** `href` of the organization's `itemprop="url"` link; empty string when absent. */
  website: string;
  /** External (non-github.com) `http(s)` links found on the profile. */
  socialLinks: string[];
  /** Repositories of the organization; filled in after the profile has been scraped. */
  repositories?: RepoData[];
  /** Length of `repositories`; set together with it. */
  totalRepositoriesCount?: number;
}
15+
/**
 * One row of an organization's repository listing.
 *
 * All optional fields hold the whitespace-collapsed text of the matching
 * element and are `undefined` when that element is not present in the row.
 */
interface RepoData {
  /** Whitespace-collapsed text of the repository title link. */
  name: string;
  /** Raw `href` attribute of the title link (presumably site-relative — verify before fetching). */
  link: string;
  /** Repository description text. */
  about?: string;
  /** Text of the stargazers link, kept as displayed (not converted to a number). */
  stars?: string;
  /** Text of the forks link, kept as displayed (not converted to a number). */
  forks?: string;
  /** Text of the link whose `href` ends in `/docs/pulls`. */
  docsPulls?: string;
}
24+
/**
 * Scrapes a GitHub organization's profile and its repository list, then
 * logs the combined result to the console.
 *
 * Failures are reported through `console.error` instead of being rethrown,
 * so the returned promise always resolves. The headless browser is closed
 * on every path, including when scraping fails.
 *
 * @param orgName GitHub organization login, as it appears in the URL.
 */
async function scrapeGitHubOrg(orgName: string): Promise<void> {
  const browser = await puppeteer.launch({ headless: true });

  try {
    // Opened inside the try block so a failure here still reaches `finally`.
    const page = await browser.newPage();

    // Both steps drive the same page, so they must run one after the other.
    const orgInfo = await getOrgInfo(orgName, page);
    const repositories = await scrapeAllRepos(orgName, page);

    orgInfo.repositories = repositories;
    orgInfo.totalRepositoriesCount = repositories.length;

    console.log("Organization Info with Repos:", orgInfo);
  } catch (error) {
    console.error("Error scraping GitHub:", error);
  } finally {
    // Closing the browser also disposes of every page it owns.
    await browser.close();
  }
}
7141
/**
 * Loads an organization's GitHub landing page and extracts its profile data.
 *
 * Missing elements never throw: text fields fall back to an empty string,
 * counts to 0 and lists to an empty array.
 *
 * @param orgName GitHub organization login, as it appears in the URL.
 * @param page Puppeteer page used for navigation; it is left on the
 *   organization's landing page when the function returns.
 * @returns The profile data, without `repositories` / `totalRepositoriesCount`.
 */
async function getOrgInfo(orgName: string, page: Page): Promise<OrgInfo> {
  await page.goto(`https://github.com/${orgName}`, {
    waitUntil: "networkidle2",
  });

  // The callback is serialized and executed inside the browser page, so it
  // cannot reference anything from module scope; its helpers live inside it.
  return page.evaluate((): OrgInfo => {
    /** Trimmed text of the first element matching `selector`, or "". */
    const textOf = (selector: string): string =>
      document.querySelector(selector)?.textContent?.trim() ?? "";

    /**
     * First number found in the element's text. Thousands separators are
     * dropped and abbreviated counts are expanded ("1.2k" -> 1200) rather
     * than having their decimal point stripped (which would yield 12).
     */
    const countOf = (selector: string): number => {
      const match = /(\d[\d.,]*)(?:([km])\b)?/i.exec(textOf(selector));
      if (!match) return 0;

      const [, digits = "", suffix = ""] = match;
      const value = Number.parseFloat(digits.replace(/,/g, ""));
      if (Number.isNaN(value)) return 0;

      const unit = suffix.toLowerCase();
      const scale = unit === "k" ? 1000 : unit === "m" ? 1000000 : 1;
      return Math.round(value * scale);
    };

    const topLanguages = Array.from(
      document.querySelectorAll(
        "a > span > span[itemprop='programmingLanguage']"
      )
    )
      .map((lang) => lang.textContent)
      .filter((lang): lang is string => lang !== null);

    // Profile links that point outside GitHub, excluding the website link
    // (which is reported separately through `website`).
    const socialLinks = Array.from(
      document.querySelectorAll(
        'a[href^="http"].Link--primary:not([itemprop="url"])'
      )
    )
      .map((link) => link.getAttribute("href"))
      .filter(
        (href): href is string =>
          href !== null && href !== "" && !href.includes("github.com")
      );

    return {
      name: textOf("div.application-main main header h1"),
      description: textOf("div.application-main main header h1 + div div"),
      topLanguages,
      followers: countOf('a[href$="/followers"]'),
      peopleCount: countOf('a[href$="/people"]'),
      website:
        document
          .querySelector('li a[href^="http"][itemprop="url"]')
          ?.getAttribute("href") ?? "",
      location: textOf('li span[itemprop="location"]'),
      socialLinks,
    };
  });
}
80109
/**
 * Collects every repository listed on an organization's "repositories" tab.
 *
 * Pages are fetched one by one through their `?page=N` URL. The presence of
 * an enabled "next" link is used only to decide whether another page exists;
 * the link is never clicked, so each page is loaded exactly once and no
 * element handle is reused after a navigation.
 *
 * @param orgName GitHub organization login, as it appears in the URL.
 * @param page Puppeteer page used for navigation; it is left on the last
 *   listing page that was visited.
 * @returns All rows that have a non-empty name, in listing order.
 * @throws Whatever `page.goto` throws once a listing page has failed to load
 *   `MAX_ATTEMPTS` times in a row.
 */
async function scrapeAllRepos(
  orgName: string,
  page: Page
): Promise<RepoData[]> {
  const MAX_PAGES = 50; // Safety cap so a broken pager cannot loop forever.
  const MAX_ATTEMPTS = 3;
  const RETRY_DELAY_MS = 2000;

  const allRepos: RepoData[] = [];

  for (let pageNumber = 1; pageNumber <= MAX_PAGES; pageNumber++) {
    const baseUrl = `https://github.com/orgs/${orgName}/repositories`;
    const url = pageNumber === 1 ? baseUrl : `${baseUrl}?page=${pageNumber}`;

    // Retry transient navigation failures with a short pause in between.
    for (let attempt = 1; ; attempt++) {
      try {
        await page.goto(url, { waitUntil: "networkidle2" });
        break;
      } catch (err) {
        if (attempt >= MAX_ATTEMPTS) throw err;
        await new Promise((resolve) => setTimeout(resolve, RETRY_DELAY_MS));
      }
    }

    console.log(`Scraping page ${pageNumber}...`);

    // Runs inside the browser page, so the helper must be defined in the callback.
    const pageRepos = await page.evaluate((): RepoData[] => {
      const cleanText = (el?: Element | null) =>
        el?.textContent?.replace(/\s+/g, " ").trim();

      return Array.from(
        document.querySelectorAll("ul[data-listview-component='items-list'] li")
      ).map((repo) => {
        const titleLink = repo.querySelector("h4 a");
        return {
          name: cleanText(titleLink) || "",
          link: titleLink?.getAttribute("href") || "",
          about: cleanText(repo.querySelector('div[class^="Description-"]')),
          stars: cleanText(repo.querySelector('a[href$="/stargazers"]')),
          forks: cleanText(repo.querySelector('a[href$="/forks"]')),
          docsPulls: cleanText(repo.querySelector('a[href$="/docs/pulls"]')),
        };
      });
    });

    // The selector also matches list items that are not repository rows;
    // those have no title link and therefore an empty name.
    allRepos.push(...pageRepos.filter((r) => r.name));

    // No enabled "next" link means this was the last page.
    const nextPage = await page.$('a[rel="next"]:not([aria-disabled="true"])');
    if (!nextPage) break;
  }

  console.log(`Total repositories scraped: ${allRepos.length}`);
  return allRepos;
}
153173
0 commit comments