
Commit 6527b97

Author: littlelory
Commit message: 豆瓣爬虫 (Douban spider)
1 parent d8d1b4a commit 6527b97

17 files changed

Lines changed: 579 additions & 5 deletions


python/autohome_spider/autohome_spider/items.py

Lines changed: 8 additions & 0 deletions
@@ -26,3 +26,11 @@ class ModelItem(scrapy.Item):
     name = scrapy.Field()
     group = scrapy.Field()
     price = scrapy.Field()
+
+class SpecItem(scrapy.Item):
+    id = scrapy.Field()
+    spec = scrapy.Field()
+
+class DetailItem(scrapy.Item):
+    id = scrapy.Field()
+    detail = scrapy.Field()
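
The two new item classes can be exercised directly in a shell; a minimal sanity-check sketch (synthetic values, not part of the commit):

    from autohome_spider.items import SpecItem, DetailItem

    spec = SpecItem(id='25379', spec={u'厂商指导价': u'10.99万'})
    detail = DetailItem(id=25379, detail={1: u'1.6L'})
    print(dict(spec))    # {'id': '25379', 'spec': {...}}
    print(dict(detail))  # {'id': 25379, 'detail': {1: u'1.6L'}}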

python/autohome_spider/autohome_spider/settings.py

Lines changed: 3 additions & 3 deletions
@@ -23,11 +23,11 @@
 }
 
 # Output path for the scraped data files
-# FEED_URI = 'data/%(name)s_%(time)s.csv'
+FEED_URI = 'data/%(name)s_%(time)s.json'
 # Output format
-# FEED_FORMAT = 'csv'
+FEED_FORMAT = 'json'
 
 # Log level
-# LOG_LEVEL = 'INFO'
+LOG_LEVEL = 'INFO'
 # Log file path
 # LOG_FILE = 'scrapy.log'
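
With feed export switched on like this, every `scrapy crawl <spider>` run writes its items to a timestamped JSON file under data/, which is exactly what the readIds helpers in the new spiders below scan for. A sketch of the placeholder expansion Scrapy performs (time format inferred from the data file names used in the merge script in this commit, e.g. model_2017-07-19T03-25-04.json):

    from datetime import datetime

    FEED_URI = 'data/%(name)s_%(time)s.json'
    uri = FEED_URI % {
        'name': 'model',
        'time': datetime.utcnow().replace(microsecond=0).isoformat().replace(':', '-'),
    }
    print(uri)  # e.g. data/model_2017-07-19T03-25-04.json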
Lines changed: 109 additions & 0 deletions
@@ -0,0 +1,109 @@
+#!/usr/bin/python
+# coding=utf-8
+
+import codecs
+import json
+import os
+import re
+import sys
+
+import chardet
+import scrapy
+from scrapy import log
+
+from autohome_spider.items import DetailItem
+
+reload(sys)
+sys.setdefaultencoding('utf-8')
+
+
+# Spider for per-model configuration data (车型参配数据)
+class Detail(scrapy.Spider):
+    name = 'detail'
+    allowed_domains = ['autohome.com.cn']
+    start_urls = []
+
+    def __init__(self):
+        ids = self.readIds()
+        self.start_urls = ['http://car.autohome.com.cn/config/spec/%s.html' % id for id in ids]
+
+    def parse(self, response):
+        url = response.url
+        log.msg('[url]%s' % url)
+
+        current = int(url.split('/')[-1].split('.')[0])
+        body = response.body
+
+        matcher = re.search(r'var specIDs =\[(.*)\];', body)
+        if not matcher:
+            log.msg('modelId[%s], no data...' % current)
+            return
+        model_ids = matcher.group(1).split(',')
+        if str(current) not in model_ids:
+            log.msg('modelId[%s], no current data...' % current)
+            return
+
+        detail = {}
+
+        # The page embeds two JS objects, `var config = {...}` (parameters)
+        # and `var option = {...}` (equipment); pull the values that belong
+        # to the current spec id out of both.
+        blobs = [
+            (r'var config = (\{.*\});', 'paramtypeitems', 'paramitems'),
+            (r'var option = (\{.*\});', 'configtypeitems', 'configitems'),
+        ]
+        for pattern, type_key, item_key in blobs:
+            matcher = re.search(pattern, body)
+            if not matcher:
+                continue
+            data = matcher.group(1)
+            encoding = chardet.detect(data)['encoding']
+            j = json.loads(data, encoding=encoding)
+            for config_types in j['result'][type_key]:
+                for config_items in config_types[item_key]:
+                    id = config_items['id']
+                    for value in config_items['valueitems']:
+                        if current == value['specid']:
+                            detail[id] = value['value']
+
+        detail_item = DetailItem()
+        detail_item['id'] = current
+        detail_item['detail'] = detail
+        yield detail_item
+
+    def readIds(self):
+        names = filter(lambda x: 'model' in x and 'json' in x,
+                       os.listdir('/home/king/code/python_job/autohome_spider/data'))
+        print names
+        if not names:
+            log.msg('[spec]no model data file in data dir.', log.ERROR)
+            return []
+        model_file_name = names[-1]
+        f = codecs.open('/home/king/code/python_job/autohome_spider/data/%s' % model_file_name, 'r')
+        ids = [line['id'] for line in json.loads(f.read())]
+        log.msg('%d ids' % len(ids), log.INFO)
+        return ids
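
The core of this spider is pulling two JavaScript variables out of the raw page source and parsing them as JSON. A self-contained sketch of that extraction against a synthetic body (real pages embed much larger objects):

    import json
    import re

    body = ('<script>var specIDs =[25379,25380];'
            'var config = {"result": {"paramtypeitems": [{"paramitems": '
            '[{"id": 1, "valueitems": [{"specid": 25379, "value": "1.6L"}]}]}]}};'
            '</script>')

    spec_ids = re.search(r'var specIDs =\[(.*)\];', body).group(1).split(',')
    config = json.loads(re.search(r'var config = (\{.*\});', body).group(1))
    print(spec_ids)  # ['25379', '25380']
    for types in config['result']['paramtypeitems']:
        for item in types['paramitems']:
            print('%s: %s' % (item['id'], item['valueitems'][0]['value']))  # 1: 1.6L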
Lines changed: 65 additions & 0 deletions
@@ -0,0 +1,65 @@
+#!/usr/bin/python
+# coding=utf-8
+
+import codecs
+import json
+import os
+import re
+import sys
+
+import scrapy
+from bs4 import BeautifulSoup
+from scrapy import log
+
+from autohome_spider.items import SpecItem
+
+reload(sys)
+sys.setdefaultencoding('utf-8')
+
+
+# Spider for the spec summary block on model pages (车型参配数据)
+class SpecSpider(scrapy.Spider):
+    name = 'spec'
+    allowed_domains = ['autohome.com.cn']
+    start_urls = []
+
+    def __init__(self):
+        ids = self.readIds()
+        self.start_urls = ['http://www.autohome.com.cn/spec/%s' % id for id in ids]
+
+    def parse(self, response):
+        url = response.url
+        log.msg('[url]%s' % url)
+        soup = BeautifulSoup(response.body, 'lxml').select('.cardetail-infor')[0]
+        text = str(self.gettextonly(soup)).decode('utf-8')
+        # Labels: body size, combined fuel consumption, MSRP, body structure,
+        # warranty, engine, gearbox, drive type, used-car reference price.
+        m = re.findall(ur'(车身尺寸|综合油耗|厂商指导价|车身结构|整车质保|发 动 机|变 速 箱|驱动方式|二手车参考价):\n?(.+)\n',
+                       text, re.M | re.U)
+        result = SpecItem()
+        result['id'] = url.split('/')[-1]
+        result['spec'] = dict(m)
+        yield result
+
+    # Recursively flatten an element tree to plain text, one node per line.
+    def gettextonly(self, soup):
+        v = soup.string
+        if v is None:
+            resulttext = ''
+            for t in soup.contents:
+                resulttext += self.gettextonly(t) + '\n'
+            return resulttext
+        return v.strip()
+
+    def readIds(self):
+        names = filter(lambda x: 'model' in x and 'json' in x,
+                       os.listdir('/Users/king/Work/code/codePool/python/autohome_spider/data'))
+        print names
+        if not names:
+            log.msg('[spec]no model data file in data dir.', log.ERROR)
+            return []
+        model_file_name = names[-1]
+        f = codecs.open('/Users/king/Work/code/codePool/python/autohome_spider/data/%s' % model_file_name, 'r')
+        ids = [line['id'] for line in json.loads(f.read())]
+        log.msg('%d ids' % len(ids), log.INFO)
+        return ids
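
gettextonly flattens the info block to one text node per line so the label/value regex can split pairs on newlines. BeautifulSoup's built-in get_text with a separator does roughly the same job; a sketch on a made-up fragment (the label is 厂商指导价, MSRP):

    # -*- coding: utf-8 -*-
    from bs4 import BeautifulSoup

    html = u'<div class="cardetail-infor"><dt>厂商指导价</dt><dd>10.99万</dd></div>'
    block = BeautifulSoup(html, 'lxml').select('.cardetail-infor')[0]
    print(block.get_text('\n'))  # one node per line, like gettextonly's output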
Lines changed: 66 additions & 0 deletions
@@ -0,0 +1,66 @@
+#!/usr/bin/python
+# coding=utf-8
+
+import codecs
+import csv
+import json
+import sys
+
+reload(sys)
+sys.setdefaultencoding('utf-8')
+
+brandList = json.loads(open('brand_2017-07-19T03-22-42.json').read())
+seriesList = json.loads(open('series_2017-07-19T03-24-17.json').read())
+modelList = json.loads(open('model_2017-07-19T03-25-04.json').read())
+specList = json.loads(open('spec_2017-07-19T10-09-46.json').read())
+
+brandDict = {brand['id']: brand for brand in brandList}
+# Series ids are stored as 's<number>'; strip the prefix so they match
+# the series_id field on models.
+seriesDict = {series['id'].strip('s'): series for series in seriesList}
+specDict = {spec['id']: spec for spec in specList}
+
+# Union of all spec keys, in first-seen order, as a stable column set.
+specKeys = []
+for spec in specList:
+    for key in spec['spec'].keys():
+        if key not in specKeys:
+            specKeys.append(key)
+
+f = codecs.open('merge.csv', 'w+', 'utf-8')
+writer = csv.writer(f)
+
+# Brand ID, brand name, series ID, series name, model ID, model name.
+titles = ['品牌ID', '品牌名称', '车系ID', '车系名称', '车型ID', '车型名称']
+titles.extend(specKeys)
+writer.writerow(titles)
+
+for model in modelList:
+    modelId = model['id']
+    modelName = model['name']
+    seriesId = model['series_id']
+    series = seriesDict[seriesId]
+    seriesName = series['name']
+    brandId = series['brand_id']
+    brandName = brandDict[brandId]['name']
+
+    row = [brandId, brandName, seriesId, seriesName, modelId, modelName]
+
+    # Spec ids were scraped from URLs as strings; model ids are ints.
+    spec = specDict.get(unicode(modelId))
+    if spec:
+        detail = spec['spec']
+        for key in specKeys:
+            row.append(detail.get(key, ''))
+
+    writer.writerow(row)
+
+print 'finish..'
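
One subtlety in the lookup above: spec ids were scraped out of URLs as strings, while model ids come out of the JSON as integers, hence unicode(modelId). A tiny illustration with synthetic data (Python 2, like the rest of the repo):

    specDict = {u'25379': {u'spec': {}}}
    modelId = 25379
    print(specDict.get(unicode(modelId)))  # {u'spec': {}}
    print(specDict.get(modelId))           # None: the int key misses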

python/autohome_spider/test.py

Lines changed: 86 additions & 0 deletions
@@ -0,0 +1,86 @@
+#!/usr/bin/python
+# coding=utf-8
+
+import csv
+import json
+import re
+import sys
+import urllib2
+
+import chardet
+
+reload(sys)
+sys.setdefaultencoding('utf-8')
+
+resp = urllib2.urlopen('http://car.autohome.com.cn/config/spec/25379.html')
+body = resp.read()
+
+spec_ids = []
+
+config = re.search(r'var specIDs =\[(.*)\];', body)
+model_ids = config.group(1).split(',')
+
+result = {int(model_id): {} for model_id in model_ids}
+
+print '==============================='
+
+config = re.search(r'var config = (\{.*\});', body)
+data = config.group(1)
+encoding = chardet.detect(data)['encoding']
+j = json.loads(data, encoding=encoding)
+
+for config_types in j['result']['paramtypeitems']:
+    for config_items in config_types['paramitems']:
+        id = config_items['id']
+        if id not in spec_ids:
+            spec_ids.append(id)
+        print '------ id[%s] ------' % id
+        for value in config_items['valueitems']:
+            result[value['specid']][id] = value['value']
+            print '%s-[%s]' % (value['specid'], value['value'])
+
+print '==============================='
+
+config = re.search(r'var option = (\{.*\});', body)
+data = config.group(1)
+encoding = chardet.detect(data)['encoding']
+j = json.loads(data, encoding=encoding)
+
+for config_types in j['result']['configtypeitems']:
+    for config_items in config_types['configitems']:
+        id = config_items['id']
+        if id not in spec_ids:
+            spec_ids.append(id)
+        print '------ id[%s] ------' % id
+        for value in config_items['valueitems']:
+            result[value['specid']][id] = value['value']
+            print '%s-[%s]' % (value['specid'], value['value'])
+
+for model_id, details in result.items():
+    print '%s\t%s' % (model_id, details)
+
+# f = csv.writer(open('detail.csv', 'w+'))
+# title = []
+# title.append('model_id')
+# title.append(model_ids)
+#
+# row = []
+# for model_id, details in result.items():
+#     row.append(model_id)
+#     for spec_id in spec_ids:
+#         value = ""
+#         if spec_id in details:
+#             value = details[spec_id]
+#         row.append(value)
+#
+# f.writerow(row)
+
+print 'finish...'

python/douban_spider/douban/douban/__init__.py

Whitespace-only changes.
Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
+# -*- coding: utf-8 -*-
+
+# Define here the models for your scraped items
+#
+# See documentation in:
+# http://doc.scrapy.org/en/latest/topics/items.html
+
+import scrapy
+
+
+class DoubanItem(scrapy.Item):
+    # define the fields for your item here like:
+    # name = scrapy.Field()
+    pass
+
+
+class BookName(scrapy.Item):
+    book_id = scrapy.Field()
+    book_name = scrapy.Field()
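
The commit only adds the item definitions; a hypothetical spider callback that fills BookName might look like the sketch below (the start URL, CSS selector, and id parsing are illustrative guesses, not code from this commit):

    import scrapy
    from douban.items import BookName

    class BookSpider(scrapy.Spider):
        name = 'book'
        start_urls = ['https://book.douban.com/subject/26829016/']  # example URL

        def parse(self, response):
            item = BookName()
            item['book_id'] = response.url.rstrip('/').split('/')[-1]
            item['book_name'] = response.css('h1 span::text').extract_first()
            yield item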
