Skip to content

Commit 09c28db

Browse files
author
littlelory
committed
添加指导价的爬取
1 parent 51f8a5e commit 09c28db

File tree

4 files changed, with 28 additions and 19 deletions.

python/autohome_spider/autohome_spider/items.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,3 +25,4 @@ class ModelItem(scrapy.Item):
2525
series_id = scrapy.Field()
2626
name = scrapy.Field()
2727
group = scrapy.Field()
28+
price = scrapy.Field()

python/autohome_spider/autohome_spider/RotateUserAgentMiddlewares.py renamed to python/autohome_spider/autohome_spider/random_user_agent_middlewares.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,9 @@
1111

1212
# user agent middleware
1313
# 在user agent池中,随机选取一个设置到request的Header中
14-
class RotateUserAgentMiddleware(UserAgentMiddleware):
14+
class RandomUserAgentMiddleware(UserAgentMiddleware):
1515
def __init__(self, user_agent=''):
16-
super(RotateUserAgentMiddleware, self).__init__(user_agent)
16+
super(RandomUserAgentMiddleware, self).__init__(user_agent)
1717
self.user_agent = user_agent
1818

1919
def process_request(self, request, spider):

python/autohome_spider/autohome_spider/settings.py

Lines changed: 9 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,14 @@
77
ROBOTSTXT_OBEY = False
88

99
# 请求时间间隔,防止被屏蔽
10-
DOWNLOAD_DELAY = 10
10+
# DOWNLOAD_DELAY = 10
11+
12+
# 自动限流
13+
AUTOTHROTTLE_ENABLED = True
1114

1215
# 开启的middleware
1316
DOWNLOADER_MIDDLEWARES = {
14-
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
15-
'autohome_spider.RotateUserAgentMiddlewares.RotateUserAgentMiddleware': 400,
17+
'autohome_spider.random_user_agent_middlewares.RandomUserAgentMiddleware': 400,
1618
}
1719

1820
# 开启的pipeline
@@ -21,16 +23,11 @@
2123
}
2224

2325
# 数据集输出路径
24-
FEED_URI = 'data/%(name)s_%(time)s.csv'
26+
# FEED_URI = 'data/%(name)s_%(time)s.csv'
2527
# 数据集输出格式
26-
FEED_FORMAT = 'csv'
28+
# FEED_FORMAT = 'csv'
2729

2830
# 日志级别
29-
LOG_LEVEL = 'INFO'
31+
# LOG_LEVEL = 'INFO'
3032
# 日志文件路径
31-
LOG_FILE = 'scrapy.log'
32-
33-
# 开启重试
34-
RETRY_ENABLED = True
35-
# 重试次数
36-
RETRY_TIMES = 3
33+
# LOG_FILE = 'scrapy.log'

python/autohome_spider/autohome_spider/spiders/model_spider.py

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -67,14 +67,18 @@ def parse_model_selling(self, response):
6767

6868
models = panel[i * 2 + 1]
6969
for model in models.xpath('li'):
70-
model_name = model.xpath('div/div/p/a/text()').extract()
71-
model_id = model.xpath('div/div/p/@data-gcjid').extract()
70+
model_name = model.xpath('div[@class="interval01-list-cars"]/div/p/a/text()')[0].extract()
71+
model_id = model.xpath('div[@class="interval01-list-cars"]/div/p/@data-gcjid')[0].extract()
72+
price = model.xpath('div[@class="interval01-list-guidance"]/div/text()')[0].re(r'(\d+\.\d+)')
73+
if not price:
74+
price = model.xpath('div[@class="interval01-list-guidance"]/div/text()')[1].re(r'(\d+\.\d+)')
7275

7376
model = ModelItem()
7477
model['id'] = model_id
7578
model['name'] = model_name
7679
model['series_id'] = series_id
7780
model['group'] = group
81+
model['price'] = price
7882
yield model
7983
count += 1
8084

@@ -103,15 +107,20 @@ def parse_model_selling(self, response):
103107
else:
104108
log.msg('[parse_selling] is not selling.')
105109
count = 0
106-
model_tags = response.xpath('//td[@class="name_d"]/a')
110+
model_tags = response.xpath('//table/tboby/tr')
111+
if not model_tags or len(model_tags) == 0:
112+
model_tags = response.xpath('//table/tr')
113+
107114
for model_tag in model_tags:
108-
model_id = model_tag.xpath('@href')[0].re(r'spec/(\d+)/')[0]
109-
model_name = model_tag.xpath('@title')[0].extract()
115+
model_id = model_tag.xpath('td[@class="name_d"]/a/@href')[0].re(r'spec/(\d+)/')[0]
116+
model_name = model_tag.xpath('td[@class="name_d"]/a/@title')[0].extract()
117+
price = model_tag.xpath('td[@class="price_d"]/text()').re(ur'(\d+\.\d+)')
110118

111119
model = ModelItem()
112120
model['id'] = model_id
113121
model['name'] = model_name
114122
model['series_id'] = series_id
123+
model['price'] = price
115124
yield model
116125
count += 1
117126
log.msg('[parse_selling] model count is %d' % count)
@@ -127,12 +136,14 @@ def parse_model_selled(self, response):
127136
model_id = model['Id']
128137
model_name = model['Name']
129138
group = model['GroupName']
139+
price = model['Price']
130140

131141
model = ModelItem()
132142
model['id'] = model_id
133143
model['name'] = model_name
134144
model['series_id'] = series_id
135145
model['group'] = group
146+
model['price'] = price
136147
yield model
137148
count += 1
138149
log.msg('[parse_selled] model count is %d' % count)

0 commit comments

Comments
 (0)