Skip to content

Commit bf88de4

Browse files
author
littlelory
committed
按语言分组;提交autohome爬虫代码
1 parent 3bbfe83 commit bf88de4

File tree

14 files changed

+403
-0
lines changed

14 files changed

+403
-0
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
# python compiled file
2+
*.pyc
File renamed without changes.
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
#!/usr/bin/python
2+
# coding=utf-8
3+
4+
5+
# !/usr/bin/python
6+
# -*-coding:utf-8-*-
7+
8+
import random
9+
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware
10+
11+
12+
# user agent middleware
13+
# 在user agent池中,随机选取一个设置到request的Header中
14+
class RotateUserAgentMiddleware(UserAgentMiddleware):
    """User-agent rotation middleware.

    Picks a random entry from ``user_agent_list`` and sets it as the
    ``User-Agent`` header of every outgoing request, so the crawler does
    not present a single fixed user agent.
    """

    def __init__(self, user_agent=''):
        # Keep the scrapy base-class behaviour and remember the fallback UA.
        super(RotateUserAgentMiddleware, self).__init__(user_agent)
        self.user_agent = user_agent

    def process_request(self, request, spider):
        """Attach a randomly chosen User-Agent header to *request*."""
        ua = random.choice(self.user_agent_list)
        if ua:
            # setdefault: do not overwrite a User-Agent that was set explicitly.
            request.headers.setdefault('User-Agent', ua)

    # The default user_agent_list composes Chrome, IE, Firefox, Mozilla,
    # Opera and Netscape strings.
    # For more user agent strings see
    # http://www.useragentstring.com/pages/useragentstring.php
    # BUGFIX: the original list was missing the comma after the first entry,
    # so implicit string concatenation merged the first two strings into one
    # invalid user agent (17 entries instead of 18).
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    ]

python/autohome_spider/autohome_spider/__init__.py

Whitespace-only changes.
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
#!/usr/bin/python
2+
# coding=utf-8
3+
4+
# 车型类别key和车型类别ID的对应关系
5+
# Mapping from vehicle-class key (the short code used by the site /
# spider) to this project's numeric vehicle-class category ID.
levelMap = dict(
    a00=1,
    a0=2,
    a=3,
    b=4,
    c=5,
    d=6,
    suv=7,
    mpv=8,
    s=9,
    p=10,
    mb=11,
    suva0=16,
    suva=17,
    suvb=18,
    suvc=19,
    suvd=20,
)
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
# -*- coding: utf-8 -*-
2+
import scrapy
3+
4+
5+
# Brand (品牌) scraped item.
class BrandItem(scrapy.Item):
    # Brand identifier — presumably the site's own numeric brand id; confirm in spider.
    id = scrapy.Field()
    # Brand display name.
    name = scrapy.Field()
    # URL associated with the brand (listing page, most likely).
    url = scrapy.Field()
    # Brand picture/logo URL.
    pic = scrapy.Field()
11+
12+
13+
# Car series (车系) scraped item.
class SeriesItem(scrapy.Item):
    # Series identifier.
    id = scrapy.Field()
    # Foreign key to the owning BrandItem's id.
    brand_id = scrapy.Field()
    # Manufacturer/make name.
    make_name = scrapy.Field()
    # Series display name.
    name = scrapy.Field()
    # URL associated with the series.
    url = scrapy.Field()
20+
21+
22+
# Car model (车型) scraped item.
class ModelItem(scrapy.Item):
    # Model identifier.
    id = scrapy.Field()
    # Foreign key to the owning SeriesItem's id.
    series_id = scrapy.Field()
    # Model display name.
    name = scrapy.Field()
    # Grouping label for the model — semantics set by the spider; confirm there.
    group = scrapy.Field()
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
# -*- coding: utf-8 -*-
2+
3+
# Define here the models for your spider middleware
4+
#
5+
# See documentation in:
6+
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
7+
8+
from scrapy import signals
9+
10+
11+
# Default Scrapy-generated spider middleware; this class is not actually used
# by the project (it is not enabled in settings).
class AutohomeSpiderSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        # Hooks spider_opened onto the crawler's signal bus.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        # Pass-through: re-yields every result unchanged.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        # Log an informational line when the spider starts.
        spider.logger.info('Spider opened: %s' % spider.name)
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# -*- coding: utf-8 -*-
2+
3+
# Define your item pipelines here
4+
#
5+
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
6+
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7+
import json
8+
import codecs
9+
10+
11+
# Default Scrapy-generated pipeline; effectively a no-op (it is enabled in
# settings but only passes items through).
class AutohomeSpiderPipeline(object):
    def process_item(self, item, spider):
        # Pass-through: return the item unchanged so feed export handles it.
        return item
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# -*- coding: utf-8 -*-
2+
BOT_NAME = 'autohome_spider'

SPIDER_MODULES = ['autohome_spider.spiders']
NEWSPIDER_MODULE = 'autohome_spider.spiders'

ROBOTSTXT_OBEY = False

# Delay (seconds) between requests, to avoid being blocked by the site.
DOWNLOAD_DELAY = 10

# Enabled downloader middlewares: disable scrapy's built-in user-agent
# middleware and replace it with the rotating one.
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'autohome_spider.RotateUserAgentMiddlewares.RotateUserAgentMiddleware': 400,
}

# Enabled item pipelines.
ITEM_PIPELINES = {
    'autohome_spider.pipelines.AutohomeSpiderPipeline': 300,
}

# Output path for the exported data feed (per-spider, timestamped).
FEED_URI = 'data/%(name)s_%(time)s.csv'
# Output format for the feed.
FEED_FORMAT = 'csv'

# Logging level.
LOG_LEVEL = 'INFO'
# Log file path.
LOG_FILE = 'scrapy.log'
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# This package will contain the spiders of your Scrapy project
2+
#
3+
# Please refer to the documentation for information on how to create and manage
4+
# your spiders.

0 commit comments

Comments
 (0)