|
| 1 | +# #!/usr/bin/python |
| 2 | +# # coding=utf-8 |
| 3 | +# |
| 4 | +import scrapy |
| 5 | +import codecs |
| 6 | +import json |
| 7 | +from bs4 import BeautifulSoup |
| 8 | +import re |
| 9 | +import sys |
| 10 | +import os |
| 11 | +import chardet |
| 12 | +from autohome_spider.items import DetailItem |
| 13 | +from scrapy import log |
| 14 | + |
# Python 2 only: force the interpreter-wide default string encoding to UTF-8 so
# that implicit str<->unicode conversions of non-ASCII text do not raise
# UnicodeDecodeError.  `sys.setdefaultencoding` is removed from the `sys`
# namespace at interpreter start-up, hence the `reload(sys)` to get it back.
# Python 3 has neither the `reload` builtin nor `setdefaultencoding` (its
# default is already UTF-8), so the hack is skipped there instead of crashing
# at import time with a NameError.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821 - builtin on Python 2
    sys.setdefaultencoding('utf-8')
| 17 | + |
| 18 | + |
# Spider for vehicle model parameter / configuration data.
class Detail(scrapy.Spider):
    """Scrape the configuration page of every known spec id on autohome.com.cn.

    The spec ids are read from a model JSON file in ``data_dir`` (see
    ``readIds``); one start URL is generated per id.  For each page, ``parse``
    pulls the ``config`` and ``option`` JavaScript objects embedded in the
    HTML and yields a single ``DetailItem`` whose ``detail`` field maps
    parameter id -> value for that spec.
    """

    name = 'detail'
    # Scrapy expects a list of domains here; a bare string would be iterated
    # character by character by the offsite filter.
    allowed_domains = ['autohome.com.cn']
    start_urls = []

    # Directory containing the model JSON files that list the ids to crawl.
    # Being a plain attribute it can be overridden by a subclass or by a
    # spider argument (``-a data_dir=...``).
    data_dir = '/home/king/code/python_job/autohome_spider/data'

    def __init__(self, *args, **kwargs):
        """Initialise the base spider, then build ``start_urls`` from the ids.

        ``*args`` / ``**kwargs`` are forwarded to ``scrapy.Spider`` so the
        spider can be created the way Scrapy creates it (including spider
        arguments); calling ``Detail()`` with no arguments still works.
        """
        super(Detail, self).__init__(*args, **kwargs)
        ids = self.readIds()
        self.start_urls = [
            'http://car.autohome.com.cn/config/spec/%s.html' % spec_id
            for spec_id in ids
        ]

    def parse(self, response):
        """Yield one ``DetailItem`` for the spec shown on ``response``.

        The spec id is taken from the last path segment of the URL
        (``.../<id>.html``).  Nothing is yielded when the page has no
        ``specIDs`` list or when the list does not contain the current id.
        """
        url = response.url
        log.msg('[url]%s' % url)

        current = int(url.split('/')[-1].split('.')[0])
        body = response.body

        matcher = re.search(r'var specIDs =\[(.*)\];', body)
        if not matcher:
            log.msg('modelId[%s], no data...' % current)
            return
        # Strip each entry so optional whitespace after the commas does not
        # defeat the membership test.
        model_ids = [spec.strip() for spec in matcher.group(1).split(',')]
        if str(current) not in model_ids:
            log.msg('modelId[%s], no current data...' % current)
            return

        detail = {}
        # Basic parameters first, then optional equipment; on an id clash the
        # later (option) value wins, as it did in the original sequential loops.
        detail.update(self._extract_values(
            body, 'config', 'paramtypeitems', 'paramitems', current))
        detail.update(self._extract_values(
            body, 'option', 'configtypeitems', 'configitems', current))

        detail_item = DetailItem()
        detail_item['id'] = current
        detail_item['detail'] = detail
        yield detail_item

    def _extract_values(self, body, var_name, types_key, items_key, spec_id):
        """Return ``{item id: value}`` for ``spec_id`` from one embedded JS object.

        Looks for ``var <var_name> = {...};`` in ``body``, parses the object as
        JSON and walks ``result[types_key][*][items_key][*]['valueitems']``,
        keeping the values whose ``specid`` equals ``spec_id``.  Returns an
        empty dict (and logs) when the variable is not present in the page.
        """
        matcher = re.search(r'var %s = (\{.*\});' % var_name, body)
        if not matcher:
            log.msg('modelId[%s], no %s data...' % (spec_id, var_name))
            return {}

        data = matcher.group(1)
        # The page encoding is not assumed; let chardet guess it from the
        # bytes of the JSON blob itself.
        encoding = chardet.detect(data)['encoding']
        parsed = json.loads(data, encoding=encoding)

        values = {}
        for type_group in parsed['result'][types_key]:
            for item in type_group[items_key]:
                for value in item['valueitems']:
                    if spec_id == value['specid']:
                        values[item['id']] = value['value']
        return values

    def readIds(self):
        """Return the list of ids found in the last model JSON file of ``data_dir``.

        Candidate files are those whose name contains both ``model`` and
        ``json``.  Names are sorted so the choice of "last" file is
        deterministic (``os.listdir`` returns entries in arbitrary order).
        NOTE(review): presumably the names embed a sortable timestamp so the
        last one is the newest dump - confirm against the exporter.

        Returns an empty list (after logging an error) when no such file
        exists, so callers can always iterate the result.
        """
        names = sorted(
            file_name for file_name in os.listdir(self.data_dir)
            if 'model' in file_name and 'json' in file_name
        )
        log.msg('[spec]model data files: %s' % names, log.DEBUG)
        if not names:
            log.msg('[spec]no model data file in data dir.', log.ERROR)
            return []

        model_file_name = names[-1]
        # Context manager guarantees the handle is closed even if the JSON is
        # malformed.
        with codecs.open(os.path.join(self.data_dir, model_file_name), 'r') as f:
            ids = [line['id'] for line in json.loads(f.read())]
        log.msg(len(ids), log.INFO)
        return ids
0 commit comments