
Commit 6527b97

Author: littlelory
Commit message: 豆瓣爬虫 (Douban spider)
1 parent d8d1b4a commit 6527b97

17 files changed

Lines changed: 579 additions & 5 deletions


python/autohome_spider/autohome_spider/items.py

Lines changed: 8 additions & 0 deletions
@@ -26,3 +26,11 @@ class ModelItem(scrapy.Item):
     name = scrapy.Field()
     group = scrapy.Field()
     price = scrapy.Field()
+
+class SpecItem(scrapy.Item):
+    id = scrapy.Field()
+    spec = scrapy.Field()
+
+class DetailItem(scrapy.Item):
+    id = scrapy.Field()
+    detail = scrapy.Field()
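
The two new item classes can be exercised directly in a shell; a minimal sanity-check sketch (synthetic values, not part of the commit):

    from autohome_spider.items import SpecItem, DetailItem

    spec = SpecItem(id='25379', spec={u'厂商指导价': u'10.99万'})
    detail = DetailItem(id=25379, detail={1: u'1.6L'})
    print(dict(spec))    # {'id': '25379', 'spec': {...}}
    print(dict(detail))  # {'id': 25379, 'detail': {1: u'1.6L'}}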

python/autohome_spider/autohome_spider/settings.py

Lines changed: 3 additions & 3 deletions
@@ -23,11 +23,11 @@
 }
 
 # Output path for the scraped data files
-# FEED_URI = 'data/%(name)s_%(time)s.csv'
+FEED_URI = 'data/%(name)s_%(time)s.json'
 # Output format
-# FEED_FORMAT = 'csv'
+FEED_FORMAT = 'json'
 
 # Log level
-# LOG_LEVEL = 'INFO'
+LOG_LEVEL = 'INFO'
 # Log file path
 # LOG_FILE = 'scrapy.log'
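
With feed export switched on like this, every `scrapy crawl <spider>` run writes its items to a timestamped JSON file under data/, which is exactly what the readIds helpers in the new spiders below scan for. A sketch of the placeholder expansion Scrapy performs (time format inferred from the data file names used in the merge script in this commit, e.g. model_2017-07-19T03-25-04.json):

    from datetime import datetime

    FEED_URI = 'data/%(name)s_%(time)s.json'
    uri = FEED_URI % {
        'name': 'model',
        'time': datetime.utcnow().replace(microsecond=0).isoformat().replace(':', '-'),
    }
    print(uri)  # e.g. data/model_2017-07-19T03-25-04.json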
Lines changed: 109 additions & 0 deletions
@@ -0,0 +1,109 @@
+#!/usr/bin/python
+# coding=utf-8
+
+import codecs
+import json
+import os
+import re
+import sys
+
+import chardet
+import scrapy
+from scrapy import log
+
+from autohome_spider.items import DetailItem
+
+reload(sys)
+sys.setdefaultencoding('utf-8')
+
+
+# Spider for per-model configuration data (车型参配数据)
+class Detail(scrapy.Spider):
+    name = 'detail'
+    allowed_domains = ['autohome.com.cn']
+    start_urls = []
+
+    def __init__(self):
+        ids = self.readIds()
+        self.start_urls = ['http://car.autohome.com.cn/config/spec/%s.html' % id for id in ids]
+
+    def parse(self, response):
+        url = response.url
+        log.msg('[url]%s' % url)
+
+        current = int(url.split('/')[-1].split('.')[0])
+        body = response.body
+
+        matcher = re.search(r'var specIDs =\[(.*)\];', body)
+        if not matcher:
+            log.msg('modelId[%s], no data...' % current)
+            return
+        model_ids = matcher.group(1).split(',')
+        if str(current) not in model_ids:
+            log.msg('modelId[%s], no current data...' % current)
+            return
+
+        detail = {}
+
+        # The page embeds two JS objects, `var config = {...}` (parameters)
+        # and `var option = {...}` (equipment); pull the values that belong
+        # to the current spec id out of both.
+        blobs = [
+            (r'var config = (\{.*\});', 'paramtypeitems', 'paramitems'),
+            (r'var option = (\{.*\});', 'configtypeitems', 'configitems'),
+        ]
+        for pattern, type_key, item_key in blobs:
+            matcher = re.search(pattern, body)
+            if not matcher:
+                continue
+            data = matcher.group(1)
+            encoding = chardet.detect(data)['encoding']
+            j = json.loads(data, encoding=encoding)
+            for config_types in j['result'][type_key]:
+                for config_items in config_types[item_key]:
+                    id = config_items['id']
+                    for value in config_items['valueitems']:
+                        if current == value['specid']:
+                            detail[id] = value['value']
+
+        detail_item = DetailItem()
+        detail_item['id'] = current
+        detail_item['detail'] = detail
+        yield detail_item
+
+    def readIds(self):
+        names = filter(lambda x: 'model' in x and 'json' in x,
+                       os.listdir('/home/king/code/python_job/autohome_spider/data'))
+        print names
+        if not names:
+            log.msg('[spec]no model data file in data dir.', log.ERROR)
+            return []
+        model_file_name = names[-1]
+        f = codecs.open('/home/king/code/python_job/autohome_spider/data/%s' % model_file_name, 'r')
+        ids = [line['id'] for line in json.loads(f.read())]
+        log.msg('%d ids' % len(ids), log.INFO)
+        return ids
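
The core of this spider is pulling two JavaScript variables out of the raw page source and parsing them as JSON. A self-contained sketch of that extraction against a synthetic body (real pages embed much larger objects):

    import json
    import re

    body = ('<script>var specIDs =[25379,25380];'
            'var config = {"result": {"paramtypeitems": [{"paramitems": '
            '[{"id": 1, "valueitems": [{"specid": 25379, "value": "1.6L"}]}]}]}};'
            '</script>')

    spec_ids = re.search(r'var specIDs =\[(.*)\];', body).group(1).split(',')
    config = json.loads(re.search(r'var config = (\{.*\});', body).group(1))
    print(spec_ids)  # ['25379', '25380']
    for types in config['result']['paramtypeitems']:
        for item in types['paramitems']:
            print('%s: %s' % (item['id'], item['valueitems'][0]['value']))  # 1: 1.6L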
Lines changed: 65 additions & 0 deletions
@@ -0,0 +1,65 @@
+#!/usr/bin/python
+# coding=utf-8
+
+import codecs
+import json
+import os
+import re
+import sys
+
+import scrapy
+from bs4 import BeautifulSoup
+from scrapy import log
+
+from autohome_spider.items import SpecItem
+
+reload(sys)
+sys.setdefaultencoding('utf-8')
+
+
+# Spider for the spec summary block on model pages (车型参配数据)
+class SpecSpider(scrapy.Spider):
+    name = 'spec'
+    allowed_domains = ['autohome.com.cn']
+    start_urls = []
+
+    def __init__(self):
+        ids = self.readIds()
+        self.start_urls = ['http://www.autohome.com.cn/spec/%s' % id for id in ids]
+
+    def parse(self, response):
+        url = response.url
+        log.msg('[url]%s' % url)
+        soup = BeautifulSoup(response.body, 'lxml').select('.cardetail-infor')[0]
+        text = str(self.gettextonly(soup)).decode('utf-8')
+        # Labels: body size, combined fuel consumption, MSRP, body structure,
+        # warranty, engine, gearbox, drive type, used-car reference price.
+        m = re.findall(ur'(车身尺寸|综合油耗|厂商指导价|车身结构|整车质保|发 动 机|变 速 箱|驱动方式|二手车参考价):\n?(.+)\n',
+                       text, re.M | re.U)
+        result = SpecItem()
+        result['id'] = url.split('/')[-1]
+        result['spec'] = dict(m)
+        yield result
+
+    # Recursively flatten an element tree to plain text, one node per line.
+    def gettextonly(self, soup):
+        v = soup.string
+        if v is None:
+            resulttext = ''
+            for t in soup.contents:
+                resulttext += self.gettextonly(t) + '\n'
+            return resulttext
+        return v.strip()
+
+    def readIds(self):
+        names = filter(lambda x: 'model' in x and 'json' in x,
+                       os.listdir('/Users/king/Work/code/codePool/python/autohome_spider/data'))
+        print names
+        if not names:
+            log.msg('[spec]no model data file in data dir.', log.ERROR)
+            return []
+        model_file_name = names[-1]
+        f = codecs.open('/Users/king/Work/code/codePool/python/autohome_spider/data/%s' % model_file_name, 'r')
+        ids = [line['id'] for line in json.loads(f.read())]
+        log.msg('%d ids' % len(ids), log.INFO)
+        return ids
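
gettextonly flattens the info block to one text node per line so the label/value regex can split pairs on newlines. BeautifulSoup's built-in get_text with a separator does roughly the same job; a sketch on a made-up fragment (the label is 厂商指导价, MSRP):

    # -*- coding: utf-8 -*-
    from bs4 import BeautifulSoup

    html = u'<div class="cardetail-infor"><dt>厂商指导价</dt><dd>10.99万</dd></div>'
    block = BeautifulSoup(html, 'lxml').select('.cardetail-infor')[0]
    print(block.get_text('\n'))  # one node per line, like gettextonly's output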
Lines changed: 66 additions & 0 deletions
@@ -0,0 +1,66 @@
+#!/usr/bin/python
+# coding=utf-8
+
+import codecs
+import csv
+import json
+import sys
+
+reload(sys)
+sys.setdefaultencoding('utf-8')
+
+brandList = json.loads(open('brand_2017-07-19T03-22-42.json').read())
+seriesList = json.loads(open('series_2017-07-19T03-24-17.json').read())
+modelList = json.loads(open('model_2017-07-19T03-25-04.json').read())
+specList = json.loads(open('spec_2017-07-19T10-09-46.json').read())
+
+brandDict = {brand['id']: brand for brand in brandList}
+# Series ids are stored as 's<number>'; strip the prefix so they match
+# the series_id field on models.
+seriesDict = {series['id'].strip('s'): series for series in seriesList}
+specDict = {spec['id']: spec for spec in specList}
+
+# Union of all spec keys, in first-seen order, as a stable column set.
+specKeys = []
+for spec in specList:
+    for key in spec['spec'].keys():
+        if key not in specKeys:
+            specKeys.append(key)
+
+f = codecs.open('merge.csv', 'w+', 'utf-8')
+writer = csv.writer(f)
+
+# Brand ID, brand name, series ID, series name, model ID, model name.
+titles = ['品牌ID', '品牌名称', '车系ID', '车系名称', '车型ID', '车型名称']
+titles.extend(specKeys)
+writer.writerow(titles)
+
+for model in modelList:
+    modelId = model['id']
+    modelName = model['name']
+    seriesId = model['series_id']
+    series = seriesDict[seriesId]
+    seriesName = series['name']
+    brandId = series['brand_id']
+    brandName = brandDict[brandId]['name']
+
+    row = [brandId, brandName, seriesId, seriesName, modelId, modelName]
+
+    # Spec ids were scraped from URLs as strings; model ids are ints.
+    spec = specDict.get(unicode(modelId))
+    if spec:
+        detail = spec['spec']
+        for key in specKeys:
+            row.append(detail.get(key, ''))
+
+    writer.writerow(row)
+
+print 'finish..'
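
One subtlety in the lookup above: spec ids were scraped out of URLs as strings, while model ids come out of the JSON as integers, hence unicode(modelId). A tiny illustration with synthetic data (Python 2, like the rest of the repo):

    specDict = {u'25379': {u'spec': {}}}
    modelId = 25379
    print(specDict.get(unicode(modelId)))  # {u'spec': {}}
    print(specDict.get(modelId))           # None: the int key misses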

python/autohome_spider/test.py

Lines changed: 86 additions & 0 deletions
@@ -0,0 +1,86 @@
+#!/usr/bin/python
+# coding=utf-8
+
+import csv
+import json
+import re
+import sys
+import urllib2
+
+import chardet
+
+reload(sys)
+sys.setdefaultencoding('utf-8')
+
+resp = urllib2.urlopen('http://car.autohome.com.cn/config/spec/25379.html')
+body = resp.read()
+
+spec_ids = []
+
+config = re.search(r'var specIDs =\[(.*)\];', body)
+model_ids = config.group(1).split(',')
+
+result = {int(model_id): {} for model_id in model_ids}
+
+print '==============================='
+
+config = re.search(r'var config = (\{.*\});', body)
+data = config.group(1)
+encoding = chardet.detect(data)['encoding']
+j = json.loads(data, encoding=encoding)
+
+for config_types in j['result']['paramtypeitems']:
+    for config_items in config_types['paramitems']:
+        id = config_items['id']
+        if id not in spec_ids:
+            spec_ids.append(id)
+        print '------ id[%s] ------' % id
+        for value in config_items['valueitems']:
+            result[value['specid']][id] = value['value']
+            print '%s-[%s]' % (value['specid'], value['value'])
+
+print '==============================='
+
+config = re.search(r'var option = (\{.*\});', body)
+data = config.group(1)
+encoding = chardet.detect(data)['encoding']
+j = json.loads(data, encoding=encoding)
+
+for config_types in j['result']['configtypeitems']:
+    for config_items in config_types['configitems']:
+        id = config_items['id']
+        if id not in spec_ids:
+            spec_ids.append(id)
+        print '------ id[%s] ------' % id
+        for value in config_items['valueitems']:
+            result[value['specid']][id] = value['value']
+            print '%s-[%s]' % (value['specid'], value['value'])
+
+for model_id, details in result.items():
+    print '%s\t%s' % (model_id, details)
+
+# f = csv.writer(open('detail.csv', 'w+'))
+# title = []
+# title.append('model_id')
+# title.append(model_ids)
+#
+# row = []
+# for model_id, details in result.items():
+#     row.append(model_id)
+#     for spec_id in spec_ids:
+#         value = ""
+#         if spec_id in details:
+#             value = details[spec_id]
+#         row.append(value)
+#
+# f.writerow(row)
+
+print 'finish...'

python/douban_spider/douban/douban/__init__.py

Whitespace-only changes.
Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
+# -*- coding: utf-8 -*-
+
+# Define here the models for your scraped items
+#
+# See documentation in:
+# http://doc.scrapy.org/en/latest/topics/items.html
+
+import scrapy
+
+
+class DoubanItem(scrapy.Item):
+    # define the fields for your item here like:
+    # name = scrapy.Field()
+    pass
+
+
+class BookName(scrapy.Item):
+    book_id = scrapy.Field()
+    book_name = scrapy.Field()
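
The commit only adds the item definitions; a hypothetical spider callback that fills BookName might look like the sketch below (the start URL, CSS selector, and id parsing are illustrative guesses, not code from this commit):

    import scrapy
    from douban.items import BookName

    class BookSpider(scrapy.Spider):
        name = 'book'
        start_urls = ['https://book.douban.com/subject/26829016/']  # example URL

        def parse(self, response):
            item = BookName()
            item['book_id'] = response.url.rstrip('/').split('/')[-1]
            item['book_name'] = response.css('h1 span::text').extract_first()
            yield item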
