@@ -0,0 +1,466 @@
+import scrapy
+import time
+from content_spider.mysqlHelper import MysqlHelper
+from content_spider.items import BookInfoItem, ChapterItem
+from content_spider.pipelines import formatcontent, removePunctuation
+
+
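+# baseSpider crawls a source site end to end: book list -> book info ->
+# chapter list -> chapter content. Subclasses implement the get_*_url and
+# *_result hooks declared at the bottom of the class.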
+class baseSpider(scrapy.Spider):
+    name = ''
+    source = ''
+    source_name = ''
+    source_id = 0
+    custom_settings = {
+        'DOWNLOAD_DELAY': 0.01,
+        'SOURCE': source,
+    }
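+    # NOTE: 'SOURCE' is evaluated once, when this class body runs, so a
+    # subclass that only overrides `source` still inherits SOURCE='' here;
+    # override custom_settings in the subclass as well if the setting matters.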
+
+    def __init__(self, host, user, password, db, stats, settings):
+        scrapy.Spider.__init__(self)
+        self.mysqlHelper = MysqlHelper(host=host, user=user, password=password, db=db,
+                                       source=self.source, source_id=self.source_id)
+        self.__stats = stats
+        self.__stats.set_value('spider_type', 'add')
+        self.__stats.set_value('bid_list', [])
+        self.__is_first = True
+
+    @classmethod
+    def from_crawler(cls, crawler, *args, **kwargs):
+        settings = crawler.settings
+        host = settings.get('MYSQL_HOST')
+        user = settings.get('MYSQL_USER')
+        password = settings.get('MYSQL_PWD')
+        db = settings.get('MYSQL_DB')
+        return cls(host=host, user=user, password=password, db=db, stats=crawler.stats, settings=settings)
+
+    def start_requests(self):
+        yield scrapy.Request(self.get_start_url(), callback=self.parse_book_list)
+
+    def parse_book_list(self, response):
+        result = self.bid_list_result(response)
+        for item in result:
+            bid = item['id']
+            # Skip books already imported from this source.
+            existing = self.mysqlHelper.get_book_info_by_source(bid)
+            if existing is not None:
+                continue
+            url = self.get_book_info_url(bid)
+            yield scrapy.Request(url, callback=self.parse_book_info)
+
+    def parse_book_info(self, response):
+        if response.text == '':
+            return None
+        now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+        result = self.book_info_result(response)
+        if result is None:
+            return None
+
+        category_id = 0 if result.get('category_id') is None else result.get('category_id')
+
+        book_info_item = BookInfoItem()
+        source_bid = result.get('bid')
+        book_info_item['cp_bid'] = source_bid
+        book_info_item['cp_name'] = self.source_name
+        book_info_item['cp_id'] = self.source_id
+        book_info_item['name'] = result['name']
+        book_info_item['author'] = result['author']
+        book_info_item['intro'] = result['intro']
+        book_info_item['cover'] = result['cover']
+        book_info_item['keyword'] = result['keyword']
+        book_info_item['category_id'] = category_id
+        book_info_item['status'] = result['status']
+        book_info_item['size'] = 0
+        book_info_item['category_name'] = result['category']
+        book_info_item['last_chapter'] = ''
+        book_info_item['chapter_count'] = 0
+        book_info_item['first_cid'] = 0
+        book_info_item['last_cid'] = 0
+        book_info_item['channel'] = result['channel']
+        book_info_item['updated_at'] = now
+        book_info_item['created_at'] = now
+        bid = self.mysqlHelper.insert_book(book_info_item)
+        # Track the first new bid and collect every inserted bid in the stats.
+        if self.__is_first:
+            self.__stats.set_value('bid_start', bid)
+            self.__is_first = False
+        self.__stats.get_value('bid_list').append(bid)
+        url = self.get_chapter_list_url(source_bid)
+        meta = {'bid': bid, 'source_bid': source_bid}
+        yield scrapy.Request(url, callback=self.parse_chapter_list, meta=meta)
+
+    def parse_chapter_list(self, response):
+        if response.text == '':
+            return None
+        result = self.chapter_list_result(response)
+        bid = response.meta['bid']
+        source_bid = response.meta['source_bid']
+        for chapter_item in result:
+            cid = chapter_item['source_chapter_id']
+            # Reuse the parsed chapter dict as request meta for the content callback.
+            meta = chapter_item
+            meta['bid'] = bid
+            url = self.get_chapter_content_url(source_bid, cid)
+            yield scrapy.Request(url, callback=self.parse_chapter_content, meta=meta)
+
+    def parse_chapter_content(self, response):
+        if response.text == '':
+            return None
+        now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+        result = self.chapter_content_result(response)
+        meta = response.meta
+        chapter_item = ChapterItem()
+        chapter_item['bid'] = meta['bid']
+        chapter_item['name'] = meta['name']
+        chapter_item['sequence'] = meta['sequence']
+        chapter_item['size'] = meta['size']
+        chapter_item['is_vip'] = meta['is_vip']
+        chapter_item['source_chapter_id'] = meta['source_chapter_id']
+        chapter_item['chapter_content_id'] = 0
+        chapter_item['content'] = formatcontent(result['content'])
+
+        # Values parsed from the content page win over the chapter-list values.
+        if result.get('size') is not None:
+            chapter_item['size'] = result.get('size')
+        if result.get('is_vip') is not None:
+            chapter_item['is_vip'] = result.get('is_vip')
+        if result.get('name') is not None:
+            chapter_item['name'] = result.get('name')
+        if result.get('recent_update_at') is not None:
+            chapter_item['recent_update_at'] = result.get('recent_update_at')
+        if result.get('source_chapter_id') is not None:
+            chapter_item['source_chapter_id'] = result.get('source_chapter_id')
+        chapter_item['prev_cid'] = 0
+        chapter_item['next_cid'] = 0
+        chapter_item['updated_at'] = now
+        chapter_item['created_at'] = now
+        yield chapter_item
+
+    def get_start_url(self):
+        raise NotImplementedError
+
+    def bid_list_result(self, response):
+        raise NotImplementedError
+
+    def get_book_info_url(self, bid):
+        raise NotImplementedError
+
+    def book_info_result(self, response):
+        raise NotImplementedError
+
+    def get_chapter_list_url(self, bid):
+        raise NotImplementedError
+
+    def chapter_list_result(self, response):
+        raise NotImplementedError
+
+    def get_chapter_content_url(self, bid, cid):
+        raise NotImplementedError
+
+    def chapter_content_result(self, response):
+        raise NotImplementedError
+
+
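+# baseUpdateSpider re-crawls books already stored locally and fetches only the
+# chapters published after the last one in the database.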
+class baseUpdateSpider(scrapy.Spider):
+    name = ''
+    source = ''
+    source_id = 0
+    custom_settings = {
+        'DOWNLOAD_DELAY': 0.01,
+        'SOURCE': source,
+    }
+
+    def __init__(self, host, user, password, db, stats, settings):
+        scrapy.Spider.__init__(self)
+        self.mysqlHelper = MysqlHelper(host=host, user=user, password=password, db=db,
+                                       source=self.source, source_id=self.source_id)
+        self.__stats = stats
+        self.__stats.set_value('spider_type', 'update')
+        self.__stats.set_value('bid_list', [])
+        self.__is_first = True
+
+    @classmethod
+    def from_crawler(cls, crawler, *args, **kwargs):
+        settings = crawler.settings
+        host = settings.get('MYSQL_HOST')
+        user = settings.get('MYSQL_USER')
+        password = settings.get('MYSQL_PWD')
+        db = settings.get('MYSQL_DB')
+        return cls(host=host, user=user, password=password, db=db, stats=crawler.stats, settings=settings)
+
+    def start_requests(self):
+        book_list = self.mysqlHelper.get_need_update_book_list()
+        if book_list is not None:
+            for book in book_list:
+                url = self.get_chapter_list_url(book['cp_bid'])
+                meta = {'bid': book['id'], 'cp_bid': book['cp_bid']}
+                yield scrapy.Request(url, callback=self.parse_chapter_list, meta=meta)
+
+    def parse_chapter_list(self, response):
+        if response.text == '':
+            return None
+        chapter_list = self.chapter_list_result(response)
+        meta = response.meta
+        if chapter_list is not None:
+            bid = response.meta.get('bid')
+            cp_bid = response.meta.get('cp_bid')
+            last_chapter = self.mysqlHelper.get_last_cid_by_bid(bid)
+            start = False
+            if last_chapter is None:
+                # No chapters stored yet: crawl the list from the beginning.
+                start = True
+                last_source_cid = ''
+                last_sequence = 0
+                last_chapter_id = 0
+            else:
+                last_source_cid = str(last_chapter['source_chapter_id'])
+                last_sequence = last_chapter['sequence']
+                last_chapter_id = last_chapter['id']
+
+            has_new_chapter = False
+            for chapter_item in chapter_list:
+                # Skip ahead until the last chapter already in the database,
+                # matching by source chapter id when we have one, else by sequence.
+                if not start:
+                    if len(last_source_cid) > 0:
+                        if str(chapter_item['source_chapter_id']) == str(last_source_cid):
+                            start = True
+                    else:
+                        if int(chapter_item['sequence']) == last_sequence:
+                            start = True
+                    continue
+                if not has_new_chapter:
+                    self.__stats.get_value('bid_list').append(
+                        {"bid": meta['bid'], 'start': last_chapter_id, 'start_sequence': last_sequence})
+                    has_new_chapter = True
+                cid = chapter_item['source_chapter_id']
+                last_sequence = last_sequence + 1
+                meta = chapter_item
+                meta['bid'] = bid
+                url = self.get_chapter_content_url(cp_bid, cid)
+                yield scrapy.Request(url, callback=self.parse_chapter_content, meta=meta)
+
+    def parse_chapter_content(self, response):
+        if response.text == '':
+            return None
+        now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+        result = self.chapter_content_result(response)
+        meta = response.meta
+        chapter_item = ChapterItem()
+        chapter_item['bid'] = meta['bid']
+        chapter_item['name'] = meta['name']
+        chapter_item['sequence'] = meta['sequence']
+        chapter_item['size'] = meta['size']
+        chapter_item['is_vip'] = meta['is_vip']
+        chapter_item['source_chapter_id'] = meta['source_chapter_id']
+        chapter_item['chapter_content_id'] = 0
+        chapter_item['content'] = formatcontent(result['content'])
+
+        # Values parsed from the content page win over the chapter-list values.
+        if result.get('size') is not None:
+            chapter_item['size'] = result.get('size')
+        if result.get('is_vip') is not None:
+            chapter_item['is_vip'] = result.get('is_vip')
+        if result.get('name') is not None:
+            chapter_item['name'] = result.get('name')
+        if result.get('recent_update_at') is not None:
+            chapter_item['recent_update_at'] = result.get('recent_update_at')
+        if result.get('source_chapter_id') is not None:
+            chapter_item['source_chapter_id'] = result.get('source_chapter_id')
+        chapter_item['prev_cid'] = 0
+        chapter_item['next_cid'] = 0
+        chapter_item['updated_at'] = now
+        chapter_item['created_at'] = now
+        yield chapter_item
+
+    def get_chapter_list_url(self, bid):
+        raise NotImplementedError
+
+    def chapter_list_result(self, response):
+        raise NotImplementedError
+
+    def get_chapter_content_url(self, bid, cid):
+        raise NotImplementedError
+
+    def chapter_content_result(self, response):
+        raise NotImplementedError
+
+
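+# fixChapterSpider re-downloads every chapter of the books given via the `bid`
+# spider argument (a comma-separated id list, e.g. -a bid=1,2,3), updating
+# chapters that already exist and inserting the ones that are missing.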
+class fixChapterSpider(scrapy.Spider):
+    name = ''
+    source = ''
+    source_id = 0
+    custom_settings = {
+        'DOWNLOAD_DELAY': 0.01,
+        'SOURCE': source,
+    }
+
+    def __init__(self, host, user, password, db, bid_list, stats, settings):
+        scrapy.Spider.__init__(self)
+        self.mysqlHelper = MysqlHelper(host=host, user=user, password=password, db=db,
+                                       source=self.source, source_id=self.source_id)
+        self.__stats = stats
+        self.__stats.set_value('spider_type', 'fix')
+        self.__stats.set_value('bid_list', [])
+        self.__is_first = True
+        self.bid_list = bid_list
+
+    @classmethod
+    def from_crawler(cls, crawler, *args, **kwargs):
+        settings = crawler.settings
+        host = settings.get('MYSQL_HOST')
+        user = settings.get('MYSQL_USER')
+        password = settings.get('MYSQL_PWD')
+        db = settings.get('MYSQL_DB')
+        bid = kwargs.get('bid')
+        if bid is not None:
+            bid_list = bid.split(',')
+        else:
+            bid_list = []
+        return cls(host=host, user=user, password=password, db=db,
+                   bid_list=bid_list, stats=crawler.stats, settings=settings)
+
+    def start_requests(self):
+        if self.bid_list is None:
+            yield
+            return
+        for book in self.bid_list:
+            info = self.mysqlHelper.get_book_info_by_id(book)
+            if info is None:
+                continue
+            # Renumber the stored chapters before re-crawling them.
+            self.mysqlHelper.re_sequence(book)
+            url = self.get_chapter_list_url(info['cp_bid'])
+            meta = {'bid': book, 'cp_bid': info['cp_bid']}
+            yield scrapy.Request(url, callback=self.parse_chapter_list, meta=meta)
+
+    def parse_chapter_list(self, response):
+        if response.text == '':
+            return None
+        chapter_list = self.chapter_list_result(response)
+        if chapter_list is not None:
+            bid = response.meta.get('bid')
+            cp_bid = response.meta.get('cp_bid')
+            last_sequence = 0
+            for chapter_item in chapter_list:
+                last_sequence = last_sequence + 1
+                chapter_info = self.mysqlHelper.get_cid_by_bid_sequence(bid, chapter_item['sequence'])
+                cid = chapter_item['source_chapter_id']
+                meta = chapter_item
+                if chapter_info is not None:
+                    # The chapter already exists locally: update it in place.
+                    meta['type'] = 'update'
+                    meta['chapter_content_id'] = chapter_info['chapter_content_id']
+                    meta['cid'] = chapter_info['id']
+                meta['bid'] = bid
+                url = self.get_chapter_content_url(cp_bid, cid)
+                yield scrapy.Request(url, callback=self.parse_chapter_content, meta=meta)
+            self.__stats.get_value('bid_list').append({'bid': bid, 'end': last_sequence})
+
+    def parse_chapter_content(self, response):
+        if response.text == '':
+            return None
+        now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+        result = self.chapter_content_result(response)
+        meta = response.meta
+        data = {}
+        data['bid'] = meta['bid']
+        data['name'] = meta['name']
+        data['size'] = meta['size']
+        data['is_vip'] = meta['is_vip']
+        data['sequence'] = meta['sequence']
+        data['source_chapter_id'] = meta['source_chapter_id']
+        data['recent_update_at'] = meta['recent_update_at']
+        data['content'] = formatcontent(result['content'])
+
+        # Values parsed from the content page win over the chapter-list values.
+        if result.get('size') is not None:
+            data['size'] = result.get('size')
+        if result.get('is_vip') is not None:
+            data['is_vip'] = result.get('is_vip')
+        if result.get('name') is not None:
+            data['name'] = result.get('name')
+        if result.get('recent_update_at') is not None:
+            data['recent_update_at'] = result.get('recent_update_at')
+        if result.get('source_chapter_id') is not None:
+            data['source_chapter_id'] = result.get('source_chapter_id')
+        if meta.get('type') is not None:
+            # Existing chapter: rewrite the stored content and metadata directly.
+            content = formatcontent(result['content'])
+            data['content'] = content
+            data['size'] = len(removePunctuation(content))
+            self.mysqlHelper.update_content(meta['chapter_content_id'], meta['name'], content)
+            if meta.get('cid') is not None:
+                data['cid'] = meta['cid']
+                self.mysqlHelper.update_chapter(data)
+        else:
+            # New chapter: hand it to the item pipeline as usual.
+            data['prev_cid'] = 0
+            data['next_cid'] = 0
+            data['updated_at'] = now
+            data['created_at'] = now
+            chapter_item = ChapterItem(data)
+            yield chapter_item
+
+    def get_chapter_list_url(self, bid):
+        raise NotImplementedError
+
+    def chapter_list_result(self, response):
+        raise NotImplementedError
+
+    def get_chapter_content_url(self, bid, cid):
+        raise NotImplementedError
+
+    def chapter_content_result(self, response):
+        raise NotImplementedError
+
+
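+# fixBookInfoSpider refreshes the stored metadata (name, author, cover, ...) of
+# the books given via the `bid` spider argument; chapter data is left untouched.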
+class fixBookInfoSpider(scrapy.Spider):
+    name = ''
+    source = ''
+    source_name = ''
+    source_id = 0
+
+    def __init__(self, host, user, password, db, bid_list, stats):
+        scrapy.Spider.__init__(self)
+        self.mysqlHelper = MysqlHelper(host=host, user=user, password=password, db=db,
+                                       source=self.source, source_id=self.source_id)
+        self.bid_list = bid_list
+
+    @classmethod
+    def from_crawler(cls, crawler, *args, **kwargs):
+        settings = crawler.settings
+        host = settings.get('MYSQL_HOST')
+        user = settings.get('MYSQL_USER')
+        password = settings.get('MYSQL_PWD')
+        db = settings.get('MYSQL_DB')
+        bid = kwargs.get('bid')
+        if bid is not None:
+            bid_list = bid.split(',')
+        else:
+            bid_list = []
+        return cls(host=host, user=user, password=password, db=db, bid_list=bid_list, stats=crawler.stats)
+
+    def start_requests(self):
+        if self.bid_list is None:
+            yield
+            return
+        for book in self.bid_list:
+            info = self.mysqlHelper.get_book_info_by_id(book)
+            if info is None:
+                continue
+            url = self.get_book_info_url(info['cp_bid'])
+            meta = {'bid': book, 'cp_bid': info['cp_bid']}
+            yield scrapy.Request(url, callback=self.parse_book_info, meta=meta)
+
+    def parse_book_info(self, response):
+        if response.text == '':
+            return None
+        result = self.book_info_result(response)
+        if result is None:
+            yield
+            return
+        result['bid'] = response.meta['bid']
+        # Write the refreshed metadata straight to the database; nothing is
+        # yielded to the item pipeline.
+        self.mysqlHelper.update_book_info(result)
+
+    def get_book_info_url(self, bid):
+        raise NotImplementedError
+
+    def book_info_result(self, response):
+        raise NotImplementedError