zhaoyang committed ab0342a55b (2 years ago)

+ 8 - 0
.gitignore

@@ -0,0 +1,8 @@
+.idea/
+content_spider/log
+content_spider/*.pyc
+content_spider/__pycache__
+content_spider/spiders/__pycache__
+content_spider/spiders/*/__pycache__
+content_spider/spiders/*/*.pyc
+.vscode/

+ 11 - 0
README.md

@@ -0,0 +1,11 @@
+# Crawl scripts
+**cd /home/www/wangdu_spider**
+
+## 7lou crawl
+* Full crawl, with dedup: scrapy crawl 7lou
+* Partial crawl, no dedup: scrapy crawl zbone -a bid=xx,xx,xxx
+
+## 趣阅 crawl
+* Full crawl, with dedup: scrapy crawl shuangduxs
+* Partial crawl, no dedup: scrapy crawl sdone -a bid=xxx,xxx,xxx
+
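The spiders added in this commit (kanshu, kanshufix, kanshuupdate under content_spider/spiders/kanshu/) are presumably run the same way. Below is a minimal sketch of launching them programmatically instead of via the CLI, assuming it runs from the project root so the Scrapy project settings resolve; the bid values are placeholders:

```python
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('kanshu')                   # full crawl, analogous to `scrapy crawl kanshu`
process.crawl('kanshufix', bid='1,2,3')   # targeted fix, analogous to `scrapy crawl kanshufix -a bid=1,2,3`
process.start()                           # blocks until the crawls finish
```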

+ 114 - 0
content_spider/Util.py

@@ -0,0 +1,114 @@
+# -*- coding: utf-8 -*-
+
+import logging
+import hashlib
+import random
+import time
+
+base_category = [
+    {"id": 1, "category_name": "玄幻仙侠", "channel_id": 1, "pid": 0},
+    {"id": 2, "category_name": "热血校园", "channel_id": 1, "pid": 0},
+    {"id": 3, "category_name": "都市暧昧", "channel_id": 1, "pid": 0},
+    {"id": 4, "category_name": "军事历史", "channel_id": 1, "pid": 0},
+    {"id": 7, "category_name": "游戏竞技", "channel_id": 1, "pid": 0},
+    {"id": 8, "category_name": "悬疑灵异", "channel_id": 1, "pid": 0},
+    {"id": 12, "category_name": "灵异鬼怪", "channel_id": 1, "pid": 8},
+    {"id": 14, "category_name": "历史穿越", "channel_id": 1, "pid": 4},
+    {"id": 19, "category_name": "游戏竞技", "channel_id": 1, "pid": 7},
+    {"id": 21, "category_name": "武侠仙侠", "channel_id": 1, "pid": 1},
+    {"id": 22, "category_name": "西方玄幻", "channel_id": 1, "pid": 1},
+    {"id": 23, "category_name": "玄幻奇幻", "channel_id": 1, "pid": 1},
+    {"id": 30, "category_name": "校园黑道", "channel_id": 1, "pid": 2},
+    {"id": 49, "category_name": "抗战烽火", "channel_id": 1, "pid": 4},
+    {"id": 51, "category_name": "特种军旅", "channel_id": 1, "pid": 4},
+    {"id": 54, "category_name": "都市爱情", "channel_id": 1, "pid": 3},
+    {"id": 55, "category_name": "官场沉浮", "channel_id": 1, "pid": 3},
+    {"id": 57, "category_name": "乡土风情", "channel_id": 1, "pid": 3},
+    {"id": 67, "category_name": "校园励志", "channel_id": 1, "pid": 2},
+    {"id": 68, "category_name": "现代修真", "channel_id": 1, "pid": 3},
+    {"id": 81, "category_name": "灵异恐怖", "channel_id": 1, "pid": 8},
+    {"id": 82, "category_name": "古代言情", "channel_id": 2, "pid": 0},
+    {"id": 83, "category_name": "穿越重生", "channel_id": 2, "pid": 82},
+    {"id": 84, "category_name": "经商种田", "channel_id": 2, "pid": 82},
+    {"id": 87, "category_name": "现代言情", "channel_id": 2, "pid": 0},
+    {"id": 88, "category_name": "豪门总裁", "channel_id": 2, "pid": 87},
+    {"id": 92, "category_name": "女生灵异", "channel_id": 2, "pid": 0},
+    {"id": 93, "category_name": "幻想言情", "channel_id": 2, "pid": 0},
+    {"id": 94, "category_name": "青春爱情", "channel_id": 1, "pid": 2},
+    {"id": 95, "category_name": "女生灵异", "channel_id": 2, "pid": 92},
+    {"id": 96, "category_name": "东方玄幻", "channel_id": 2, "pid": 93},
+    {"id": 97, "category_name": "古典仙侠", "channel_id": 2, "pid": 93},
+    {"id": 98, "category_name": "婚恋情感", "channel_id": 2, "pid": 87},
+    {"id": 99, "category_name": "民国爱情", "channel_id": 2, "pid": 87},
+    {"id": 100, "category_name": "其他", "channel_id": 2, "pid": 0},
+    {"id": 101, "category_name": "浪漫青春", "channel_id": 2, "pid": 0},
+    {"id": 102, "category_name": "耽美同人", "channel_id": 2, "pid": 0},
+    {"id": 103, "category_name": "青春纯爱", "channel_id": 2, "pid": 101},
+    {"id": 104, "category_name": "青春校园", "channel_id": 2, "pid": 101},
+    {"id": 105, "category_name": "蜕变成长", "channel_id": 2, "pid": 101},
+    {"id": 106, "category_name": "耽美同人", "channel_id": 2, "pid": 102},
+    {"id": 107, "category_name": "其他", "channel_id": 2, "pid": 100},
+    {"id": 108, "category_name": "异世大陆", "channel_id": 2, "pid": 93},
+    {"id": 109, "category_name": "远古神话", "channel_id": 2, "pid": 93},
+    {"id": 110, "category_name": "上古蛮荒", "channel_id": 2, "pid": 93},
+    {"id": 111, "category_name": "侦探推理", "channel_id": 2, "pid": 92},
+    {"id": 112, "category_name": "神秘文化", "channel_id": 2, "pid": 92},
+    {"id": 113, "category_name": "悬疑探险", "channel_id": 2, "pid": 92},
+    {"id": 114, "category_name": "恐怖惊悚", "channel_id": 2, "pid": 92},
+    {"id": 115, "category_name": "鬼夫言情", "channel_id": 2, "pid": 92},
+    {"id": 116, "category_name": "都市职场", "channel_id": 2, "pid": 87},
+    {"id": 117, "category_name": "娱乐明星", "channel_id": 2, "pid": 87},
+    {"id": 118, "category_name": "都市异能", "channel_id": 2, "pid": 87},
+    {"id": 119, "category_name": "游戏", "channel_id": 2, "pid": 87},
+    {"id": 120, "category_name": "宫斗宅斗", "channel_id": 2, "pid": 82},
+    {"id": 121, "category_name": "古典架空", "channel_id": 2, "pid": 82},
+    {"id": 122, "category_name": "清穿民国", "channel_id": 2, "pid": 82},
+    {"id": 123, "category_name": "女尊王朝", "channel_id": 2, "pid": 82},
+    {"id": 124, "category_name": "其他", "channel_id": 1, "pid": 0},
+    {"id": 125, "category_name": "衍生同人", "channel_id": 1, "pid": 124},
+    {"id": 126, "category_name": "轻小说", "channel_id": 1, "pid": 124},
+    {"id": 127, "category_name": "其他作品", "channel_id": 1, "pid": 124},
+
+]
+
+
+def get_category_by_name(category_name):
+    for item in base_category:
+        if item['category_name'] == category_name:
+            return item
+    return None
+
+
+def get_category_by_id(category_id):
+    for item in base_category:
+        if item['id'] == int(category_id):
+            return item
+    return None
+
+def random_str(slen=10):
+    seed = "1234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
+    sa = []
+    for i in range(slen):
+        sa.append(random.choice(seed))
+    return ''.join(sa)
+
+
+def my_log(name, msg, level='info'):
+    logger = logging.getLogger('stats')
+    logger.setLevel("DEBUG")
+    file = 'content_spider/log/common' + time.strftime("%Y-%m-%d", time.localtime()) + '.log'
+    # attach the file handler only once; adding a new handler on every call duplicates log lines
+    if not logger.handlers:
+        logger.addHandler(logging.FileHandler(file))
+    now_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+    s = now_time + ' [{}] '.format(name) + level.upper() + ':' + str(msg)
+    if level == 'error':
+        logger.error(s)
+    else:
+        logger.info(s)
+
+
+def md5(keystr):
+    m = hashlib.md5()
+    m.update(keystr.encode('utf-8'))
+    return m.hexdigest()
+
+
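A quick usage sketch of the helpers above, assuming the content_spider package is importable:

```python
from content_spider.Util import get_category_by_name, get_category_by_id, md5, random_str

cat = get_category_by_name('玄幻仙侠')
if cat is not None:
    print(cat['id'], cat['channel_id'])           # 1 1

print(get_category_by_id('82')['category_name'])  # 古代言情 (accepts str or int ids)
print(md5('hello'))                               # hex digest of the UTF-8 encoded string
print(random_str(16))                             # 16-character random alphanumeric string
```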

+ 0 - 0
content_spider/__init__.py


+ 466 - 0
content_spider/baseSpider.py

@@ -0,0 +1,466 @@
+import scrapy
+import time
+import random
+from content_spider.mysqlHelper import MysqlHelper
+from content_spider.items import BookInfoItem, ChapterItem
+from content_spider.pipelines import formatcontent, removePunctuation
+from content_spider.Util import my_log
+
+
+
+class baseSpider(scrapy.Spider):
+    name = ''
+    source = ''
+    source_name = ''
+    source_id = 0
+    custom_settings = {
+        'DOWNLOAD_DELAY': 0.01,
+        'SOURCE': source,
+    }
+
+    def __init__(self, host, user, password, db, stats, settings):
+        scrapy.Spider.__init__(self)
+        source = self.source
+        source_id = self.source_id
+        self.mysqlHelper = MysqlHelper(host=host, user=user, password=password, db=db, source=source,source_id=source_id)
+        self.__stats = stats
+        self.__stats.set_value('spider_type', 'add')
+        self.__stats.set_value('bid_list', [])
+        self.__is_first = True
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        settings = crawler.settings
+        host = settings.get('MYSQL_HOST')
+        user = settings.get('MYSQL_USER')
+        password = settings.get('MYSQL_PWD')
+        db = settings.get('MYSQL_DB')
+        return cls(host=host, user=user, password=password, db=db, stats=crawler.stats, settings=settings)
+
+    def start_requests(self):
+        yield scrapy.Request(self.get_start_url(), callback=self.parse_book_list)
+
+    def parse_book_list(self, response):
+        result = self.bid_list_result(response)
+        for item in result:
+            bid = item['id']
+            exists = self.mysqlHelper.get_book_info_by_source(bid)
+            if exists is not None:
+                continue
+            url = self.get_book_info_url(bid)
+            yield scrapy.Request(url, callback=self.parse_book_info)
+            # break
+
+    def parse_book_info(self, response):
+        if response.text == '':
+            return None
+        now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+        result = self.book_info_result(response)
+        if result is None:
+            return None
+
+        category_id = 0 if result.get('category_id') is None else result.get('category_id')
+
+        book_info_item = BookInfoItem()
+        source_bid = result.get('bid')
+        book_info_item['cp_bid'] = source_bid
+        book_info_item['cp_name'] = self.source_name
+        book_info_item['cp_id'] = self.source_id
+        book_info_item['name'] = result['name']
+        book_info_item['author'] = result['author']
+        book_info_item['intro'] = result['intro']
+        book_info_item['cover'] = result['cover']
+        book_info_item['keyword'] = result['keyword']
+        book_info_item['category_id'] = category_id
+        book_info_item['status'] = result['status']
+        book_info_item['size'] = 0
+        book_info_item['category_name'] = result['category']
+        book_info_item['last_chapter'] = ''
+        book_info_item['chapter_count'] = 0
+        book_info_item['first_cid'] = 0
+        book_info_item['last_cid'] = 0
+        book_info_item['channel'] = result['channel']
+        book_info_item['updated_at'] = now
+        book_info_item['created_at'] = now
+        bid = self.mysqlHelper.insert_book(book_info_item)
+        if self.__is_first:
+            self.__stats.set_value('bid_start', bid)
+            self.__is_first = False
+        self.__stats.get_value('bid_list').append(bid)
+        url = self.get_chapter_list_url(source_bid)
+        meta = {'bid': bid, 'source_bid': source_bid}
+        yield scrapy.Request(url, callback=self.parse_chapter_list, meta=meta)
+
+    def parse_chapter_list(self, response):
+        if response.text == '':
+            return None
+        result = self.chapter_list_result(response)
+        bid = response.meta['bid']
+        source_bid = response.meta['source_bid']
+        i = 0
+        for chapter_item in result:
+            i = i + 1
+            cid = chapter_item['source_chapter_id']
+            meta = chapter_item
+            meta['bid'] = bid
+            url = self.get_chapter_content_url(source_bid, cid)
+            yield scrapy.Request(url, callback=self.parse_chapter_content, meta=meta)
+
+    def parse_chapter_content(self, response):
+        if response.text == '':
+            return None
+        now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+        result = self.chapter_content_result(response)
+        meta = response.meta
+        chapter_item = ChapterItem()
+        chapter_item['bid'] = meta['bid']
+        chapter_item['name'] = meta['name']
+        chapter_item['sequence'] = meta['sequence']
+        chapter_item['size'] = meta['size']
+        chapter_item['is_vip'] = meta['is_vip']
+        chapter_item['source_chapter_id'] = meta['source_chapter_id']
+        chapter_item['recent_update_at'] = meta['recent_update_at']
+        chapter_item['chapter_content_id'] = 0
+        chapter_item['content'] = formatcontent(result['content'])
+
+        if result.get('size') is not None:
+            chapter_item['size'] = result.get('size')
+        if result.get('is_vip') is not None:
+            chapter_item['is_vip'] = result.get('is_vip')
+        if result.get('name') is not None:
+            chapter_item['name'] = result.get('name')
+        if result.get('recent_update_at') is not None:
+            chapter_item['recent_update_at'] = result.get('recent_update_at')
+        if result.get('source_chapter_id') is not None:
+            chapter_item['source_chapter_id'] = result.get('source_chapter_id')
+        chapter_item['prev_cid'] = 0
+        chapter_item['next_cid'] = 0
+        chapter_item['updated_at'] = now
+        chapter_item['created_at'] = now
+        yield chapter_item
+
+    def get_start_url(self):
+        raise NotImplementedError
+
+    def bid_list_result(self, response):
+        raise NotImplementedError
+
+    def get_book_info_url(self, bid):
+        raise NotImplementedError
+
+    def book_info_result(self, response):
+        raise NotImplementedError
+
+    def get_chapter_list_url(self, bid):
+        raise NotImplementedError
+
+    def chapter_list_result(self, response):
+        raise NotImplementedError
+
+    def get_chapter_content_url(self, bid, cid):
+        raise NotImplementedError
+
+    def chapter_content_result(self, response):
+        raise NotImplementedError
+
+
+class baseUpdateSpider(scrapy.Spider):
+    name = ''
+    source = ''
+    source_id = 0
+    custom_settings = {
+        'DOWNLOAD_DELAY': 0.01,
+        'SOURCE': source,
+    }
+
+    def __init__(self, host, user, password, db, stats, settings):
+        scrapy.Spider.__init__(self)
+        source = self.source
+        self.mysqlHelper = MysqlHelper(host=host, user=user, password=password, db=db, source=source,source_id=self.source_id)
+        self.__stats = stats
+        self.__stats.set_value('spider_type', 'update')
+        self.__stats.set_value('bid_list', [])
+        self.__is_first = True
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        settings = crawler.settings
+        host = settings.get('MYSQL_HOST')
+        user = settings.get('MYSQL_USER')
+        password = settings.get('MYSQL_PWD')
+        db = settings.get('MYSQL_DB')
+        return cls(host=host, user=user, password=password, db=db, stats=crawler.stats, settings=settings)
+
+    def start_requests(self):
+        book_list = self.mysqlHelper.get_need_update_book_list()
+        if book_list is not None:
+            for book in book_list:
+                url = self.get_chapter_list_url(book['cp_bid'])
+                meta = {'bid': book['id'], 'cp_bid': book['cp_bid']}
+                yield scrapy.Request(url, callback=self.parse_chapter_list, meta=meta)
+
+    def parse_chapter_list(self, response):
+        if response.text == '':
+            return None
+        chapter_list = self.chapter_list_result(response)
+        meta = response.meta
+        if chapter_list is not None:
+            bid = response.meta.get('bid')
+            cp_bid = response.meta.get('cp_bid')
+            last_chapter = self.mysqlHelper.get_last_cid_by_bid(bid)
+            start = False
+            if last_chapter is None:
+                start = True
+                last_source_cid = ''
+                last_sequence = 0
+                last_chapter_id = 0
+            else:
+                last_source_cid = str(last_chapter['source_chapter_id'])
+                last_sequence = last_chapter['sequence']
+                last_chapter_id = last_chapter['id']
+
+            has_new_chapter = False
+            for chapter_item in chapter_list:
+                if not start:
+                    if len(last_source_cid) > 0:
+                        if str(chapter_item['source_chapter_id']) == str(last_source_cid):
+                            start = True
+                    else:
+                        if int(chapter_item['sequence']) == last_sequence:
+                            start = True
+                    continue
+                if not has_new_chapter:
+                    self.__stats.get_value('bid_list').append(
+                        {"bid": meta['bid'], 'start': last_chapter_id, 'start_sequence': last_sequence})
+                    has_new_chapter = True
+                cid = chapter_item['source_chapter_id']
+                last_sequence = last_sequence + 1
+                meta = chapter_item
+                meta['bid'] = bid
+                url = self.get_chapter_content_url(cp_bid, cid)
+                yield scrapy.Request(url, callback=self.parse_chapter_content, meta=meta)
+
+    def parse_chapter_content(self, response):
+        if response.text == '':
+            return None
+        now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+        result = self.chapter_content_result(response)
+        meta = response.meta
+        chapter_item = ChapterItem()
+        chapter_item['bid'] = meta['bid']
+        chapter_item['name'] = meta['name']
+        chapter_item['sequence'] = meta['sequence']
+        chapter_item['size'] = meta['size']
+        chapter_item['is_vip'] = meta['is_vip']
+        chapter_item['source_chapter_id'] = meta['source_chapter_id']
+        chapter_item['recent_update_at'] = meta['recent_update_at']
+        chapter_item['chapter_content_id'] = 0
+        chapter_item['content'] = formatcontent(result['content'])
+
+        if result.get('size') is not None:
+            chapter_item['size'] = result.get('size')
+        if result.get('is_vip') is not None:
+            chapter_item['is_vip'] = result.get('is_vip')
+        if result.get('name') is not None:
+            chapter_item['name'] = result.get('name')
+        if result.get('recent_update_at') is not None:
+            chapter_item['recent_update_at'] = result.get('recent_update_at')
+        if result.get('source_chapter_id') is not None:
+            chapter_item['source_chapter_id'] = result.get('source_chapter_id')
+        chapter_item['prev_cid'] = 0
+        chapter_item['next_cid'] = 0
+        chapter_item['updated_at'] = now
+        chapter_item['created_at'] = now
+        yield chapter_item
+
+
+    def get_chapter_list_url(self, bid):
+        raise NotImplementedError
+
+    def chapter_list_result(self, response):
+        raise NotImplementedError
+
+    def get_chapter_content_url(self, bid, cid):
+        raise NotImplementedError
+
+    def chapter_content_result(self, response):
+        raise NotImplementedError
+
+
+class fixChapterSpider(scrapy.Spider):
+    name = ''
+    source = ''
+    source_id = 0
+    custom_settings = {
+        'DOWNLOAD_DELAY': 0.01,
+        'SOURCE': source,
+    }
+
+    def __init__(self, host, user, password, db, bid_list, stats, settings):
+        scrapy.Spider.__init__(self)
+        source = self.source
+        self.mysqlHelper = MysqlHelper(host=host, user=user, password=password, db=db, source=source,source_id=self.source_id)
+        self.__stats = stats
+        self.__stats.set_value('spider_type', 'fix')
+        self.__stats.set_value('bid_list', [])
+        self.__is_first = True
+        self.bid_list = bid_list
+
+    @classmethod
+    def from_crawler(cls, crawler, *args, **kwargs):
+        settings = crawler.settings
+        host = settings.get('MYSQL_HOST')
+        user = settings.get('MYSQL_USER')
+        password = settings.get('MYSQL_PWD')
+        db = settings.get('MYSQL_DB')
+        bid = kwargs.get('bid')
+        if bid is not None:
+            bid_list = bid.split(',')
+        else:
+            bid_list = []
+        return cls(host=host, user=user, password=password, db=db,
+                   bid_list=bid_list, stats=crawler.stats, settings=settings)
+
+    def start_requests(self):
+        if self.bid_list is None:
+            yield
+            return
+        for book in self.bid_list:
+            info = self.mysqlHelper.get_book_info_by_id(book)
+            if info is None:
+                continue
+            self.mysqlHelper.re_sequence(book)
+            url = self.get_chapter_list_url(info['cp_bid'])
+            meta = {'bid': book, 'cp_bid': info['cp_bid']}
+            yield scrapy.Request(url, callback=self.parse_chapter_list, meta=meta)
+
+    def parse_chapter_list(self, response):
+        if response.text == '':
+            return None
+        chapter_list = self.chapter_list_result(response)
+        if chapter_list is not None:
+            bid = response.meta.get('bid')
+            cp_bid = response.meta.get('cp_bid')
+            last_sequence = 0
+            for chapter_item in chapter_list:
+                last_sequence = last_sequence + 1
+                chapter_info = self.mysqlHelper.get_cid_by_bid_sequence(bid, chapter_item['sequence'])
+                cid = chapter_item['source_chapter_id']
+                meta = chapter_item
+                if chapter_info is not None:
+                    meta['type'] = 'update'
+                    meta['chapter_content_id'] = chapter_info['chapter_content_id']
+                    meta['cid'] = chapter_info['id']
+                meta['bid'] = bid
+                url = self.get_chapter_content_url(cp_bid, cid)
+                yield scrapy.Request(url, callback=self.parse_chapter_content, meta=meta)
+            self.__stats.get_value('bid_list').append({'bid':bid,'end':last_sequence})
+
+    def parse_chapter_content(self, response):
+        if response.text == '':
+            return None
+        now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+        result = self.chapter_content_result(response)
+        meta = response.meta
+        data = {}
+        data['bid'] = meta['bid']
+        data['name'] = meta['name']
+        data['size'] = meta['size']
+        data['is_vip'] = meta['is_vip']
+        data['sequence'] = meta['sequence']
+        data['source_chapter_id'] = meta['source_chapter_id']
+        data['recent_update_at'] = meta['recent_update_at']
+        data['content'] = formatcontent(result['content'])
+
+        if result.get('size') is not None:
+            data['size'] = result.get('size')
+        if result.get('is_vip') is not None:
+            data['is_vip'] = result.get('is_vip')
+        if result.get('name') is not None:
+            data['name'] = result.get('name')
+        if result.get('recent_update_at') is not None:
+            data['recent_update_at'] = result.get('recent_update_at')
+        if result.get('source_chapter_id') is not None:
+            data['source_chapter_id'] = result.get('source_chapter_id')
+        if meta.get('type') is not None:
+            content = formatcontent(result['content'])
+            data['content'] = content
+            data['size'] = len(removePunctuation(content))
+            self.mysqlHelper.update_content(meta['chapter_content_id'], meta['name'], result['content'])
+            if meta.get('cid') is not None:
+                data['cid'] = meta['cid']
+            self.mysqlHelper.update_chapter(data)
+        else:
+            data['prev_cid'] = 0
+            data['next_cid'] = 0
+            data['updated_at'] = now
+            data['created_at'] = now
+            chapter_item = ChapterItem(data)
+            yield chapter_item
+
+    def get_chapter_list_url(self, bid):
+        raise NotImplementedError
+
+    def chapter_list_result(self, response):
+        raise NotImplementedError
+
+    def get_chapter_content_url(self, bid, cid):
+        raise NotImplementedError
+
+    def chapter_content_result(self, response):
+        raise NotImplementedError
+
+
+class fixBookInfoSpider(scrapy.Spider):
+    name = ''
+    source = ''
+    source_name = ''
+    source_id = 0
+
+    def __init__(self, host, user, password, db, bid_list, stats):
+        scrapy.Spider.__init__(self)
+        source = self.source
+        self.mysqlHelper = MysqlHelper(host=host, user=user, password=password, db=db, source=source,source_id=self.source_id)
+        self.bid_list = bid_list
+
+    @classmethod
+    def from_crawler(cls, crawler, *args, **kwargs):
+        settings = crawler.settings
+        host = settings.get('MYSQL_HOST')
+        user = settings.get('MYSQL_USER')
+        password = settings.get('MYSQL_PWD')
+        db = settings.get('MYSQL_DB')
+        bid = kwargs.get('bid')
+        if bid is not None:
+            bid_list = bid.split(',')
+        else:
+            bid_list = []
+        return cls(host=host, user=user, password=password, db=db, bid_list=bid_list, stats=crawler.stats)
+
+    def start_requests(self):
+        if self.bid_list is None:
+            yield
+            return
+        for book in self.bid_list:
+            info = self.mysqlHelper.get_book_info_by_id(book)
+            if info is None:
+                continue
+            url = self.get_book_info_url(info['cp_bid'])
+            meta = {'bid': book, 'cp_bid': info['cp_bid']}
+            yield scrapy.Request(url, callback=self.parse_book_info, meta=meta)
+
+    def parse_book_info(self, response):
+        if response.text == '':
+            return None
+        result = self.book_info_result(response)
+        if result is None:
+            yield
+            return
+        result['bid'] = response.meta['bid']
+        self.mysqlHelper.update_book_info(result)
+
+    def get_book_info_url(self, bid):
+        raise NotImplementedError
+
+    def book_info_result(self, response):
+        raise NotImplementedError
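baseSpider drives the whole add-crawl flow (book list → book info → chapter list → chapter content) and stores results through the MySQL helper and pipeline; subclasses only supply source-specific URLs and response parsing. A minimal sketch of a concrete source following the same pattern the kanshu spiders later in this commit use; the spider name, URL, and JSON field names here are hypothetical placeholders:

```python
import json

from content_spider.baseSpider import baseSpider


class ExampleSourceSpider(baseSpider):
    name = 'example_source'            # hypothetical spider name
    source = 'example_source'
    source_name = '示例源'
    source_id = 99                     # hypothetical cp_id
    base_url = 'http://api.example.com/{}'

    def get_start_url(self):
        return self.base_url.format('books')

    def bid_list_result(self, response):
        data = json.loads(response.text)
        return [{'id': item['id']} for item in data.get('data', [])]

    def get_book_info_url(self, bid):
        return self.base_url.format('book/{}'.format(bid))

    def book_info_result(self, response):
        info = json.loads(response.text).get('data')
        if info is None:
            return None
        return {
            'bid': info['id'], 'name': info['title'], 'author': info['author'],
            'intro': info['intro'], 'cover': info['cover'], 'keyword': '',
            'status': info['status'], 'category': info['category'],
            'category_id': 0, 'channel': info['channel'],
        }

    def get_chapter_list_url(self, bid):
        return self.base_url.format('book/{}/chapters'.format(bid))

    def chapter_list_result(self, response):
        chapters = json.loads(response.text).get('data', [])
        return [{
            'source_chapter_id': c['id'], 'name': c['title'], 'sequence': i + 1,
            'is_vip': 0, 'size': 0, 'recent_update_at': c['updated_at'],
        } for i, c in enumerate(chapters)]

    def get_chapter_content_url(self, bid, cid):
        return self.base_url.format('book/{}/chapter/{}'.format(bid, cid))

    def chapter_content_result(self, response):
        data = json.loads(response.text).get('data', {})
        return {'content': data.get('content', '')}
```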

+ 5 - 0
content_spider/bash/command.sh

@@ -0,0 +1,5 @@
+#!/bin/bash
+pwd=/home/www/zw_content_spider
+cd $pwd
+spider_name=$1
+/root/.pyenv/shims/scrapy crawl $spider_name

+ 6 - 0
content_spider/bash/fixCommand.sh

@@ -0,0 +1,6 @@
+#!/bin/bash
+pwd=/home/www/zw_content_spider
+cd $pwd
+spider_name=$1
+bid=$2
+/root/.pyenv/shims/scrapy crawl $spider_name -a bid=$bid

+ 45 - 0
content_spider/items.py

@@ -0,0 +1,45 @@
+# Define here the models for your scraped items
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/items.html
+
+import scrapy
+
+
+class BookInfoItem(scrapy.Item):
+    cp_name = scrapy.Field()
+    cp_bid = scrapy.Field()
+    cp_id = scrapy.Field()
+    name = scrapy.Field()
+    author = scrapy.Field()
+    intro = scrapy.Field()
+    cover = scrapy.Field()
+    keyword = scrapy.Field()
+    category_id = scrapy.Field()
+    category_name = scrapy.Field()
+    status = scrapy.Field()
+    chapter_count = scrapy.Field()
+    first_cid = scrapy.Field()
+    last_cid = scrapy.Field()
+    size = scrapy.Field()
+    channel = scrapy.Field()
+    last_chapter = scrapy.Field()
+    updated_at = scrapy.Field()
+    created_at = scrapy.Field()
+
+
+class ChapterItem(scrapy.Item):
+    bid = scrapy.Field()
+    name = scrapy.Field()
+    sequence = scrapy.Field()
+    size = scrapy.Field()
+    is_vip = scrapy.Field()
+    prev_cid = scrapy.Field()
+    next_cid = scrapy.Field()
+    recent_update_at = scrapy.Field()
+    content = scrapy.Field()
+    chapter_content_id = scrapy.Field()
+    source_chapter_id = scrapy.Field()
+    created_at = scrapy.Field()
+    updated_at = scrapy.Field()
+
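For reference, a small sketch of the fields a ChapterItem is expected to carry by the time mysqlHelper.insert_chapter() stores it; the values are illustrative only:

```python
from content_spider.items import ChapterItem

item = ChapterItem(
    bid=1, name='第一章', sequence=1, size=0, is_vip=0,
    prev_cid=0, next_cid=0, recent_update_at='2021-01-01 00:00:00',
    content='章节正文', chapter_content_id=0, source_chapter_id='1001',
    created_at='2021-01-01 00:00:00', updated_at='2021-01-01 00:00:00',
)
print(dict(item))   # scrapy Items behave like dicts restricted to their declared fields
```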

+ 313 - 0
content_spider/middlewares.py

@@ -0,0 +1,313 @@
+# Define here the models for your spider middleware
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+import re
+import hashlib
+from urllib.parse import urlencode
+import urllib
+import time
+from scrapy import signals
+
+
+# useful for handling different item types with a single interface
+# from itemadapter import is_item, ItemAdapter
+from content_spider.Util import md5
+
+
+class ContentSpiderSpiderMiddleware:
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the spider middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_spider_input(self, response, spider):
+        # Called for each response that goes through the spider
+        # middleware and into the spider.
+
+        # Should return None or raise an exception.
+        return None
+
+    def process_spider_output(self, response, result, spider):
+        # Called with the results returned from the Spider, after
+        # it has processed the response.
+
+        # Must return an iterable of Request, or item objects.
+        for i in result:
+            yield i
+
+    def process_spider_exception(self, response, exception, spider):
+        # Called when a spider or process_spider_input() method
+        # (from other spider middleware) raises an exception.
+
+        # Should return either None or an iterable of Request or item objects.
+        pass
+
+    def process_start_requests(self, start_requests, spider):
+        # Called with the start requests of the spider, and works
+        # similarly to the process_spider_output() method, except
+        # that it doesn’t have a response associated.
+
+        # Must return only requests (not items).
+        for r in start_requests:
+            yield r
+
+    def spider_opened(self, spider):
+        spider.logger.info('Spider opened: %s' % spider.name)
+
+
+class ContentSpiderDownloaderMiddleware:
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the downloader middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_request(self, request, spider):
+        # Called for each request that goes through the downloader
+        # middleware.
+
+        # Must either:
+        # - return None: continue processing this request
+        # - or return a Response object
+        # - or return a Request object
+        # - or raise IgnoreRequest: process_exception() methods of
+        #   installed downloader middleware will be called
+        return None
+
+    def process_response(self, request, response, spider):
+        # Called with the response returned from the downloader.
+
+        # Must either;
+        # - return a Response object
+        # - return a Request object
+        # - or raise IgnoreRequest
+        return response
+
+    def process_exception(self, request, exception, spider):
+        # Called when a download handler or a process_request()
+        # (from other downloader middleware) raises an exception.
+
+        # Must either:
+        # - return None: continue processing this exception
+        # - return a Response object: stops process_exception() chain
+        # - return a Request object: stops process_exception() chain
+        pass
+
+    def spider_opened(self, spider):
+        spider.logger.info('Spider opened: %s' % spider.name)
+
+
+class LianshangSpiderDownloaderMiddleware:
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the downloader middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_request(self, request, spider):
+        url = request._url
+        ts = int(time.time())
+        request._url = re.sub(r'oauth_timestamp=\d+', 'oauth_timestamp={}'.format(ts), url)
+        return None
+
+    def process_response(self, request, response, spider):
+        # Called with the response returned from the downloader.
+        # Must either;
+        # - return a Response object
+        # - return a Request object
+        # - or raise IgnoreRequest
+        return response
+
+    def process_exception(self, request, exception, spider):
+        # Called when a download handler or a process_request()
+        # (from other downloader middleware) raises an exception.
+
+        # Must either:
+        # - return None: continue processing this exception
+        # - return a Response object: stops process_exception() chain
+        # - return a Request object: stops process_exception() chain
+        pass
+
+    def spider_opened(self, spider):
+        spider.logger.info('Spider opened: %s' % spider.name)
+
+
+class WangyiSpiderDownloaderMiddleware:
+    consumerKey = "58434765"
+    secretKey = "AECnczs1GpBGDSXz"
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_request(self, request, spider):
+        url = request._url
+        request._url = self.re_sign(url)
+        return None
+
+    def process_response(self, request, response, spider):
+        return response
+
+    def process_exception(self, request, exception, spider):
+        pass
+
+    def spider_opened(self, spider):
+        spider.logger.info('Spider opened: %s' % spider.name)
+
+    def get_sign(self, primary_url, param):
+        url = 'GET' + primary_url
+        param = sorted(param.items(), key=lambda x: x[0])
+        string = ''
+        m = hashlib.md5()
+        for item in param:
+            string = string + '{}={}'.format(str(item[0]), str(item[1]))
+        string = url + string + self.secretKey
+        string = urllib.parse.quote(string, '')
+        m.update(string.encode('utf-8'))
+        sign = m.hexdigest()
+        return sign
+
+    def re_sign(self, url):
+        res = urllib.parse.urlsplit(url)
+        simple_url = '{}://{}{}'.format(res[0], res[1], res[2])
+        query_param = urllib.parse.parse_qs(res[3])
+        timestamp = int(time.time() * 1000)
+        param = {}
+        for item in query_param:
+            if item == 'timestamp':
+                param['timestamp'] = timestamp
+            elif item == 'expires':
+                param['expires'] = timestamp + 10 * 60 * 1000
+            elif item == 'sign':
+                continue
+            else:
+                param[item] = query_param[item][0]
+        param['sign'] = self.get_sign(simple_url, param)
+        return simple_url + '?' + urlencode(param)
+
+
+class BimoSpiderDownloaderMiddleware:
+    api_key = 'ZmfJQZaF8FuQSuUx'
+    api_secret = 'JgBETPKMPgsRBqlNtKajwQf4zFQuMwYr'
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_request(self, request, spider):
+        url = request._url
+        request._url = self.re_sign(url)
+        return None
+
+    def process_response(self, request, response, spider):
+        return response
+
+    def process_exception(self, request, exception, spider):
+        pass
+
+    def spider_opened(self, spider):
+        spider.logger.info('Spider opened: %s' % spider.name)
+
+    def re_sign(self, url):
+        res = urllib.parse.urlsplit(url)
+        simple_url = '{}://{}{}'.format(res[0], res[1], res[2])
+        query_param = urllib.parse.parse_qs(res[3])
+        param = dict(time=int(time.time()))
+        for item in query_param:
+            if item == 'sign' or item == 'signType' or item == 'time':
+                continue
+            else:
+                param[item] = query_param[item][0]
+        param['sign'] = self.sign(param)
+        param['signType'] = 'MD5'
+        return simple_url + '?' + urlencode(param)
+
+    def sign(self, param):
+        param = sorted(param.items(), key=lambda x: x[0])
+        string = ''
+        for item in param:
+            string = string + str(item[0]) + '=' + str(item[1]) + '&'
+        string = string + 'apiSecret=' + self.api_secret
+        return md5(string)
+
+
+class MotieSpiderDownloaderMiddleware:
+    customerId = '922'
+    customerSecret = 'kQSdaERniXSxzr20IJgtkvru1nSLFLjR'
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_request(self, request, spider):
+        url = request._url
+        new_url = self.re_sign(url)
+        if new_url != '' and new_url is not None:
+            request._url = self.re_sign(url)
+        return None
+
+    def process_response(self, request, response, spider):
+        return response
+
+    def process_exception(self, request, exception, spider):
+        pass
+
+    def spider_opened(self, spider):
+        spider.logger.info('Spider opened: %s' % spider.name)
+
+    def re_sign(self, url):
+        res = urllib.parse.urlsplit(url)
+        simple_url = '{}://{}{}'.format(res[0], res[1], res[2])
+        query_param = urllib.parse.parse_qs(res[3])
+        timestamp = int(time.time()) * 1000
+        param = {}
+        sign_string = ''
+        if res[2] == '/api/motie/get/book':
+            return simple_url + '?customerId={}'.format(query_param['customerId'][0])
+        for item in query_param:
+            if item == 'timestamp' or item == 'sign':
+                continue
+            param[item] = query_param[item][0]
+        param['timestamp'] = timestamp
+        if res[2] == '/api/motie/get/bookinfo' or res[2] == '/api/motie/get/chapterlist':
+            sign_string = '{}#{}#{}#{}'.format(
+                query_param['bookId'][0],
+                query_param['customerId'][0],
+                timestamp, self.customerSecret)
+
+        if res[2] == '/api/motie/get/chapter':
+            sign_string = '{}#{}#{}#{}#{}'.format(
+                query_param['bookId'][0],
+                query_param['chapterId'][0],
+                query_param['customerId'][0],
+                timestamp, self.customerSecret)
+        if sign_string == '':
+            return None
+        param['sign'] = md5(sign_string)
+        return simple_url + '?' + urlencode(param)
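None of the signing middlewares above is active by default, because DOWNLOADER_MIDDLEWARES is commented out in settings.py. A wiring sketch, assuming it is added either to settings.py or to a spider's custom_settings; the priority 543 mirrors the commented-out example already present in settings.py:

```python
DOWNLOADER_MIDDLEWARES = {
    'content_spider.middlewares.WangyiSpiderDownloaderMiddleware': 543,
}
```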

+ 201 - 0
content_spider/mysqlHelper.py

@@ -0,0 +1,201 @@
+# -*- coding: utf-8 -*-
+import time
+
+import pymysql.cursors
+
+
+class MysqlHelper(object):
+    def __init__(self, host, user, password, db, source,source_id):
+        self.__conn = pymysql.connect(host=host, user=user, password=password, db=db, charset='utf8mb4',
+                                      cursorclass=pymysql.cursors.DictCursor)
+        self.source = source
+        self.source_id = source_id
+
+    def get_book_info_by_source(self, source_bid):
+        sql = 'select id from zy_books where cp_bid="%s" and  cp_id = %s'
+        with self.__conn.cursor() as cursor:
+            cursor.execute(sql, (int(source_bid), self.source_id))
+            result = cursor.fetchone()
+        self.__conn.commit()
+        return result
+
+    def get_book_info_by_id(self,bid):
+        sql = 'select id,cp_bid from zy_books where id= %s and  cp_id = %s'
+        with self.__conn.cursor() as cursor:
+            cursor.execute(sql, (int(bid), self.source_id))
+            result = cursor.fetchone()
+        self.__conn.commit()
+        return result
+
+    def get_need_update_book_list(self):
+        sql = 'select id,cp_bid from zy_books where cp_id=%s and `status` = 0'
+        with self.__conn.cursor() as cursor:
+            cursor.execute(sql, (self.source_id,))
+            result = cursor.fetchall()
+        self.__conn.commit()
+        return result
+
+
+    def get_last_cid_by_bid(self, bid):
+        sql_format = "select id,bid,`name`,sequence,source_chapter_id from zy_book_chapters where bid = {} order by sequence desc limit 1"
+
+        sql = sql_format.format(bid)
+        with self.__conn.cursor() as cursor:
+            cursor.execute(sql)
+            result = cursor.fetchone()
+        self.__conn.commit()
+        return result
+
+
+    def get_cid_by_bid_sequence(self, bid, sequence):
+        sql = "select id,chapter_content_id  from zy_book_chapters where  bid = %s and  sequence=%s"
+        with self.__conn.cursor() as cursor:
+            cursor.execute(sql, (int(bid), int(sequence)))
+            result = cursor.fetchone()
+        self.__conn.commit()
+        return result
+
+
+    def insert_book(self, item):
+        now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+        sql = '''
+        insert into zy_books(cp_id,cp_name,cp_bid,`name`,author, intro, cover ,keyword , category_id,status
+        ,`size`,category_name,updated_at,created_at)
+        values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
+        '''
+        with self.__conn.cursor() as cursor:
+            cursor.execute(sql, (self.source_id,
+                                 self.source,
+                                 item.get('cp_bid'),
+                                 item.get('name'),
+                                 item.get('author'),
+                                 item.get('intro'),
+                                 item.get('cover'),
+                                 item.get('keyword'),
+                                 item.get('category_id'),
+                                 item.get('status'),
+                                 item.get('size'),
+                                 item.get('category_name'),
+                                 now, now
+                                 ))
+            bid = int(cursor.lastrowid)
+        self.__conn.commit()
+        return bid
+
+    def insert_chapter(self, item):
+        chapter_content_id = self.insert_content(item)
+        sql = "INSERT INTO `zy_book_chapters` (`bid`, `name`,`sequence`,`size`,`is_vip`,`prev_cid`,`next_cid`," \
+              "`recent_update_at`,`created_at`,`updated_at`,`chapter_content_id`,source_chapter_id) " \
+              "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
+        with self.__conn.cursor() as cursor:
+            cursor.execute(sql, (
+                item['bid'], item['name'], item['sequence'], item['size'], item['is_vip'], item['prev_cid'],
+                item['next_cid'], item['recent_update_at'], item['created_at'], item['updated_at'],
+                chapter_content_id,
+                item['source_chapter_id']))
+            cid = int(cursor.lastrowid)
+        self.__conn.commit()
+        return cid
+
+    def insert_content(self, item):
+        sql = "insert into zy_book_chapter_contents (bid,chapter_name,content,created_at,updated_at) values (%s,%s,%s,%s,%s)"
+        with self.__conn.cursor() as cursor:
+            cursor.execute(sql, (
+                item['bid'],item['name'], item['content'], item['created_at'], item['updated_at']))
+            content_id = int(cursor.lastrowid)
+        self.__conn.commit()
+        return content_id
+
+
+
+    def update_content(self, content_id, chapter_name, content):
+        now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+        sql = 'update zy_book_chapter_contents set chapter_name=%s,content=%s,updated_at=%s  where id=%s'
+        with self.__conn.cursor() as cursor:
+            cursor.execute(sql, (
+                chapter_name, content, now, int(content_id)))
+        self.__conn.commit()
+
+    def update_chapter(self, item):
+        now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+        sql = 'update zy_book_chapters set `name`=%s,`sequence`=%s,`size`=%s,`is_vip`=%s,' \
+              'updated_at=%s,`source_chapter_id`=%s where id = %s'
+        with self.__conn.cursor() as cursor:
+            cid = int(item['cid'])
+            cursor.execute(sql, (
+                item['name'], item['sequence'], item['size'], item['is_vip'],  now,
+                item['source_chapter_id'],  cid))
+        self.__conn.commit()
+
+    def update_book_info(self, book_info):
+        sql = 'update zy_books set `name`=%s,author=%s,intro=%s,cover=%s,gender=%s,category_name=%s where id =%s'
+        with self.__conn.cursor() as cursor:
+            cursor.execute(sql, (
+                book_info['name'], book_info['author'], book_info['intro'], book_info['cover'],
+                book_info['gender'], book_info['category'], int(book_info['bid'])))
+        self.__conn.commit()
+
+
+    def re_sequence(self, bid):
+        sql = '''
+        update zy_book_chapters a join ( 
+SELECT id,(@a:=@a+1) as sequence FROM zy_book_chapters,(SELECT @a:=0) as a WHERE bid = {}  ORDER BY sequence
+ ) b on a.id  = b.id set a.sequence = b.sequence where a.bid = {}
+        '''.format(bid, bid)
+        with self.__conn.cursor() as cursor:
+            cursor.execute(sql)
+        self.__conn.commit()
+
+    
+    def after_spider(self,bid,start=1):
+        chapter_list = self.get_simple_chapter_list(bid,start)
+        if chapter_list is None:
+            return None
+        point = 0
+        for chapter_item in chapter_list:
+            if point == 0:
+                point = chapter_item['id']
+                continue
+            sql1 = 'update zy_book_chapters set next_cid={} where id={}'.format(chapter_item['id'],point)
+            sql2 = 'update zy_book_chapters set prev_cid={},next_cid=0 where id={}'.format(point,chapter_item['id'])
+            self.simple_update(sql1)
+            self.simple_update(sql2)
+            point = chapter_item['id']
+
+        
+        book_info = self.get_base_info(bid)
+        book_sql = '''
+            update zy_books set size={},chapter_count={},first_cid={},last_cid={},last_chapter='{}' where id = {}
+        '''.format(book_info['size'],book_info['chapter_count'],book_info['first_cid'],book_info['last_cid'],book_info['last_chapter'],bid)
+
+        self.simple_update(book_sql)
+
+    def after_fix_delete_unnecessary(self,bid,start):
+        sql = 'update zy_book_chapters set bid=-bid where bid={} and sequence > {}'.format(bid,start)
+        self.simple_update(sql)
+
+
+    def get_base_info(self,bid):
+        sql = '''
+        SELECT id as last_cid,`name` as  last_chapter,( SELECT id FROM zy_book_chapters WHERE bid = {} ORDER BY sequence limit 1 ) as first_cid ,
+( SELECT count(*) FROM zy_book_chapters WHERE bid = {} ) as chapter_count,
+( SELECT sum(size) FROM zy_book_chapters WHERE bid = {} ) as size
+FROM zy_book_chapters where   bid = {} ORDER BY sequence desc  LIMIT 1
+        '''.format(bid,bid,bid,bid) 
+        with self.__conn.cursor() as cursor:
+            cursor.execute(sql)
+            result = cursor.fetchone()
+        self.__conn.commit()
+        return result
+    
+    def get_simple_chapter_list(self,bid,start):
+        sql = 'select id from zy_book_chapters where bid = {} and sequence >= {} order by sequence'.format(bid, start)
+        with self.__conn.cursor() as cursor:
+            cursor.execute(sql)
+            result = cursor.fetchall()
+        self.__conn.commit()
+        return result
+
+    def simple_update(self,sql):
+        with self.__conn.cursor() as cursor:
+            cursor.execute(sql)
+        self.__conn.commit()
+
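A minimal usage sketch of MysqlHelper; in the spiders the connection parameters come from settings.py through from_crawler(), and the host/user/password values below are placeholders:

```python
from content_spider.mysqlHelper import MysqlHelper

helper = MysqlHelper(host='127.0.0.1', user='user', password='secret',
                     db='zwcontent', source='kanshu', source_id=19)

existing = helper.get_book_info_by_source('12345')       # look up by the source's own book id
if existing is not None:
    info = helper.get_book_info_by_id(existing['id'])    # look up by the local zy_books id
    # rebuild prev/next chapter links and the book summary after a crawl
    helper.after_spider(existing['id'], start=1)
```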

+ 90 - 0
content_spider/pipelines.py

@@ -0,0 +1,90 @@
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+
+
+# useful for handling different item types with a single interface
+# from itemadapter import ItemAdapter
+
+import os
+import re
+import redis
+from content_spider.mysqlHelper import MysqlHelper
+from content_spider.Util import my_log
+
+redis_crawl_flag_key = 'book:crawl:stats'
+redis_update_flag_key = 'book:update:stats'
+redis_fix_flag_key = 'book:fix:stats'
+
+
+def formatcontent(content):
+    content = content.replace(' ', '')
+    content = content.replace('<p>', '')
+    content = content.replace('</p>', "\r\n")
+    content = content.splitlines()
+    content = map(lambda s: s.strip(), content)
+    content = filter(lambda s: s != '', content)
+    content = '\r\n'.join(content)
+    return content.strip()
+
+
+def removePunctuation(text):
+    punctuation = '!,;:?"\'、,;!”“。?,'
+    text = re.sub(r'[{}]+'.format(punctuation), ' ', text)
+    return text.strip().replace('\r\n', '').replace('\n', '').replace('\r', '')
+
+
+class ContentSpiderPipeline:
+    def process_item(self, item, spider):
+        return item
+
+
+class ChapterItemPipeline:
+
+    def __init__(self, host, user, password, db, source, stats, settings):
+        self.mysqlHelper = MysqlHelper(host=host, user=user, password=password, db=db, source=source, source_id=0)
+        # close_spider() records crawl-state flags in Redis; no REDIS_* settings exist in
+        # this commit, so this assumes a local Redis instance on the default port
+        self.__redis_conn = redis.Redis()
+        self.__stats = stats
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        stats = crawler.stats
+        return cls(
+            host=crawler.settings.get('MYSQL_HOST'),
+            user=crawler.settings.get('MYSQL_USER'),
+            password=crawler.settings.get('MYSQL_PWD'),
+            db=crawler.settings.get('MYSQL_DB'),
+            source=crawler.settings.get('SOURCE'),
+            stats=stats,
+            settings=crawler.settings
+        )
+
+    def process_item(self, item, spider):
+        if item.get('content') != "":
+            content = formatcontent(item['content'])
+            item['content'] = content
+            item['size'] = len(removePunctuation(content))
+            self.mysqlHelper.insert_chapter(item)
+        return item
+
+    def close_spider(self, spider):
+        spider_type = self.__stats.get_value('spider_type')
+        if spider_type == 'update':
+            self.__redis_conn.hset(redis_update_flag_key, spider.name, 0)
+            my_log(spider.name, 'update end ....')
+            book_list = self.__stats.get_value('bid_list')
+            if book_list is not None:
+                for book in book_list:
+                    spider.mysqlHelper.after_spider(book['bid'],book['start_sequence'])
+        if spider_type == 'add':
+            self.__redis_conn.hset(redis_crawl_flag_key, spider.name, 0)
+            my_log(spider.name, 'crawl end ....')
+            bid_list = self.__stats.get_value('bid_list')
+            if bid_list is not None:
+                for bid in bid_list:
+                    spider.mysqlHelper.after_spider(bid,1)
+        if spider_type == 'fix':
+            bid_list = self.__stats.get_value('bid_list')
+            if bid_list is not None:
+                for book in bid_list:
+                    spider.mysqlHelper.after_fix_delete_unnecessary(book['bid'],book['end'])
+                    spider.mysqlHelper.after_spider(book['bid'],1)
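A small sketch of what formatcontent and removePunctuation do to raw chapter markup, mirroring how ChapterItemPipeline.process_item derives the stored content and size:

```python
from content_spider.pipelines import formatcontent, removePunctuation

raw = '<p>第一章 示例</p>\n<p> 正文内容,带标点。 </p>'
clean = formatcontent(raw)
# spaces and <p> tags are stripped, </p> becomes a line break, blank lines are dropped
print(clean)
# punctuation is replaced with spaces and line breaks removed before measuring length,
# which is how the pipeline fills item['size']
print(len(removePunctuation(clean)))
```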

BIN
content_spider/profile/profile


+ 101 - 0
content_spider/settings.py

@@ -0,0 +1,101 @@
+# Scrapy settings for the content_spider project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used. You can find more settings consulting the documentation:
+#
+#     https://docs.scrapy.org/en/latest/topics/settings.html
+#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+
+BOT_NAME = 'content_spider'
+
+SPIDER_MODULES = ['content_spider.spiders']
+NEWSPIDER_MODULE = 'content_spider.spiders'
+
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+#USER_AGENT = 'content_spider (+http://www.yourdomain.com)'
+
+# Obey robots.txt rules
+ROBOTSTXT_OBEY = False
+
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+# CONCURRENT_REQUESTS = 32
+
+# Configure a delay for requests for the same website (default: 0)
+# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+# DOWNLOAD_DELAY = 3
+# The download delay setting will honor only one of:
+# CONCURRENT_REQUESTS_PER_DOMAIN = 16
+# CONCURRENT_REQUESTS_PER_IP = 16
+
+# Disable cookies (enabled by default)
+COOKIES_ENABLED = False
+
+# Disable Telnet Console (enabled by default)
+# TELNETCONSOLE_ENABLED = False
+
+# Override the default request headers:
+DEFAULT_REQUEST_HEADERS = {
+   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+   'Accept-Language': 'zh-CN,zh;q=0.8',
+   'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36'
+}
+
+# Enable or disable spider middlewares
+# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+# SPIDER_MIDDLEWARES = {
+#    'content_spider.middlewares.ContentSpiderSpiderMiddleware': 543,
+# }
+
+# Enable or disable downloader middlewares
+# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+# DOWNLOADER_MIDDLEWARES = {
+#    'content_spider.middlewares.ContentSpiderDownloaderMiddleware': 543,
+# }
+
+# Enable or disable extensions
+# See https://docs.scrapy.org/en/latest/topics/extensions.html
+# EXTENSIONS = {
+#    'scrapy.extensions.telnet.TelnetConsole': None,
+# }
+
+# Configure item pipelines
+# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+ITEM_PIPELINES = {
+    'content_spider.pipelines.ChapterItemPipeline': 300,
+}
+
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
+# AUTOTHROTTLE_ENABLED = True
+# The initial download delay
+# AUTOTHROTTLE_START_DELAY = 5
+# The maximum download delay to be set in case of high latencies
+# AUTOTHROTTLE_MAX_DELAY = 60
+# The average number of requests Scrapy should be sending in parallel to
+# each remote server
+# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# Enable showing throttling stats for every response received:
+# AUTOTHROTTLE_DEBUG = False
+
+# Enable and configure HTTP caching (disabled by default)
+# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+# HTTPCACHE_ENABLED = True
+# HTTPCACHE_EXPIRATION_SECS = 0
+# HTTPCACHE_DIR = 'httpcache'
+# HTTPCACHE_IGNORE_HTTP_CODES = []
+# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
+
+MYSQL_HOST = 'rm-bp121c5u43zdqh4fs.mysql.rds.aliyuncs.com'
+MYSQL_USER = 'zwcontent'
+MYSQL_PWD = 'zwcontent_2021!ZW2021'
+MYSQL_DB = 'zwcontent'
+
+
+
+
+# MYSQL_HOST = 'rm-bp1z1dto3n2rdb02f.mysql.rds.aliyuncs.com'
+# MYSQL_USER = 'yueduyun'
+# MYSQL_PWD = "yueduyun2017#Ydy"
+# MYSQL_DB = 'book_club'

+ 4 - 0
content_spider/spiders/__init__.py

@@ -0,0 +1,4 @@
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.

+ 0 - 0
content_spider/spiders/kanshu/__init__.py


+ 81 - 0
content_spider/spiders/kanshu/book.py

@@ -0,0 +1,81 @@
+# -*- coding: utf-8 -*-
+
+from content_spider.baseSpider import baseSpider
+import json
+import time
+
+
+class BookSpider(baseSpider):
+    name = 'kanshu'
+    allowed_domains = ['hezuo.lunjian.com']
+    source = 'kanshu'
+    source_name = '看书'
+    source_id = 19
+    base_url = 'http://hezuo.lunjian.com/open/ksbook/{}?channel_id=10054'
+
+    custom_settings = {
+        'DOWNLOAD_DELAY': 0.01,
+        'SOURCE': source,
+        'LOG_FILE': 'content_spider/log/' + name + time.strftime("%Y-%m-%d", time.localtime()) + '.log'
+    }
+
+    def get_start_url(self):
+        return self.base_url.format('bookLists')
+
+    def bid_list_result(self, response):
+        result = json.loads(response.text)
+        if result is None:
+            return []
+        result_list = []
+        for item in result['data']:
+            result_list.append({'id': item['id']})
+        return result_list
+
+    def get_book_info_url(self, bid):
+        return self.base_url.format('BookDetail') + '&bookid={}'.format(bid)
+
+    def book_info_result(self, response):
+        result = json.loads(response.text)
+        if result is None or result.get('data') is None:
+            return None
+        result = result['data']
+
+        return {
+            'bid': result['id'], 'name': result['bookTitle'], 'author': result['author'],
+            'intro': result['introduction'], 'cover': result['cover'], 'keyword': result['labels'],
+            'status': result['state'], 'category': result['category'],'category_id':1,
+            'channel': result['channelId']
+        }
+
+    def get_chapter_list_url(self, bid):
+        return self.base_url.format('ChapterLists') + '&bookid={}'.format(bid)
+
+    def chapter_list_result(self, response):
+        result = json.loads(response.text)
+        if result is None or result.get('data') is None:
+            return []
+
+        result_list = []
+        i = 0
+        for chapter_item in result['data']:
+            i = i+1
+            result_list.append({
+                'source_chapter_id': chapter_item['id'], 'name': chapter_item['title'],
+                'sequence': i, 'is_vip': 1 if chapter_item['isVip'] else 0,
+                'size': 0, 'recent_update_at': chapter_item['lastUpdateTime']
+            })
+        return result_list
+
+    def get_chapter_content_url(self, bid, cid):
+        return self.base_url.format('ChapterContent') + '&bookid={}&chapterid={}'.format(bid, cid)
+
+    def chapter_content_result(self, response):
+        result = json.loads(response.text)
+        if result is None:
+            return {'content': ''}
+
+        return {
+            'content': result['data']['content'],
+            'size': len(result['data']['content'])
+        }
+

+ 52 - 0
content_spider/spiders/kanshu/bookFix.py

@@ -0,0 +1,52 @@
+# -*- coding: utf-8 -*-
+
+from content_spider.baseSpider import fixChapterSpider
+import json
+import time
+
+
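+# Chapter fix-up spider for the 看书 (kanshu) source: re-fetches chapter lists
+# and chapter content for books that already exist, using the same endpoints as
+# the main spider (which books/chapters get repaired is decided by the
+# fixChapterSpider base class, not shown here).
+# Typical invocation: scrapy crawl kanshufix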
+class bookFixSpider(fixChapterSpider):
+    name = 'kanshufix'
+    allowed_domains = ['hezuo.lunjian.com']
+    source = 'kanshu'
+    source_name = '看书'
+    base_url = 'http://hezuo.lunjian.com/open/ksbook/{}?channel_id=10054'
+
+    custom_settings = {
+        'DOWNLOAD_DELAY': 0.01,
+        'SOURCE': source,
+        'LOG_FILE': 'content_spider/log/' + name + time.strftime("%Y-%m-%d", time.localtime()) + '.log'
+    }
+
+    def get_chapter_list_url(self, bid):
+        return self.base_url.format('ChapterLists') + '&bookid={}'.format(bid)
+
+    def chapter_list_result(self, response):
+        result = json.loads(response.text)
+        if result is None or result.get('data') is None:
+            return []
+
+        result_list = []
+        for i, chapter_item in enumerate(result['data'], start=1):
+            result_list.append({
+                'source_chapter_id': chapter_item['id'], 'name': chapter_item['title'],
+                'sequence': i, 'is_vip': 1 if chapter_item['isVip'] else 0,
+                'size': 0, 'recent_update_at': chapter_item['lastUpdateTime']
+            })
+        return result_list
+
+    def get_chapter_content_url(self, bid, cid):
+        return self.base_url.format('ChapterContent') + '&bookid={}&chapterid={}'.format(bid, cid)
+
+    def chapter_content_result(self, response):
+        result = json.loads(response.text)
+        if result is None or result.get('data') is None:
+            return {'content': ''}
+
+        return {
+            'content': result['data']['content'],
+            'size': len(result['data']['content'])
+        }
+

+ 53 - 0
content_spider/spiders/kanshu/bookupdate.py

@@ -0,0 +1,53 @@
+# -*- coding: utf-8 -*-
+
+import random
+from content_spider.baseSpider import baseUpdateSpider
+import json
+import time
+
+
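+# Incremental-update spider for the 看书 (kanshu) source: re-reads chapter lists
+# (with a random query parameter, presumably to bypass upstream caching) and
+# fetches chapter content for new chapters via the baseUpdateSpider flow.
+# Typical invocation: scrapy crawl kanshuupdate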
+class BookupdateSpider(baseUpdateSpider):
+    name = 'kanshuupdate'
+    allowed_domains = ['hezuo.lunjian.com']
+    source = 'kanshu'
+    source_name = '看书'
+    source_id = 19
+    base_url = 'http://hezuo.lunjian.com/open/ksbook/{}?channel_id=10054'
+
+    custom_settings = {
+        'DOWNLOAD_DELAY': 0.01,
+        'SOURCE': source,
+        'LOG_FILE': 'content_spider/log/' + name + time.strftime("%Y-%m-%d", time.localtime()) + '.log'
+    }
+
+    def get_chapter_list_url(self, bid):
+        return self.base_url.format('ChapterLists') + '&bookid={}&random={}'.format(bid, random.randint(1, 10000))
+
+    def chapter_list_result(self, response):
+        result = json.loads(response.text)
+        if result is None or result.get('data') is None:
+            return []
+
+        result_list = []
+        for i, chapter_item in enumerate(result['data'], start=1):
+            result_list.append({
+                'source_chapter_id': chapter_item['id'], 'name': chapter_item['title'],
+                'sequence': i, 'is_vip': 1 if chapter_item['isVip'] else 0,
+                'size': 0, 'recent_update_at': chapter_item['lastUpdateTime']
+            })
+        return result_list
+
+    def get_chapter_content_url(self, bid, cid):
+        return self.base_url.format('ChapterContent') + '&bookid={}&chapterid={}'.format(bid, cid)
+
+    def chapter_content_result(self, response):
+        result = json.loads(response.text)
+        if result is None or result.get('data') is None:
+            return {'content': ''}
+
+        return {
+            'content': result['data']['content'],
+            'size': len(result['data']['content'])
+        }

BIN
content_spider/spiders/kanshu/【看书网输出】看书网书籍分类.xls


BIN
content_spider/spiders/kanshu/【看书网输出】看书网小说接口文档【最新版】.docx


+ 241 - 0
content_spider/temp_test.py

@@ -0,0 +1,241 @@
+# -*- coding: utf-8 -*-
+# One-off helper script: flattens the source site's category tree (cate, below)
+# into a flat list of per-channel category mapping dicts and prints it.
+
+cate = [
+		{
+			"id": 1,
+			"name": "都市生活",
+			"channel": "男频",
+			"childs": [
+				{
+					"id": 1,
+					"name": "爱情婚姻"
+				},
+				{
+					"id": 2,
+					"name": "商战风云"
+				},
+				{
+					"id": 3,
+					"name": "职场励志"
+				},
+				{
+					"id": 4,
+					"name": "官场沉浮"
+				},
+				{
+					"id": 5,
+					"name": "现实百态"
+				},
+				{
+					"id": 6,
+					"name": "八卦杂谈"
+				},
+				{
+					"id": 32,
+					"name": "都市异能"
+				},
+				{
+					"id": 40,
+					"name": "现代修真"
+				}
+			]
+		},
+		{
+			"id": 2,
+			"name": "女性言情",
+			"channel": "女频",
+			"childs": [
+				{
+					"id": 9,
+					"name": "总裁豪门"
+				},
+				{
+					"id": 8,
+					"name": "穿越时空"
+				},
+				{
+					"id": 11,
+					"name": "青春纯爱"
+				},
+				{
+					"id": 12,
+					"name": "架空历史"
+				},
+				{
+					"id": 7,
+					"name": "悬疑推理"
+				},
+				{
+					"id": 10,
+					"name": "综合其他"
+				}
+			]
+		},
+		{
+			"id": 3,
+			"name": "玄幻小说",
+			"channel": "男频",
+			"childs": [
+				{
+					"id": 13,
+					"name": "东方玄幻"
+				},
+				{
+					"id": 14,
+					"name": "异界大陆"
+				},
+				{
+					"id": 15,
+					"name": "西方奇幻"
+				},
+				{
+					"id": 16,
+					"name": "异术超能"
+				},
+				{
+					"id": 33,
+					"name": "转世重生"
+				}
+			]
+		},
+		{
+			"id": 4,
+			"name": "历史军事",
+			"channel": "男频",
+			"childs": [
+				{
+					"id": 19,
+					"name": "架空历史"
+				},
+				{
+					"id": 20,
+					"name": "历史传记"
+				},
+				{
+					"id": 21,
+					"name": "论古谈今"
+				},
+				{
+					"id": 22,
+					"name": "军事战争"
+				},
+				{
+					"id": 23,
+					"name": "军旅生活"
+				},
+				{
+					"id": 24,
+					"name": "抗战烽火"
+				}
+			]
+		},
+		{
+			"id": 5,
+			"name": "科幻灵异",
+			"channel": "男频",
+			"childs": [
+				{
+					"id": 25,
+					"name": "未来世界"
+				},
+				{
+					"id": 26,
+					"name": "星际战争"
+				},
+				{
+					"id": 27,
+					"name": "古武机甲"
+				},
+				{
+					"id": 28,
+					"name": "灵异奇谈"
+				},
+				{
+					"id": 29,
+					"name": "恐怖惊悚"
+				},
+				{
+					"id": 30,
+					"name": "悬疑探险"
+				},
+				{
+					"id": 35,
+					"name": "侦探推理"
+				},
+				{
+					"id": 36,
+					"name": "末日危临"
+				}
+			]
+		},
+		{
+			"id": 6,
+			"name": "武侠仙侠",
+			"channel": "男频",
+			"childs": [
+				{
+					"id": 17,
+					"name": "奇幻修真"
+				},
+				{
+					"id": 18,
+					"name": "古典仙侠"
+				},
+				{
+					"id": 34,
+					"name": "经典武侠"
+				}
+			]
+		},
+		{
+			"id": 7,
+			"name": "综合其他",
+			"channel": "男频",
+			"childs": [
+				{
+					"id": 31,
+					"name": "其他类别"
+				},
+				{
+					"id": 37,
+					"name": "电子竞技"
+				},
+				{
+					"id": 38,
+					"name": "虚拟网游"
+				},
+				{
+					"id": 39,
+					"name": "体育竞技"
+				}
+			]
+		}
+	]
+
+
+res = []
+for i in cate:
+    # channel "男频" (male channel) maps to channel_id 1, "女频" (female) to 2
+    channel_id = 1 if i['channel'] == '男频' else 2
+    for j in i['childs']:
+        res.append({
+            'id': j['id'], 'name': j['name'], 'my_category_id': 0, 'ncategory_id': 0,
+            'my_category_name': '古代言情', 'channel_id': channel_id
+        })
+
+print(res)
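+# Example of one resulting entry (illustrative):
+# {'id': 1, 'name': '爱情婚姻', 'my_category_id': 0, 'ncategory_id': 0,
+#  'my_category_name': '古代言情', 'channel_id': 1}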

+ 11 - 0
scrapy.cfg

@@ -0,0 +1,11 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.io/en/latest/deploy.html
+
+[settings]
+default = content_spider.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = content_spider
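+# To deploy to a scrapyd instance, uncomment the url line above and run
+# scrapyd-deploy (provided by the scrapyd-client package), assuming a scrapyd
+# server is listening at that address.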