|
@@ -0,0 +1,418 @@
|
|
|
|
+# -*- coding: utf-8 -*-
|
|
|
|
+
|
|
|
|
+import time
|
|
|
|
+import scrapy
|
|
|
|
+from ydyspider.items import BookInfoItem, ChapterItem
|
|
|
|
+from ydyspider.mysqlHelper import MysqlHelper
|
|
|
|
+import hashlib
|
|
|
|
+import random
|
|
|
|
+
|
|
|
|
+from ydyspider.pipelines import formatcontent, removePunctuation
|
|
|
|
+
|
|
|
|
+
|
|
|
|
def md5(token):
    """Return the hexadecimal MD5 digest of the string *token*."""
    digest = hashlib.md5(token.encode('utf-8'))
    return digest.hexdigest()
|
|
|
|
+
|
|
|
|
+
|
|
|
|
def sha1(token):
    """Return the hexadecimal SHA-1 digest of the string *token*."""
    digest = hashlib.sha1(token.encode('utf-8'))
    return digest.hexdigest()
|
|
|
|
+
|
|
|
|
+
|
|
|
|
def random_str(slen=10):
    """Return a random alphanumeric string of length *slen*.

    Characters are drawn (with replacement) from digits and ASCII
    letters of both cases. Not suitable for security tokens (uses
    the non-cryptographic ``random`` module).
    """
    seed = "1234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
    return ''.join(random.choice(seed) for _ in range(slen))
|
|
|
|
+
|
|
|
|
+
|
|
|
|
class baseSpider(scrapy.Spider):
    """Base "add" spider: crawls a source site's book list and, for every
    book not yet stored, inserts the book row and crawls all chapters.

    Subclasses must set ``name``/``source`` and implement the abstract
    ``get_*_url`` / ``*_result`` hooks defined at the bottom of the class.
    """
    name = ''
    allowed_domains = []
    base_url = ''
    source = ''
    source_name = ''

    def __init__(self, host, user, password, db, stats):
        scrapy.Spider.__init__(self)
        source = self.source
        self.mysqlHelper = MysqlHelper(host=host, user=user, password=password, db=db, source=source)
        self.__stats = stats
        self.__source = self.source
        # ids of books inserted during this run; read by downstream tooling
        self.__stats.set_value('bid_list', [])
        self.__stats.set_value('spider_type', 'add')

    @classmethod
    def from_crawler(cls, crawler):
        """Build the spider from crawler settings (MySQL connection info)."""
        settings = crawler.settings
        host = settings.get('MYSQL_HOST')
        user = settings.get('MYSQL_USER')
        password = settings.get('MYSQL_PWD')
        db = settings.get('MYSQL_DB')
        return cls(host=host, user=user, password=password, db=db, stats=crawler.stats)

    def start_requests(self):
        yield scrapy.Request(self.get_start_url(), callback=self.parse_book_list)

    def parse_book_list(self, response):
        """Request the info page of every source book not yet in the DB."""
        result = self.bid_list_result(response)
        for item in result:
            bid = item['id']
            # BUGFIX: bind the DB lookup to its own name instead of
            # rebinding `result`, which shadowed the list being iterated.
            existing = self.mysqlHelper.get_book_info_by_source(bid)
            if existing is not None:
                continue
            url = self.get_book_info_url(bid)
            yield scrapy.Request(url, callback=self.parse_book_info)

    def parse_book_info(self, response):
        """Insert the book row, then request its chapter list."""
        if response.text == '':
            return None
        now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        result = self.book_info_result(response)
        if result is None:
            return None
        source_bid = result.get('bid')
        book_info_item = BookInfoItem()
        book_info_item['source_bid'] = source_bid
        book_info_item['name'] = result['name']
        book_info_item['author'] = result['author']
        book_info_item['intro'] = result['intro']
        book_info_item['cover'] = result['cover']
        book_info_item['keyword'] = result['keyword']
        book_info_item['category_id'] = 0 if result.get('category_id') is None else result.get('category_id')
        book_info_item['status'] = result['status']
        # chapter counters/pointers start at zero; filled in as chapters arrive
        book_info_item['chapter_count'] = 0
        book_info_item['first_cid'] = 0
        book_info_item['last_cid'] = 0
        book_info_item['size'] = 0
        book_info_item['last_chapter'] = ''
        book_info_item['category_name'] = result['category']
        book_info_item['source'] = self.source
        book_info_item['updated_at'] = now
        book_info_item['created_at'] = now
        bid = self.mysqlHelper.insert_book(book_info_item)
        self.__stats.get_value('bid_list').append(bid)
        url = self.get_chapter_list_url(source_bid)
        meta = {'bid': bid, 'source_bid': source_bid}
        yield scrapy.Request(url, self.parse_chapter_list, meta=meta)

    def parse_chapter_list(self, response):
        """Request the content page of every chapter in the list."""
        if response.text == '':
            return None
        result = self.chapter_list_result(response)
        if result is None:
            return None
        bid = response.meta['bid']
        source_bid = response.meta['source_bid']
        for chapter_item in result:
            # the chapter dict itself is passed on as request meta
            meta = chapter_item
            cid = chapter_item['source_chapter_id']
            meta['bid'] = bid
            url = self.get_chapter_content_url(source_bid, cid)
            yield scrapy.Request(url, self.parse_chapter_content, meta=meta)

    def parse_chapter_content(self, response):
        """Build and yield a ChapterItem from a chapter content page."""
        if response.text == '':
            return None
        now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        result = self.chapter_content_result(response)
        meta = response.meta
        chapter_item = ChapterItem()
        chapter_item['bid'] = meta['bid']
        chapter_item['name'] = meta['name']
        chapter_item['sequence'] = meta['sequence']
        chapter_item['is_vip'] = meta['is_vip']
        chapter_item['prev_cid'] = 0
        chapter_item['next_cid'] = 0
        chapter_item['recent_update_at'] = meta['recent_update_at']
        content = formatcontent(result['content'])
        chapter_item['content'] = content
        # size counts characters with punctuation stripped
        chapter_item['size'] = len(removePunctuation(content))
        chapter_item['chapter_content_id'] = 0
        chapter_item['source_chapter_id'] = meta['source_chapter_id']
        chapter_item['created_at'] = now
        chapter_item['updated_at'] = now
        # values parsed from the content page override the list-page meta
        if result.get('size') is not None:
            chapter_item['size'] = result.get('size')
        if result.get('is_vip') is not None:
            chapter_item['is_vip'] = result.get('is_vip')
        if result.get('name') is not None:
            chapter_item['name'] = result.get('name')
        if result.get('recent_update_at') is not None:
            chapter_item['recent_update_at'] = result.get('recent_update_at')
        if result.get('source_chapter_id') is not None:
            chapter_item['source_chapter_id'] = result.get('source_chapter_id')
        yield chapter_item

    def get_start_url(self):
        """Return the URL of the source site's book-list page."""
        raise NotImplementedError

    def bid_list_result(self, response):
        """Parse the book-list response into an iterable of {'id': source_bid}."""
        raise NotImplementedError

    def get_book_info_url(self, bid):
        """Return the book-info URL for a source book id."""
        raise NotImplementedError

    def book_info_result(self, response):
        """Parse the book-info response into a dict of book fields."""
        raise NotImplementedError

    def get_chapter_list_url(self, bid):
        """Return the chapter-list URL for a source book id."""
        raise NotImplementedError

    def chapter_list_result(self, response):
        """Parse the chapter-list response into a list of chapter dicts."""
        raise NotImplementedError

    def get_chapter_content_url(self, bid, cid):
        """Return the chapter-content URL for a source book/chapter id."""
        raise NotImplementedError

    def chapter_content_result(self, response):
        """Parse the chapter-content response into a dict with 'content'."""
        raise NotImplementedError
|
|
|
|
+
|
|
|
|
+
|
|
|
|
class baseUpdateSpider(scrapy.Spider):
    """Base "update" spider: for every book flagged as needing an update,
    crawls the chapter list and fetches only chapters that come after the
    last chapter already stored for that book.

    Subclasses must set ``name``/``source`` and implement the abstract
    hooks defined at the bottom of the class.
    """
    name = ''
    allowed_domains = []
    base_url = ''
    source = ''
    source_name = ''

    def __init__(self, host, user, password, db, stats):
        scrapy.Spider.__init__(self)
        source = self.source
        self.mysqlHelper = MysqlHelper(host=host, user=user, password=password, db=db, source=source)
        self.__stats = stats
        self.__stats.set_value('spider_type', 'update')
        # per-book markers of where the newly crawled chapters start
        self.__stats.set_value('bid_list', [])
        self.__is_first = True

    @classmethod
    def from_crawler(cls, crawler):
        """Build the spider from crawler settings (MySQL connection info)."""
        settings = crawler.settings
        host = settings.get('MYSQL_HOST')
        user = settings.get('MYSQL_USER')
        password = settings.get('MYSQL_PWD')
        db = settings.get('MYSQL_DB')
        return cls(host=host, user=user, password=password, db=db, stats=crawler.stats)

    def start_requests(self):
        book_list = self.mysqlHelper.get_need_update_book_list()
        if book_list is not None:
            for book in book_list:
                url = self.get_chapter_list_url(book['source_bid'])
                meta = {'bid': book['id'], 'source_bid': book['source_bid']}
                yield scrapy.Request(url, callback=self.parse_chapter_list, meta=meta)

    def parse_chapter_list(self, response):
        """Yield content requests only for chapters newer than the last
        chapter already stored for this book."""
        if response.text == '':
            return None
        chapter_list = self.chapter_list_result(response)
        meta = response.meta
        if chapter_list is not None:
            bid = response.meta.get('bid')
            source_bid = response.meta.get('source_bid')
            last_chapter = self.mysqlHelper.get_last_cid_by_bid(bid)
            start = False
            if last_chapter is None:
                # nothing stored yet: every chapter counts as new
                start = True
                last_source_cid = 0
                last_sequence = 0
                last_chapter_id = 0
            else:
                last_source_cid = last_chapter['source_chapter_id']
                last_sequence = last_chapter['sequence']
                last_chapter_id = last_chapter['id']

            has_new_chapter = False
            for chapter_item in chapter_list:
                if not start:
                    if int(chapter_item['source_chapter_id']) == int(last_source_cid):
                        start = True
                    # BUGFIX: skip every chapter up to AND including the last
                    # stored one. Previously `continue` only ran on the match,
                    # so earlier chapters fell through and were re-crawled.
                    continue
                if not has_new_chapter:
                    # record, once per book, where the new chapters start
                    self.__stats.get_value('bid_list').append(
                        {"bid": meta['bid'], 'start': last_chapter_id})
                    has_new_chapter = True
                cid = chapter_item['source_chapter_id']
                last_sequence = last_sequence + 1
                if chapter_item['sequence'] == 0:
                    # source gave no sequence; continue from the stored one
                    chapter_item['sequence'] = last_sequence
                meta = chapter_item
                meta['bid'] = bid
                url = self.get_chapter_content_url(source_bid, cid)
                yield scrapy.Request(url, callback=self.parse_chapter_content, meta=meta)

    def parse_chapter_content(self, response):
        """Build and yield a ChapterItem from a chapter content page."""
        if response.text == '':
            return None
        now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        result = self.chapter_content_result(response)
        meta = response.meta
        chapter_item = ChapterItem()
        chapter_item['bid'] = meta['bid']
        chapter_item['name'] = meta['name']
        chapter_item['sequence'] = meta['sequence']
        chapter_item['is_vip'] = meta['is_vip']
        chapter_item['prev_cid'] = 0
        chapter_item['next_cid'] = 0
        chapter_item['recent_update_at'] = meta['recent_update_at']
        content = formatcontent(result['content'])
        # size counts characters with punctuation stripped
        chapter_item['size'] = len(removePunctuation(content))
        chapter_item['content'] = content
        chapter_item['chapter_content_id'] = 0
        chapter_item['source_chapter_id'] = meta['source_chapter_id']
        chapter_item['created_at'] = now
        chapter_item['updated_at'] = now
        # values parsed from the content page override the list-page meta
        if result.get('is_vip') is not None:
            chapter_item['is_vip'] = result.get('is_vip')
        if result.get('name') is not None:
            chapter_item['name'] = result.get('name')
        if result.get('recent_update_at') is not None:
            chapter_item['recent_update_at'] = result.get('recent_update_at')
        if result.get('source_chapter_id') is not None:
            chapter_item['source_chapter_id'] = result.get('source_chapter_id')
        yield chapter_item

    def get_chapter_list_url(self, bid):
        """Return the chapter-list URL for a source book id."""
        raise NotImplementedError

    def chapter_list_result(self, response):
        """Parse the chapter-list response into a list of chapter dicts."""
        raise NotImplementedError

    def get_chapter_content_url(self, bid, cid):
        """Return the chapter-content URL for a source book/chapter id."""
        raise NotImplementedError

    def chapter_content_result(self, response):
        """Parse the chapter-content response into a dict with 'content'."""
        raise NotImplementedError
|
|
|
|
+
|
|
|
|
+
|
|
|
|
class fixChapterSpider(scrapy.Spider):
    """Repair spider: re-crawls all chapters of the books given via the
    spider argument ``bid`` (comma-separated ids), updating chapter rows
    that already exist (matched by sequence) and inserting missing ones.
    """
    name = ''
    source = ''

    def __init__(self, host, user, password, db, bid_list, stats):
        scrapy.Spider.__init__(self)
        source = self.source
        self.mysqlHelper = MysqlHelper(host=host, user=user, password=password, db=db, source=source)
        self.__stats = stats
        self.__is_first = True
        self.bid_list = bid_list

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Build the spider; ``-a bid=1,2,3`` selects the books to fix."""
        settings = crawler.settings
        host = settings.get('MYSQL_HOST')
        user = settings.get('MYSQL_USER')
        password = settings.get('MYSQL_PWD')
        db = settings.get('MYSQL_DB')
        bid = kwargs.get('bid')
        if bid is not None:
            bid_list = bid.split(',')
        else:
            bid_list = []
        return cls(host=host, user=user, password=password, db=db, bid_list=bid_list, stats=crawler.stats)

    def start_requests(self):
        if self.bid_list is None:
            yield
            return
        for book in self.bid_list:
            info = self.mysqlHelper.get_book_info_by_id(book)
            if info is None:
                continue
            url = self.get_chapter_list_url(info['source_bid'])
            meta = {'bid': book, 'source_bid': info['source_bid']}
            yield scrapy.Request(url, callback=self.parse_chapter_list, meta=meta)

    def parse_chapter_list(self, response):
        """Request content for every chapter; chapters that already exist
        (matched by sequence) are flagged for update instead of insert."""
        if response.text == '':
            return None
        chapter_list = self.chapter_list_result(response)
        if chapter_list is not None:
            bid = response.meta.get('bid')
            source_bid = response.meta.get('source_bid')
            last_sequence = 0
            for chapter_item in chapter_list:
                last_sequence = last_sequence + 1

                if chapter_item['sequence'] == 0:
                    # source gave no sequence; use the running position
                    chapter_item['sequence'] = last_sequence
                chapter_info = self.mysqlHelper.get_cid_by_bid_sequence(bid, last_sequence)
                cid = chapter_item['source_chapter_id']

                meta = chapter_item
                if chapter_info is not None:
                    # chapter row already exists -> take the update path
                    meta['type'] = 'update'
                    meta['chapter_content_id'] = chapter_info['chapter_content_id']
                    meta['cid'] = chapter_info['id']
                meta['bid'] = bid
                url = self.get_chapter_content_url(source_bid, cid)
                yield scrapy.Request(url, callback=self.parse_chapter_content, meta=meta)

    def parse_chapter_content(self, response):
        """Update the stored chapter row in place, or yield a new
        ChapterItem when the chapter is missing."""
        if response.text == '':
            return None
        now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        result = self.chapter_content_result(response)
        meta = response.meta
        content = formatcontent(result['content'])
        # size counts characters with punctuation stripped
        meta['size'] = len(removePunctuation(content))
        meta['content'] = content
        # values parsed from the content page override the list-page meta
        if result.get('size') is not None:
            meta['size'] = result.get('size')
        if result.get('is_vip') is not None:
            meta['is_vip'] = result.get('is_vip')
        if result.get('name') is not None:
            meta['name'] = result.get('name')
        if result.get('recent_update_at') is not None:
            meta['recent_update_at'] = result.get('recent_update_at')
        if result.get('source_chapter_id') is not None:
            meta['source_chapter_id'] = result.get('source_chapter_id')
        if meta.get('type') is not None:
            # BUGFIX: persist the formatted content, consistent with the
            # insert path and the other spiders (the raw result['content']
            # was previously written despite `content` being computed).
            self.mysqlHelper.update_content(meta['chapter_content_id'], meta['name'], content)
            self.mysqlHelper.update_chapter(meta)
        else:
            chapter_item = ChapterItem()
            chapter_item['bid'] = meta['bid']
            chapter_item['name'] = meta['name']
            chapter_item['sequence'] = meta['sequence']
            chapter_item['size'] = meta['size']
            chapter_item['is_vip'] = meta['is_vip']
            chapter_item['prev_cid'] = 0
            chapter_item['next_cid'] = 0
            chapter_item['recent_update_at'] = meta['recent_update_at']
            # BUGFIX: same consistency fix as the update branch above
            chapter_item['content'] = content
            if meta.get('chapter_content_id') is not None:
                chapter_item['chapter_content_id'] = meta['chapter_content_id']
            else:
                chapter_item['chapter_content_id'] = 0
            chapter_item['source_chapter_id'] = meta['source_chapter_id']
            chapter_item['created_at'] = now
            chapter_item['updated_at'] = now
            if result.get('size') is not None:
                chapter_item['size'] = result.get('size')
            if result.get('is_vip') is not None:
                chapter_item['is_vip'] = result.get('is_vip')
            if result.get('name') is not None:
                chapter_item['name'] = result.get('name')
            if result.get('recent_update_at') is not None:
                chapter_item['recent_update_at'] = result.get('recent_update_at')
            if result.get('source_chapter_id') is not None:
                chapter_item['source_chapter_id'] = result.get('source_chapter_id')
            yield chapter_item

    def get_chapter_list_url(self, bid):
        """Return the chapter-list URL for a source book id."""
        raise NotImplementedError

    def chapter_list_result(self, response):
        """Parse the chapter-list response into a list of chapter dicts."""
        raise NotImplementedError

    def get_chapter_content_url(self, bid, cid):
        """Return the chapter-content URL for a source book/chapter id."""
        raise NotImplementedError

    def chapter_content_result(self, response):
        """Parse the chapter-content response into a dict with 'content'."""
        raise NotImplementedError
|