pipelines.py

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
# from itemadapter import ItemAdapter
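# Pipelines are enabled in settings.py; a typical registration for this
# project might look like the following (the priority value is illustrative):
#
# ITEM_PIPELINES = {
#     'content_spider.pipelines.ChapterItemPipeline': 300,
# }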
import re

import redis

from content_spider.mysqlHelper import MysqlHelper
from content_spider.Util import my_log

# Redis hash keys used as run-status flags (one field per spider name)
redis_crawl_flag_key = 'book:crawl:stats'
redis_update_flag_key = 'book:update:stats'
redis_fix_flag_key = 'book:fix:stats'
def formatcontent(content):
    # Strip spaces and <p>/</p> markup, then normalize every line break
    # to \r\n and drop empty lines
    content = content.replace(' ', '')
    content = content.replace('<p>', '')
    content = content.replace('</p>', '\r\n')
    content = content.splitlines()
    content = map(lambda s: s.strip(), content)
    content = filter(lambda s: s != '', content)
    content = '\r\n'.join(content)
    return content.strip()
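
# A quick sanity check of the behavior above (illustrative input):
# >>> formatcontent('<p> foo </p><p>bar</p>')
# 'foo\r\nbar'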
def removePunctuation(text):
    # Replace common ASCII and full-width punctuation with spaces, then
    # drop all line breaks so only countable characters remain
    punctuation = '!,;:?"\'、,;!”“。?,'
    text = re.sub(r'[{}]+'.format(punctuation), ' ', text)
    return text.strip().replace('\r\n', '').replace('\n', '').replace('\r', '')
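
# Used below to compute a chapter's character count; for example:
# >>> removePunctuation('Hello, world!')
# 'Hello  world'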
class ContentSpiderPipeline:
    # Default project pipeline; passes items through unchanged
    def process_item(self, item, spider):
        return item
class ChapterItemPipeline:
    def __init__(self, host, user, password, db, source, stats, settings):
        self.mysqlHelper = MysqlHelper(host=host, user=user, password=password, db=db, source=source, source_id=0)
        self.__stats = stats
        # Connection for the run-status flags used in close_spider(); the
        # REDIS_HOST/REDIS_PORT setting names are assumed here — adjust
        # them to whatever this project's settings.py actually defines
        self.__redis_conn = redis.Redis(
            host=settings.get('REDIS_HOST', 'localhost'),
            port=settings.getint('REDIS_PORT', 6379),
        )

    @classmethod
    def from_crawler(cls, crawler):
        stats = crawler.stats
        return cls(
            host=crawler.settings.get('MYSQL_HOST'),
            user=crawler.settings.get('MYSQL_USER'),
            password=crawler.settings.get('MYSQL_PWD'),
            db=crawler.settings.get('MYSQL_DB'),
            source=crawler.settings.get('SOURCE'),
            stats=stats,
            settings=crawler.settings,
        )
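
    # The settings read above are expected in settings.py; the values
    # below are purely illustrative:
    #
    # MYSQL_HOST = '127.0.0.1'
    # MYSQL_USER = 'spider'
    # MYSQL_PWD = 'change-me'
    # MYSQL_DB = 'book'
    # SOURCE = 'example_source'
    # REDIS_HOST = '127.0.0.1'   # assumed name, see __init__ above
    # REDIS_PORT = 6379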
    def process_item(self, item, spider):
        if item.get('content'):
            content = formatcontent(item['content'])
            item['content'] = content
            # size counts characters excluding punctuation and line breaks
            item['size'] = len(removePunctuation(content))
            self.mysqlHelper.insert_chapter(item)
        return item
    def close_spider(self, spider):
        spider_type = self.__stats.get_value('spider_type')
        if spider_type == 'update':
            # reset the update flag so a new run can be scheduled
            self.__redis_conn.hset(redis_update_flag_key, spider.name, 0)
            my_log(spider.name, 'update end ....')
            book_list = self.__stats.get_value('bid_list')
            if book_list is not None:
                for book in book_list:
                    spider.mysqlHelper.after_spider(book['bid'], book['start_sequence'])
        if spider_type == 'add':
            self.__redis_conn.hset(redis_crawl_flag_key, spider.name, 0)
            my_log(spider.name, 'crawl end ....')
            bid_list = self.__stats.get_value('bid_list')
            if bid_list is not None:
                for bid in bid_list:
                    spider.mysqlHelper.after_spider(bid, 1)
        if spider_type == 'fix':
            # redis_fix_flag_key is defined above but was never reset;
            # clearing it here mirrors the other branches (assumed intent)
            self.__redis_conn.hset(redis_fix_flag_key, spider.name, 0)
            bid_list = self.__stats.get_value('bid_list')
            if bid_list is not None:
                for book in bid_list:
                    # drop chapters beyond the fixed range, then refresh book metadata
                    spider.mysqlHelper.after_fix_delete_unnecessary(book['bid'], book['end'])
                    spider.mysqlHelper.after_spider(book['bid'], 1)
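
# Downstream consumers can poll these flags to tell whether a run has
# finished, e.g. (illustrative, assuming the same Redis instance):
# >>> redis.Redis().hget(redis_crawl_flag_key, 'my_spider')
# b'0'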