# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
# from itemadapter import ItemAdapter
import os
import re

from content_spider.mysqlHelper import MysqlHelper
from content_spider.Util import my_log


def formatcontent(content):
    # Normalize scraped chapter text: drop spaces, collapse blank lines, and
    # re-join everything with CRLF. The two search strings below contained
    # raw line breaks in the original source; they are reconstructed here as
    # a CRLF pair and an LF pair (an assumption, based on the CRLF joins used
    # elsewhere in this module).
    content = content.replace(' ', '')
    content = content.replace('\r\n\r\n', '')
    content = content.replace('\n\n', '\r\n')
    lines = content.splitlines()
    lines = map(lambda s: s.strip(), lines)
    lines = filter(lambda s: s != '', lines)
    content = '\r\n'.join(lines)
    return content.strip()


def removePunctuation(text):
    # Replace ASCII and full-width punctuation with spaces and strip all
    # line breaks, so len() of the result approximates the character count.
    punctuation = '!,;:?"\'、,;!”“。?,'
    text = re.sub('[{}]+'.format(re.escape(punctuation)), ' ', text)
    return text.strip().replace('\r\n', '').replace('\n', '').replace('\r', '')


class ContentSpiderPipeline:
    def process_item(self, item, spider):
        return item


class ChapterItemPipeline:
    def __init__(self, host, user, password, db, source, stats, settings):
        self.mysqlHelper = MysqlHelper(host=host, user=user, password=password,
                                       db=db, source=source, source_id=0)
        self.__stats = stats

    @classmethod
    def from_crawler(cls, crawler):
        stats = crawler.stats
        return cls(
            host=crawler.settings.get('MYSQL_HOST'),
            user=crawler.settings.get('MYSQL_USER'),
            password=crawler.settings.get('MYSQL_PWD'),
            db=crawler.settings.get('MYSQL_DB'),
            source=crawler.settings.get('SOURCE'),
            stats=stats,
            settings=crawler.settings
        )

    def process_item(self, item, spider):
        if item.get('content') != "":
            content = formatcontent(item['content'])
            item['content'] = content
            # Chapter size is the punctuation- and newline-free character count.
            item['size'] = len(removePunctuation(content))
            self.mysqlHelper.insert_chapter(item)
        return item  # keep the item flowing to any later pipelines

    def close_spider(self, spider):
        # Post-run bookkeeping goes through the spider's own MysqlHelper and
        # depends on the run type recorded in the crawler stats.
        spider_type = self.__stats.get_value('spider_type')
        if spider_type == 'update':
            my_log(spider.name, 'update end ....')
            book_list = self.__stats.get_value('bid_list')
            if book_list is not None:
                for book in book_list:
                    spider.mysqlHelper.after_spider(book['bid'], book['start_sequence'])
        elif spider_type == 'add':
            my_log(spider.name, 'crawl end ....')
            bid_list = self.__stats.get_value('bid_list')
            if bid_list is not None:
                for bid in bid_list:
                    spider.mysqlHelper.after_spider(bid, 1)
        elif spider_type == 'fix':
            bid_list = self.__stats.get_value('bid_list')
            if bid_list is not None:
                for book in bid_list:
                    spider.mysqlHelper.after_fix_delete_unnecessary(book['bid'], book['end'])
                    spider.mysqlHelper.after_spider(book['bid'], 1)