# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#
# useful for handling different item types with a single interface
# from itemadapter import ItemAdapter
import re

from content_spider.mysqlHelper import MysqlHelper
from content_spider.Util import my_log

def formatcontent(content):
    """Strip <p> markup and blank lines from a chapter body, normalising to CRLF."""
    content = content.replace(' ', '')
    content = content.replace('<p>', '')
    content = content.replace('</p>', '\r\n')
    lines = content.splitlines()
    lines = map(lambda s: s.strip(), lines)
    lines = filter(lambda s: s != '', lines)
    return '\r\n'.join(lines).strip()
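
# A quick, hedged sanity check of formatcontent (the <p>-wrapped input shape is
# inferred from the replacements above; the sample text is illustrative):
#
#   formatcontent('<p> 第一章 </p><p></p><p>正文内容</p>')
#   -> '第一章\r\n正文内容'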

def removePunctuation(text):
    """Replace ASCII/Chinese punctuation with spaces and drop line breaks."""
    punctuation = '!,;:?"\'、,;!”“。?,'
    # re.escape keeps the character class safe if the punctuation set changes.
    text = re.sub(r'[{}]+'.format(re.escape(punctuation)), ' ', text)
    return text.strip().replace('\r\n', '').replace('\n', '').replace('\r', '')
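
# Likewise for removePunctuation, which feeds the chapter 'size' count in
# ChapterItemPipeline.process_item below (sample text is illustrative):
#
#   removePunctuation('你好,世界!\r\n再见。')
#   -> '你好 世界 再见'   # punctuation becomes spaces, line breaks are dropped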

class ContentSpiderPipeline:
    def process_item(self, item, spider):
        return item

class ChapterItemPipeline:
    def __init__(self, host, user, password, db, source, stats, settings):
        self.mysqlHelper = MysqlHelper(host=host, user=user, password=password,
                                       db=db, source=source, source_id=0)
        self.__stats = stats
        self.settings = settings

    @classmethod
    def from_crawler(cls, crawler):
        # Pull MySQL connection details and the stats collector off the crawler.
        return cls(
            host=crawler.settings.get('MYSQL_HOST'),
            user=crawler.settings.get('MYSQL_USER'),
            password=crawler.settings.get('MYSQL_PWD'),
            db=crawler.settings.get('MYSQL_DB'),
            source=crawler.settings.get('SOURCE'),
            stats=crawler.stats,
            settings=crawler.settings,
        )
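
    # The keys queried in from_crawler must be defined in the project's
    # settings.py. A minimal sketch (key names come from the calls above;
    # the values here are placeholders, not the project's real ones):
    #
    #   MYSQL_HOST = '127.0.0.1'
    #   MYSQL_USER = 'spider'
    #   MYSQL_PWD = 'secret'
    #   MYSQL_DB = 'content'
    #   SOURCE = 'example_source'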

    def process_item(self, item, spider):
        # Normalise the chapter body, record its length, then persist it.
        if item.get('content'):
            content = formatcontent(item['content'])
            item['content'] = content
            # 'size' counts characters after punctuation/line breaks are stripped.
            item['size'] = len(removePunctuation(content))
            self.mysqlHelper.insert_chapter(item)
        # Pipelines must return the item so later pipelines still receive it.
        return item

    def close_spider(self, spider):
        # Post-crawl bookkeeping; the branch taken depends on how the
        # spider was launched ('update', 'add', or 'fix').
        spider_type = self.__stats.get_value('spider_type')
        if spider_type == 'update':
            my_log(spider.name, 'update end ....')
            book_list = self.__stats.get_value('bid_list')
            if book_list is not None:
                for book in book_list:
                    spider.mysqlHelper.after_spider(book['bid'], book['start_sequence'])
        elif spider_type == 'add':
            my_log(spider.name, 'crawl end ....')
            bid_list = self.__stats.get_value('bid_list')
            if bid_list is not None:
                for bid in bid_list:
                    spider.mysqlHelper.after_spider(bid, 1)
        elif spider_type == 'fix':
            bid_list = self.__stats.get_value('bid_list')
            if bid_list is not None:
                for book in bid_list:
                    spider.mysqlHelper.after_fix_delete_unnecessary(book['bid'], book['end'])
                    spider.mysqlHelper.after_spider(book['bid'], 1)
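
# As the header comment notes, a pipeline only runs once it is registered in
# ITEM_PIPELINES. A sketch for settings.py (the dotted path assumes this file
# is content_spider/pipelines.py, which the imports above suggest):
#
#   ITEM_PIPELINES = {
#       'content_spider.pipelines.ChapterItemPipeline': 300,
#   }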