import time

import scrapy

from content_spider.mysqlHelper import MysqlHelper
from content_spider.items import BookInfoItem, ChapterItem
from content_spider.pipelines import formatcontent, removePunctuation


class baseSpider(scrapy.Spider):
    """Full-crawl spider: walks the source's book list, inserts books that
    are not yet in the library, then crawls their chapter lists and chapter
    content."""

    name = ''
    source = ''
    source_name = ''
    source_id = 0
    custom_settings = {
        'DOWNLOAD_DELAY': 0.01,
        # NOTE: evaluated at class-definition time, so SOURCE is the base
        # class's empty string; subclasses that rely on SOURCE must override
        # custom_settings as well as `source`.
        'SOURCE': source,
    }

    def __init__(self, host, user, password, db, stats, settings):
        scrapy.Spider.__init__(self)
        self.mysqlHelper = MysqlHelper(host=host, user=user, password=password, db=db,
                                       source=self.source, source_id=self.source_id)
        self.__stats = stats
        self.__stats.set_value('spider_type', 'add')
        self.__stats.set_value('bid_list', [])
        self.__is_first = True

    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        host = settings.get('MYSQL_HOST')
        user = settings.get('MYSQL_USER')
        password = settings.get('MYSQL_PWD')
        db = settings.get('MYSQL_DB')
        return cls(host=host, user=user, password=password, db=db,
                   stats=crawler.stats, settings=settings)

    def start_requests(self):
        yield scrapy.Request(self.get_start_url(), callback=self.parse_book_list)

    def parse_book_list(self, response):
        for item in self.bid_list_result(response):
            bid = item['id']
            # Skip books that have already been collected from this source.
            existing = self.mysqlHelper.get_book_info_by_source(bid)
            if existing is not None:
                continue
            url = self.get_book_info_url(bid)
            yield scrapy.Request(url, callback=self.parse_book_info)

    def parse_book_info(self, response):
        if response.text == '':
            return None
        now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        result = self.book_info_result(response)
        if result is None:
            return None
        category_id = 0 if result.get('category_id') is None else result.get('category_id')
        book_info_item = BookInfoItem()
        source_bid = result.get('bid')
        book_info_item['cp_bid'] = source_bid
        book_info_item['cp_name'] = self.source_name
        book_info_item['cp_id'] = self.source_id
        book_info_item['name'] = result['name']
        book_info_item['author'] = result['author']
        book_info_item['intro'] = result['intro']
        book_info_item['cover'] = result['cover']
        book_info_item['keyword'] = result['keyword']
        book_info_item['category_id'] = category_id
        book_info_item['status'] = result['status']
        book_info_item['size'] = 0
        book_info_item['category_name'] = result['category']
        book_info_item['last_chapter'] = ''
        book_info_item['chapter_count'] = 0
        book_info_item['first_cid'] = 0
        book_info_item['last_cid'] = 0
        book_info_item['channel'] = result['channel']
        book_info_item['updated_at'] = now
        book_info_item['created_at'] = now
        bid = self.mysqlHelper.insert_book(book_info_item)
        if self.__is_first:
            self.__stats.set_value('bid_start', bid)
            self.__is_first = False
        self.__stats.get_value('bid_list').append(bid)
        url = self.get_chapter_list_url(source_bid)
        meta = {'bid': bid, 'source_bid': source_bid}
        yield scrapy.Request(url, callback=self.parse_chapter_list, meta=meta)

    def parse_chapter_list(self, response):
        if response.text == '':
            return None
        bid = response.meta['bid']
        source_bid = response.meta['source_bid']
        for chapter_item in self.chapter_list_result(response):
            cid = chapter_item['source_chapter_id']
            meta = chapter_item
            meta['bid'] = bid
            url = self.get_chapter_content_url(source_bid, cid)
            yield scrapy.Request(url, callback=self.parse_chapter_content, meta=meta)

    def parse_chapter_content(self, response):
        if response.text == '':
            return None
        now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        result = self.chapter_content_result(response)
        meta = response.meta
        chapter_item = ChapterItem()
        chapter_item['bid'] = meta['bid']
        chapter_item['name'] = meta['name']
        chapter_item['sequence'] = meta['sequence']
        chapter_item['size'] = meta['size']
        chapter_item['is_vip'] = meta['is_vip']
        chapter_item['source_chapter_id'] = meta['source_chapter_id']
        chapter_item['recent_update_at'] = meta['recent_update_at']
        chapter_item['chapter_content_id'] = 0
        chapter_item['content'] = formatcontent(result['content'])
        # Values parsed from the content page override the values carried
        # over from the chapter list.
        if result.get('size') is not None:
            chapter_item['size'] = result.get('size')
        if result.get('is_vip') is not None:
            chapter_item['is_vip'] = result.get('is_vip')
        if result.get('name') is not None:
            chapter_item['name'] = result.get('name')
        if result.get('recent_update_at') is not None:
            chapter_item['recent_update_at'] = result.get('recent_update_at')
        if result.get('source_chapter_id') is not None:
            chapter_item['source_chapter_id'] = result.get('source_chapter_id')
        chapter_item['prev_cid'] = 0
        chapter_item['next_cid'] = 0
        chapter_item['updated_at'] = now
        chapter_item['created_at'] = now
        yield chapter_item

    # Hooks every concrete source spider must implement.
    def get_start_url(self):
        raise NotImplementedError

    def bid_list_result(self, response):
        raise NotImplementedError

    def get_book_info_url(self, bid):
        raise NotImplementedError

    def book_info_result(self, response):
        raise NotImplementedError

    def get_chapter_list_url(self, bid):
        raise NotImplementedError

    def chapter_list_result(self, response):
        raise NotImplementedError

    def get_chapter_content_url(self, bid, cid):
        raise NotImplementedError

    def chapter_content_result(self, response):
        raise NotImplementedError
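
# ---------------------------------------------------------------------------
# Illustrative sketch only: a minimal concrete subclass showing how the
# abstract hooks of baseSpider fit together. The spider name, the
# api.example.com URLs and the JSON field names below are assumptions made
# up for demonstration; they do not describe any real source.
# ---------------------------------------------------------------------------
import json  # used only by the example sketch below


class exampleSpider(baseSpider):
    name = 'example'                 # hypothetical spider name
    source = 'example'               # hypothetical source key
    source_name = 'Example source'
    source_id = 999                  # hypothetical source id

    def get_start_url(self):
        return 'https://api.example.com/books'  # assumed endpoint

    def bid_list_result(self, response):
        # Must return an iterable of dicts, each with an 'id' key.
        return json.loads(response.text)['data']

    def get_book_info_url(self, bid):
        return 'https://api.example.com/book/%s' % bid

    def book_info_result(self, response):
        # Assumes the API returns the fields parse_book_info reads:
        # bid, name, author, intro, cover, keyword, status, category, channel.
        return json.loads(response.text)

    def get_chapter_list_url(self, bid):
        return 'https://api.example.com/book/%s/chapters' % bid

    def chapter_list_result(self, response):
        # Each dict needs: source_chapter_id, name, sequence, size, is_vip,
        # recent_update_at.
        return json.loads(response.text)['chapters']

    def get_chapter_content_url(self, bid, cid):
        return 'https://api.example.com/book/%s/chapter/%s' % (bid, cid)

    def chapter_content_result(self, response):
        # Must contain at least 'content'; the optional keys above override
        # the chapter-list values.
        return json.loads(response.text)
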
class baseUpdateSpider(scrapy.Spider):
    """Incremental spider: for books already in the library, fetches the
    source chapter list and crawls only the chapters newer than the last
    one stored locally."""

    name = ''
    source = ''
    source_id = 0
    custom_settings = {
        'DOWNLOAD_DELAY': 0.01,
        'SOURCE': source,
    }

    def __init__(self, host, user, password, db, stats, settings):
        scrapy.Spider.__init__(self)
        self.mysqlHelper = MysqlHelper(host=host, user=user, password=password, db=db,
                                       source=self.source, source_id=self.source_id)
        self.__stats = stats
        self.__stats.set_value('spider_type', 'update')
        self.__stats.set_value('bid_list', [])
        self.__is_first = True

    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        host = settings.get('MYSQL_HOST')
        user = settings.get('MYSQL_USER')
        password = settings.get('MYSQL_PWD')
        db = settings.get('MYSQL_DB')
        return cls(host=host, user=user, password=password, db=db,
                   stats=crawler.stats, settings=settings)

    def start_requests(self):
        book_list = self.mysqlHelper.get_need_update_book_list()
        if book_list is not None:
            for book in book_list:
                url = self.get_chapter_list_url(book['cp_bid'])
                meta = {'bid': book['id'], 'cp_bid': book['cp_bid']}
                yield scrapy.Request(url, callback=self.parse_chapter_list, meta=meta)

    def parse_chapter_list(self, response):
        if response.text == '':
            return None
        chapter_list = self.chapter_list_result(response)
        if chapter_list is None:
            return None
        bid = response.meta.get('bid')
        cp_bid = response.meta.get('cp_bid')
        last_chapter = self.mysqlHelper.get_last_cid_by_bid(bid)
        # With no chapter stored yet, every chapter in the list is new;
        # otherwise skip forward until the last stored chapter is found.
        start = False
        if last_chapter is None:
            start = True
            last_source_cid = ''
            last_sequence = 0
            last_chapter_id = 0
        else:
            last_source_cid = str(last_chapter['source_chapter_id'])
            last_sequence = last_chapter['sequence']
            last_chapter_id = last_chapter['id']
        has_new_chapter = False
        for chapter_item in chapter_list:
            if not start:
                # Prefer matching by source chapter id; fall back to sequence.
                if len(last_source_cid) > 0:
                    if str(chapter_item['source_chapter_id']) == str(last_source_cid):
                        start = True
                else:
                    if int(chapter_item['sequence']) == last_sequence:
                        start = True
                continue
            if not has_new_chapter:
                # Record, once per book, where the new chapters begin.
                self.__stats.get_value('bid_list').append(
                    {'bid': bid, 'start': last_chapter_id, 'start_sequence': last_sequence})
                has_new_chapter = True
            cid = chapter_item['source_chapter_id']
            last_sequence = last_sequence + 1
            meta = chapter_item
            meta['bid'] = bid
            url = self.get_chapter_content_url(cp_bid, cid)
            yield scrapy.Request(url, callback=self.parse_chapter_content, meta=meta)

    def parse_chapter_content(self, response):
        if response.text == '':
            return None
        now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        result = self.chapter_content_result(response)
        meta = response.meta
        chapter_item = ChapterItem()
        chapter_item['bid'] = meta['bid']
        chapter_item['name'] = meta['name']
        chapter_item['sequence'] = meta['sequence']
        chapter_item['size'] = meta['size']
        chapter_item['is_vip'] = meta['is_vip']
        chapter_item['recent_update_at'] = meta['recent_update_at']
        chapter_item['source_chapter_id'] = meta['source_chapter_id']
        chapter_item['chapter_content_id'] = 0
        chapter_item['content'] = formatcontent(result['content'])
        # Values parsed from the content page override the values carried
        # over from the chapter list.
        if result.get('size') is not None:
            chapter_item['size'] = result.get('size')
        if result.get('is_vip') is not None:
            chapter_item['is_vip'] = result.get('is_vip')
        if result.get('name') is not None:
            chapter_item['name'] = result.get('name')
        if result.get('recent_update_at') is not None:
            chapter_item['recent_update_at'] = result.get('recent_update_at')
        if result.get('source_chapter_id') is not None:
            chapter_item['source_chapter_id'] = result.get('source_chapter_id')
        chapter_item['prev_cid'] = 0
        chapter_item['next_cid'] = 0
        chapter_item['updated_at'] = now
        chapter_item['created_at'] = now
        yield chapter_item

    # Hooks every concrete source spider must implement.
    def get_chapter_list_url(self, bid):
        raise NotImplementedError

    def chapter_list_result(self, response):
        raise NotImplementedError

    def get_chapter_content_url(self, bid, cid):
        raise NotImplementedError

    def chapter_content_result(self, response):
        raise NotImplementedError
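
# Illustrative sketch: the resume rule used by baseUpdateSpider.parse_chapter_list,
# extracted as a standalone generator for clarity. The spiders above inline this
# logic rather than calling it; the function and its names exist only to document
# the behaviour. Given the source chapter list and the last chapter stored
# locally, it yields only the chapters after the match point.
def _new_chapters(chapter_list, last_chapter):
    start = last_chapter is None
    last_source_cid = '' if last_chapter is None else str(last_chapter['source_chapter_id'])
    last_sequence = 0 if last_chapter is None else last_chapter['sequence']
    for chapter in chapter_list:
        if not start:
            # Match by source chapter id when available, else by sequence;
            # the matching chapter itself is skipped, not re-crawled.
            if last_source_cid:
                if str(chapter['source_chapter_id']) == last_source_cid:
                    start = True
            elif int(chapter['sequence']) == last_sequence:
                start = True
            continue
        yield chapter
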
class fixChapterSpider(scrapy.Spider):
    """Repair spider: re-crawls the full chapter list of the given books,
    updating chapters that already exist locally and inserting the missing
    ones."""

    name = ''
    source = ''
    source_id = 0
    custom_settings = {
        'DOWNLOAD_DELAY': 0.01,
        'SOURCE': source,
    }

    def __init__(self, host, user, password, db, bid_list, stats, settings):
        scrapy.Spider.__init__(self)
        self.mysqlHelper = MysqlHelper(host=host, user=user, password=password, db=db,
                                       source=self.source, source_id=self.source_id)
        self.__stats = stats
        self.__stats.set_value('spider_type', 'fix')
        self.__stats.set_value('bid_list', [])
        self.__is_first = True
        self.bid_list = bid_list

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        settings = crawler.settings
        host = settings.get('MYSQL_HOST')
        user = settings.get('MYSQL_USER')
        password = settings.get('MYSQL_PWD')
        db = settings.get('MYSQL_DB')
        bid = kwargs.get('bid')
        if bid is not None:
            bid_list = bid.split(',')
        else:
            bid_list = []
        return cls(host=host, user=user, password=password, db=db,
                   bid_list=bid_list, stats=crawler.stats, settings=settings)

    def start_requests(self):
        if self.bid_list is None:
            return
        for book in self.bid_list:
            info = self.mysqlHelper.get_book_info_by_id(book)
            if info is None:
                continue
            self.mysqlHelper.re_sequence(book)
            url = self.get_chapter_list_url(info['cp_bid'])
            meta = {'bid': book, 'cp_bid': info['cp_bid']}
            yield scrapy.Request(url, callback=self.parse_chapter_list, meta=meta)

    def parse_chapter_list(self, response):
        if response.text == '':
            return None
        chapter_list = self.chapter_list_result(response)
        if chapter_list is None:
            return None
        bid = response.meta.get('bid')
        cp_bid = response.meta.get('cp_bid')
        last_sequence = 0
        for chapter_item in chapter_list:
            last_sequence = last_sequence + 1
            chapter_info = self.mysqlHelper.get_cid_by_bid_sequence(bid, chapter_item['sequence'])
            cid = chapter_item['source_chapter_id']
            meta = chapter_item
            if chapter_info is not None:
                # The chapter already exists locally: update it in place.
                meta['type'] = 'update'
                meta['chapter_content_id'] = chapter_info['chapter_content_id']
                meta['cid'] = chapter_info['id']
            meta['bid'] = bid
            url = self.get_chapter_content_url(cp_bid, cid)
            yield scrapy.Request(url, callback=self.parse_chapter_content, meta=meta)
        self.__stats.get_value('bid_list').append({'bid': bid, 'end': last_sequence})

    def parse_chapter_content(self, response):
        if response.text == '':
            return None
        now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        result = self.chapter_content_result(response)
        meta = response.meta
        data = {
            'bid': meta['bid'],
            'name': meta['name'],
            'size': meta['size'],
            'is_vip': meta['is_vip'],
            'sequence': meta['sequence'],
            'source_chapter_id': meta['source_chapter_id'],
            'recent_update_at': meta['recent_update_at'],
            'content': formatcontent(result['content']),
        }
        # Values parsed from the content page override the values carried
        # over from the chapter list.
        if result.get('size') is not None:
            data['size'] = result.get('size')
        if result.get('is_vip') is not None:
            data['is_vip'] = result.get('is_vip')
        if result.get('name') is not None:
            data['name'] = result.get('name')
        if result.get('recent_update_at') is not None:
            data['recent_update_at'] = result.get('recent_update_at')
        if result.get('source_chapter_id') is not None:
            data['source_chapter_id'] = result.get('source_chapter_id')
        if meta.get('type') is not None:
            # Existing chapter: write the content and row updates directly.
            content = formatcontent(result['content'])
            data['content'] = content
            data['size'] = len(removePunctuation(content))
            self.mysqlHelper.update_content(meta['chapter_content_id'], meta['name'], result['content'])
            if meta.get('cid') is not None:
                data['cid'] = meta['cid']
                self.mysqlHelper.update_chapter(data)
        else:
            # Missing chapter: emit an item so the pipeline inserts it.
            data['prev_cid'] = 0
            data['next_cid'] = 0
            data['updated_at'] = now
            data['created_at'] = now
            chapter_item = ChapterItem(data)
            yield chapter_item

    # Hooks every concrete source spider must implement.
    def get_chapter_list_url(self, bid):
        raise NotImplementedError

    def chapter_list_result(self, response):
        raise NotImplementedError

    def get_chapter_content_url(self, bid, cid):
        raise NotImplementedError

    def chapter_content_result(self, response):
        raise NotImplementedError
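
# Usage sketch: the fix spiders receive the books to repair as a
# comma-separated id list through Scrapy's standard spider-argument
# mechanism, which from_crawler splits into bid_list. Assuming a concrete
# subclass whose name is 'my_fix_spider' (hypothetical), the invocation is:
#
#   scrapy crawl my_fix_spider -a bid=101,102,103
#
# Each listed book is re-sequenced first; chapters found locally are updated
# in place, and chapters missing locally are yielded as new ChapterItem rows.
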
class fixBookInfoSpider(scrapy.Spider):
    """Repair spider: re-crawls the book metadata of the given books and
    updates the stored book info."""

    name = ''
    source = ''
    source_name = ''
    source_id = 0

    def __init__(self, host, user, password, db, bid_list, stats):
        scrapy.Spider.__init__(self)
        self.mysqlHelper = MysqlHelper(host=host, user=user, password=password, db=db,
                                       source=self.source, source_id=self.source_id)
        self.bid_list = bid_list

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        settings = crawler.settings
        host = settings.get('MYSQL_HOST')
        user = settings.get('MYSQL_USER')
        password = settings.get('MYSQL_PWD')
        db = settings.get('MYSQL_DB')
        bid = kwargs.get('bid')
        if bid is not None:
            bid_list = bid.split(',')
        else:
            bid_list = []
        return cls(host=host, user=user, password=password, db=db,
                   bid_list=bid_list, stats=crawler.stats)

    def start_requests(self):
        if self.bid_list is None:
            return
        for book in self.bid_list:
            info = self.mysqlHelper.get_book_info_by_id(book)
            if info is None:
                continue
            url = self.get_book_info_url(info['cp_bid'])
            meta = {'bid': book, 'cp_bid': info['cp_bid']}
            yield scrapy.Request(url, callback=self.parse_book_info, meta=meta)

    def parse_book_info(self, response):
        if response.text == '':
            return None
        result = self.book_info_result(response)
        if result is None:
            return None
        result['bid'] = response.meta['bid']
        self.mysqlHelper.update_book_info(result)
    # Hooks every concrete source spider must implement.
    def get_book_info_url(self, bid):
        raise NotImplementedError

    def book_info_result(self, response):
        raise NotImplementedError


class baseUpdateBookStatusSpider(scrapy.Spider):
    """Status spider: checks the source for the given books (or, with no
    ids supplied, for every book needing an update) and marks a book as
    finished when the source reports status 1."""

    name = ''
    source = ''
    source_name = ''
    source_id = 0

    def __init__(self, host, user, password, db, bid_list, stats):
        scrapy.Spider.__init__(self)
        self.mysqlHelper = MysqlHelper(host=host, user=user, password=password, db=db,
                                       source=self.source, source_id=self.source_id)
        self.bid_list = bid_list

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        settings = crawler.settings
        host = settings.get('MYSQL_HOST')
        user = settings.get('MYSQL_USER')
        password = settings.get('MYSQL_PWD')
        db = settings.get('MYSQL_DB')
        bid = kwargs.get('bid')
        if bid is not None:
            bid_list = bid.split(',')
        else:
            bid_list = []
        return cls(host=host, user=user, password=password, db=db,
                   bid_list=bid_list, stats=crawler.stats)

    def start_requests(self):
        if len(self.bid_list) > 0:
            for bid in self.bid_list:
                book = self.mysqlHelper.get_book_info_by_id(bid)
                if book is not None:
                    url = self.get_book_info_url(book['cp_bid'])
                    meta = {'bid': book['id'], 'cp_bid': book['cp_bid']}
                    yield scrapy.Request(url, callback=self.parse_book_info, meta=meta)
        else:
            book_list = self.mysqlHelper.get_need_update_book_list()
            if book_list is not None:
                for book in book_list:
                    url = self.get_book_info_url(book['cp_bid'])
                    meta = {'bid': book['id'], 'cp_bid': book['cp_bid']}
                    yield scrapy.Request(url, callback=self.parse_book_info, meta=meta)

    def parse_book_info(self, response):
        if response.text == '':
            return None
        result = self.book_info_result(response)
        if result is None:
            return None
        bid = response.meta['bid']
        status = result['status']
        if int(status) == 1:
            # Status 1 means the book is finished at the source.
            self.mysqlHelper.update_book_status(bid, status)

    # Hooks every concrete source spider must implement.
    def get_book_info_url(self, bid):
        raise NotImplementedError

    def book_info_result(self, response):
        raise NotImplementedError
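
# Illustrative sketch only: a concrete status spider needs just the two
# book-info hooks. The spider name, URL and JSON field below are assumptions
# made up for demonstration, not part of any real source.
class exampleStatusSpider(baseUpdateBookStatusSpider):
    name = 'example_status'          # hypothetical spider name
    source = 'example'               # hypothetical source key
    source_name = 'Example source'
    source_id = 999                  # hypothetical source id

    def get_book_info_url(self, bid):
        return 'https://api.example.com/book/%s' % bid  # assumed endpoint

    def book_info_result(self, response):
        # parse_book_info only reads 'status'; 1 means finished.
        # response.json() needs Scrapy >= 2.2; json.loads(response.text)
        # is the equivalent on older versions.
        data = response.json()
        return {'status': data.get('status', 0)}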