@@ -0,0 +1,466 @@
+import scrapy
+import time
+from content_spider.mysqlHelper import MysqlHelper
+from content_spider.items import BookInfoItem, ChapterItem
+from content_spider.pipelines import formatcontent, removePunctuation
+
+
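+# baseSpider crawls a source site end to end: book list -> book info ->
+# chapter list -> chapter content. Subclasses implement the get_*_url and
+# *_result hooks declared at the bottom of the class.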
+class baseSpider(scrapy.Spider):
+    name = ''
+    source = ''
+    source_name = ''
+    source_id = 0
+    custom_settings = {
+        'DOWNLOAD_DELAY': 0.01,
+        'SOURCE': source,
+    }
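+    # NOTE: 'SOURCE' is evaluated once, when this class body runs, so a
+    # subclass that only overrides `source` still inherits SOURCE='' here;
+    # override custom_settings in the subclass as well if the setting matters.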
+
+    def __init__(self, host, user, password, db, stats, settings):
+        scrapy.Spider.__init__(self)
+        self.mysqlHelper = MysqlHelper(host=host, user=user, password=password, db=db,
+                                       source=self.source, source_id=self.source_id)
+        self.__stats = stats
+        self.__stats.set_value('spider_type', 'add')
+        self.__stats.set_value('bid_list', [])
+        self.__is_first = True
+
+    @classmethod
+    def from_crawler(cls, crawler, *args, **kwargs):
+        settings = crawler.settings
+        host = settings.get('MYSQL_HOST')
+        user = settings.get('MYSQL_USER')
+        password = settings.get('MYSQL_PWD')
+        db = settings.get('MYSQL_DB')
+        return cls(host=host, user=user, password=password, db=db, stats=crawler.stats, settings=settings)
+
+    def start_requests(self):
+        yield scrapy.Request(self.get_start_url(), callback=self.parse_book_list)
+
+    def parse_book_list(self, response):
+        result = self.bid_list_result(response)
+        for item in result:
+            bid = item['id']
+            # Skip books already imported from this source.
+            existing = self.mysqlHelper.get_book_info_by_source(bid)
+            if existing is not None:
+                continue
+            url = self.get_book_info_url(bid)
+            yield scrapy.Request(url, callback=self.parse_book_info)
+
+    def parse_book_info(self, response):
+        if response.text == '':
+            return None
+        now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+        result = self.book_info_result(response)
+        if result is None:
+            return None
+
+        category_id = 0 if result.get('category_id') is None else result.get('category_id')
+
+        book_info_item = BookInfoItem()
+        source_bid = result.get('bid')
+        book_info_item['cp_bid'] = source_bid
+        book_info_item['cp_name'] = self.source_name
+        book_info_item['cp_id'] = self.source_id
+        book_info_item['name'] = result['name']
+        book_info_item['author'] = result['author']
+        book_info_item['intro'] = result['intro']
+        book_info_item['cover'] = result['cover']
+        book_info_item['keyword'] = result['keyword']
+        book_info_item['category_id'] = category_id
+        book_info_item['status'] = result['status']
+        book_info_item['size'] = 0
+        book_info_item['category_name'] = result['category']
+        book_info_item['last_chapter'] = ''
+        book_info_item['chapter_count'] = 0
+        book_info_item['first_cid'] = 0
+        book_info_item['last_cid'] = 0
+        book_info_item['channel'] = result['channel']
+        book_info_item['updated_at'] = now
+        book_info_item['created_at'] = now
+        bid = self.mysqlHelper.insert_book(book_info_item)
+        # Track the first new bid and collect every inserted bid in the stats.
+        if self.__is_first:
+            self.__stats.set_value('bid_start', bid)
+            self.__is_first = False
+        self.__stats.get_value('bid_list').append(bid)
+        url = self.get_chapter_list_url(source_bid)
+        meta = {'bid': bid, 'source_bid': source_bid}
+        yield scrapy.Request(url, callback=self.parse_chapter_list, meta=meta)
+
+    def parse_chapter_list(self, response):
+        if response.text == '':
+            return None
+        result = self.chapter_list_result(response)
+        bid = response.meta['bid']
+        source_bid = response.meta['source_bid']
+        for chapter_item in result:
+            cid = chapter_item['source_chapter_id']
+            # Reuse the parsed chapter dict as request meta for the content callback.
+            meta = chapter_item
+            meta['bid'] = bid
+            url = self.get_chapter_content_url(source_bid, cid)
+            yield scrapy.Request(url, callback=self.parse_chapter_content, meta=meta)
+
+    def parse_chapter_content(self, response):
+        if response.text == '':
+            return None
+        now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+        result = self.chapter_content_result(response)
+        meta = response.meta
+        chapter_item = ChapterItem()
+        chapter_item['bid'] = meta['bid']
+        chapter_item['name'] = meta['name']
+        chapter_item['sequence'] = meta['sequence']
+        chapter_item['size'] = meta['size']
+        chapter_item['is_vip'] = meta['is_vip']
+        chapter_item['source_chapter_id'] = meta['source_chapter_id']
+        chapter_item['chapter_content_id'] = 0
+        chapter_item['content'] = formatcontent(result['content'])
+
+        # Values parsed from the content page win over the chapter-list values.
+        if result.get('size') is not None:
+            chapter_item['size'] = result.get('size')
+        if result.get('is_vip') is not None:
+            chapter_item['is_vip'] = result.get('is_vip')
+        if result.get('name') is not None:
+            chapter_item['name'] = result.get('name')
+        if result.get('recent_update_at') is not None:
+            chapter_item['recent_update_at'] = result.get('recent_update_at')
+        if result.get('source_chapter_id') is not None:
+            chapter_item['source_chapter_id'] = result.get('source_chapter_id')
+        chapter_item['prev_cid'] = 0
+        chapter_item['next_cid'] = 0
+        chapter_item['updated_at'] = now
+        chapter_item['created_at'] = now
+        yield chapter_item
+
+    def get_start_url(self):
+        raise NotImplementedError
+
+    def bid_list_result(self, response):
+        raise NotImplementedError
+
+    def get_book_info_url(self, bid):
+        raise NotImplementedError
+
+    def book_info_result(self, response):
+        raise NotImplementedError
+
+    def get_chapter_list_url(self, bid):
+        raise NotImplementedError
+
+    def chapter_list_result(self, response):
+        raise NotImplementedError
+
+    def get_chapter_content_url(self, bid, cid):
+        raise NotImplementedError
+
+    def chapter_content_result(self, response):
+        raise NotImplementedError
+
+
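+# baseUpdateSpider re-crawls books already stored locally and fetches only the
+# chapters published after the last one in the database.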
+class baseUpdateSpider(scrapy.Spider):
+    name = ''
+    source = ''
+    source_id = 0
+    custom_settings = {
+        'DOWNLOAD_DELAY': 0.01,
+        'SOURCE': source,
+    }
+
+    def __init__(self, host, user, password, db, stats, settings):
+        scrapy.Spider.__init__(self)
+        self.mysqlHelper = MysqlHelper(host=host, user=user, password=password, db=db,
+                                       source=self.source, source_id=self.source_id)
+        self.__stats = stats
+        self.__stats.set_value('spider_type', 'update')
+        self.__stats.set_value('bid_list', [])
+        self.__is_first = True
+
+    @classmethod
+    def from_crawler(cls, crawler, *args, **kwargs):
+        settings = crawler.settings
+        host = settings.get('MYSQL_HOST')
+        user = settings.get('MYSQL_USER')
+        password = settings.get('MYSQL_PWD')
+        db = settings.get('MYSQL_DB')
+        return cls(host=host, user=user, password=password, db=db, stats=crawler.stats, settings=settings)
+
+    def start_requests(self):
+        book_list = self.mysqlHelper.get_need_update_book_list()
+        if book_list is not None:
+            for book in book_list:
+                url = self.get_chapter_list_url(book['cp_bid'])
+                meta = {'bid': book['id'], 'cp_bid': book['cp_bid']}
+                yield scrapy.Request(url, callback=self.parse_chapter_list, meta=meta)
+
+    def parse_chapter_list(self, response):
+        if response.text == '':
+            return None
+        chapter_list = self.chapter_list_result(response)
+        meta = response.meta
+        if chapter_list is not None:
+            bid = response.meta.get('bid')
+            cp_bid = response.meta.get('cp_bid')
+            last_chapter = self.mysqlHelper.get_last_cid_by_bid(bid)
+            start = False
+            if last_chapter is None:
+                # No chapters stored yet: crawl the list from the beginning.
+                start = True
+                last_source_cid = ''
+                last_sequence = 0
+                last_chapter_id = 0
+            else:
+                last_source_cid = str(last_chapter['source_chapter_id'])
+                last_sequence = last_chapter['sequence']
+                last_chapter_id = last_chapter['id']
+
+            has_new_chapter = False
+            for chapter_item in chapter_list:
+                # Skip ahead until the last chapter already in the database,
+                # matching by source chapter id when we have one, else by sequence.
+                if not start:
+                    if len(last_source_cid) > 0:
+                        if str(chapter_item['source_chapter_id']) == str(last_source_cid):
+                            start = True
+                    else:
+                        if int(chapter_item['sequence']) == last_sequence:
+                            start = True
+                    continue
+                if not has_new_chapter:
+                    self.__stats.get_value('bid_list').append(
+                        {"bid": meta['bid'], 'start': last_chapter_id, 'start_sequence': last_sequence})
+                    has_new_chapter = True
+                cid = chapter_item['source_chapter_id']
+                last_sequence = last_sequence + 1
+                meta = chapter_item
+                meta['bid'] = bid
+                url = self.get_chapter_content_url(cp_bid, cid)
+                yield scrapy.Request(url, callback=self.parse_chapter_content, meta=meta)
+
+    def parse_chapter_content(self, response):
+        if response.text == '':
+            return None
+        now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+        result = self.chapter_content_result(response)
+        meta = response.meta
+        chapter_item = ChapterItem()
+        chapter_item['bid'] = meta['bid']
+        chapter_item['name'] = meta['name']
+        chapter_item['sequence'] = meta['sequence']
+        chapter_item['size'] = meta['size']
+        chapter_item['is_vip'] = meta['is_vip']
+        chapter_item['source_chapter_id'] = meta['source_chapter_id']
+        chapter_item['chapter_content_id'] = 0
+        chapter_item['content'] = formatcontent(result['content'])
+
+        # Values parsed from the content page win over the chapter-list values.
+        if result.get('size') is not None:
+            chapter_item['size'] = result.get('size')
+        if result.get('is_vip') is not None:
+            chapter_item['is_vip'] = result.get('is_vip')
+        if result.get('name') is not None:
+            chapter_item['name'] = result.get('name')
+        if result.get('recent_update_at') is not None:
+            chapter_item['recent_update_at'] = result.get('recent_update_at')
+        if result.get('source_chapter_id') is not None:
+            chapter_item['source_chapter_id'] = result.get('source_chapter_id')
+        chapter_item['prev_cid'] = 0
+        chapter_item['next_cid'] = 0
+        chapter_item['updated_at'] = now
+        chapter_item['created_at'] = now
+        yield chapter_item
+
+    def get_chapter_list_url(self, bid):
+        raise NotImplementedError
+
+    def chapter_list_result(self, response):
+        raise NotImplementedError
+
+    def get_chapter_content_url(self, bid, cid):
+        raise NotImplementedError
+
+    def chapter_content_result(self, response):
+        raise NotImplementedError
+
+
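+# fixChapterSpider re-downloads every chapter of the books given via the `bid`
+# spider argument (a comma-separated id list, e.g. -a bid=1,2,3), updating
+# chapters that already exist and inserting the ones that are missing.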
+class fixChapterSpider(scrapy.Spider):
+    name = ''
+    source = ''
+    source_id = 0
+    custom_settings = {
+        'DOWNLOAD_DELAY': 0.01,
+        'SOURCE': source,
+    }
+
+    def __init__(self, host, user, password, db, bid_list, stats, settings):
+        scrapy.Spider.__init__(self)
+        self.mysqlHelper = MysqlHelper(host=host, user=user, password=password, db=db,
+                                       source=self.source, source_id=self.source_id)
+        self.__stats = stats
+        self.__stats.set_value('spider_type', 'fix')
+        self.__stats.set_value('bid_list', [])
+        self.__is_first = True
+        self.bid_list = bid_list
+
+    @classmethod
+    def from_crawler(cls, crawler, *args, **kwargs):
+        settings = crawler.settings
+        host = settings.get('MYSQL_HOST')
+        user = settings.get('MYSQL_USER')
+        password = settings.get('MYSQL_PWD')
+        db = settings.get('MYSQL_DB')
+        bid = kwargs.get('bid')
+        if bid is not None:
+            bid_list = bid.split(',')
+        else:
+            bid_list = []
+        return cls(host=host, user=user, password=password, db=db,
+                   bid_list=bid_list, stats=crawler.stats, settings=settings)
+
+    def start_requests(self):
+        if self.bid_list is None:
+            yield
+            return
+        for book in self.bid_list:
+            info = self.mysqlHelper.get_book_info_by_id(book)
+            if info is None:
+                continue
+            # Renumber the stored chapters before re-crawling them.
+            self.mysqlHelper.re_sequence(book)
+            url = self.get_chapter_list_url(info['cp_bid'])
+            meta = {'bid': book, 'cp_bid': info['cp_bid']}
+            yield scrapy.Request(url, callback=self.parse_chapter_list, meta=meta)
+
+    def parse_chapter_list(self, response):
+        if response.text == '':
+            return None
+        chapter_list = self.chapter_list_result(response)
+        if chapter_list is not None:
+            bid = response.meta.get('bid')
+            cp_bid = response.meta.get('cp_bid')
+            last_sequence = 0
+            for chapter_item in chapter_list:
+                last_sequence = last_sequence + 1
+                chapter_info = self.mysqlHelper.get_cid_by_bid_sequence(bid, chapter_item['sequence'])
+                cid = chapter_item['source_chapter_id']
+                meta = chapter_item
+                if chapter_info is not None:
+                    # The chapter already exists locally: update it in place.
+                    meta['type'] = 'update'
+                    meta['chapter_content_id'] = chapter_info['chapter_content_id']
+                    meta['cid'] = chapter_info['id']
+                meta['bid'] = bid
+                url = self.get_chapter_content_url(cp_bid, cid)
+                yield scrapy.Request(url, callback=self.parse_chapter_content, meta=meta)
+            self.__stats.get_value('bid_list').append({'bid': bid, 'end': last_sequence})
+
+    def parse_chapter_content(self, response):
+        if response.text == '':
+            return None
+        now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+        result = self.chapter_content_result(response)
+        meta = response.meta
+        data = {}
+        data['bid'] = meta['bid']
+        data['name'] = meta['name']
+        data['size'] = meta['size']
+        data['is_vip'] = meta['is_vip']
+        data['sequence'] = meta['sequence']
+        data['source_chapter_id'] = meta['source_chapter_id']
+        data['recent_update_at'] = meta['recent_update_at']
+        data['content'] = formatcontent(result['content'])
+
+        # Values parsed from the content page win over the chapter-list values.
+        if result.get('size') is not None:
+            data['size'] = result.get('size')
+        if result.get('is_vip') is not None:
+            data['is_vip'] = result.get('is_vip')
+        if result.get('name') is not None:
+            data['name'] = result.get('name')
+        if result.get('recent_update_at') is not None:
+            data['recent_update_at'] = result.get('recent_update_at')
+        if result.get('source_chapter_id') is not None:
+            data['source_chapter_id'] = result.get('source_chapter_id')
+        if meta.get('type') is not None:
+            # Existing chapter: rewrite the stored content and metadata directly.
+            content = formatcontent(result['content'])
+            data['content'] = content
+            data['size'] = len(removePunctuation(content))
+            self.mysqlHelper.update_content(meta['chapter_content_id'], meta['name'], content)
+            if meta.get('cid') is not None:
+                data['cid'] = meta['cid']
+                self.mysqlHelper.update_chapter(data)
+        else:
+            # New chapter: hand it to the item pipeline as usual.
+            data['prev_cid'] = 0
+            data['next_cid'] = 0
+            data['updated_at'] = now
+            data['created_at'] = now
+            chapter_item = ChapterItem(data)
+            yield chapter_item
+
+    def get_chapter_list_url(self, bid):
+        raise NotImplementedError
+
+    def chapter_list_result(self, response):
+        raise NotImplementedError
+
+    def get_chapter_content_url(self, bid, cid):
+        raise NotImplementedError
+
+    def chapter_content_result(self, response):
+        raise NotImplementedError
+
+
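+# fixBookInfoSpider refreshes the stored metadata (name, author, cover, ...) of
+# the books given via the `bid` spider argument; chapter data is left untouched.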
+class fixBookInfoSpider(scrapy.Spider):
+    name = ''
+    source = ''
+    source_name = ''
+    source_id = 0
+
+    def __init__(self, host, user, password, db, bid_list, stats):
+        scrapy.Spider.__init__(self)
+        self.mysqlHelper = MysqlHelper(host=host, user=user, password=password, db=db,
+                                       source=self.source, source_id=self.source_id)
+        self.bid_list = bid_list
+
+    @classmethod
+    def from_crawler(cls, crawler, *args, **kwargs):
+        settings = crawler.settings
+        host = settings.get('MYSQL_HOST')
+        user = settings.get('MYSQL_USER')
+        password = settings.get('MYSQL_PWD')
+        db = settings.get('MYSQL_DB')
+        bid = kwargs.get('bid')
+        if bid is not None:
+            bid_list = bid.split(',')
+        else:
+            bid_list = []
+        return cls(host=host, user=user, password=password, db=db, bid_list=bid_list, stats=crawler.stats)
+
+    def start_requests(self):
+        if self.bid_list is None:
+            yield
+            return
+        for book in self.bid_list:
+            info = self.mysqlHelper.get_book_info_by_id(book)
+            if info is None:
+                continue
+            url = self.get_book_info_url(info['cp_bid'])
+            meta = {'bid': book, 'cp_bid': info['cp_bid']}
+            yield scrapy.Request(url, callback=self.parse_book_info, meta=meta)
+
+    def parse_book_info(self, response):
+        if response.text == '':
+            return None
+        result = self.book_info_result(response)
+        if result is None:
+            yield
+            return
+        result['bid'] = response.meta['bid']
+        # Write the refreshed metadata straight to the database; nothing is
+        # yielded to the item pipeline.
+        self.mysqlHelper.update_book_info(result)
+
+    def get_book_info_url(self, bid):
+        raise NotImplementedError
+
+    def book_info_result(self, response):
+        raise NotImplementedError