|
@@ -0,0 +1,418 @@
|
|
|
|
+# -*- coding: utf-8 -*-
|
|
|
|
+
|
|
|
|
+import time
|
|
|
|
+import scrapy
|
|
|
|
+from ydyspider.items import BookInfoItem, ChapterItem
|
|
|
|
+from ydyspider.mysqlHelper import MysqlHelper
|
|
|
|
+import hashlib
|
|
|
|
+import random
|
|
|
|
+
|
|
|
|
+from ydyspider.pipelines import formatcontent, removePunctuation
|
|
|
|
+
|
|
|
|
+
|
|
|
|
def md5(token):
    """Return the hexadecimal MD5 digest of the string *token*."""
    digest = hashlib.md5(token.encode('utf-8'))
    return digest.hexdigest()
|
|
|
|
+
|
|
|
|
+
|
|
|
|
def sha1(token):
    """Return the hexadecimal SHA-1 digest of the string *token*."""
    digest = hashlib.sha1(token.encode('utf-8'))
    return digest.hexdigest()
|
|
|
|
+
|
|
|
|
+
|
|
|
|
def random_str(slen=10):
    """Return a random alphanumeric string of length *slen*.

    Characters are drawn (with replacement) from digits and ASCII
    letters of both cases. Not suitable for security tokens (uses
    the non-cryptographic ``random`` module).
    """
    seed = "1234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
    return ''.join(random.choice(seed) for _ in range(slen))
|
|
|
|
+
|
|
|
|
+
|
|
|
|
class baseSpider(scrapy.Spider):
    """Base "add" spider: crawls a source site's book list and, for every
    book not yet stored, inserts the book row and crawls all chapters.

    Subclasses must set ``name``/``source`` and implement the abstract
    ``get_*_url`` / ``*_result`` hooks defined at the bottom of the class.
    """
    name = ''
    allowed_domains = []
    base_url = ''
    source = ''
    source_name = ''

    def __init__(self, host, user, password, db, stats):
        scrapy.Spider.__init__(self)
        source = self.source
        self.mysqlHelper = MysqlHelper(host=host, user=user, password=password, db=db, source=source)
        self.__stats = stats
        self.__source = self.source
        # ids of books inserted during this run; read by downstream tooling
        self.__stats.set_value('bid_list', [])
        self.__stats.set_value('spider_type', 'add')

    @classmethod
    def from_crawler(cls, crawler):
        """Build the spider from crawler settings (MySQL connection info)."""
        settings = crawler.settings
        host = settings.get('MYSQL_HOST')
        user = settings.get('MYSQL_USER')
        password = settings.get('MYSQL_PWD')
        db = settings.get('MYSQL_DB')
        return cls(host=host, user=user, password=password, db=db, stats=crawler.stats)

    def start_requests(self):
        yield scrapy.Request(self.get_start_url(), callback=self.parse_book_list)

    def parse_book_list(self, response):
        """Request the info page of every source book not yet in the DB."""
        result = self.bid_list_result(response)
        for item in result:
            bid = item['id']
            # BUGFIX: bind the DB lookup to its own name instead of
            # rebinding `result`, which shadowed the list being iterated.
            existing = self.mysqlHelper.get_book_info_by_source(bid)
            if existing is not None:
                continue
            url = self.get_book_info_url(bid)
            yield scrapy.Request(url, callback=self.parse_book_info)

    def parse_book_info(self, response):
        """Insert the book row, then request its chapter list."""
        if response.text == '':
            return None
        now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        result = self.book_info_result(response)
        if result is None:
            return None
        source_bid = result.get('bid')
        book_info_item = BookInfoItem()
        book_info_item['source_bid'] = source_bid
        book_info_item['name'] = result['name']
        book_info_item['author'] = result['author']
        book_info_item['intro'] = result['intro']
        book_info_item['cover'] = result['cover']
        book_info_item['keyword'] = result['keyword']
        book_info_item['category_id'] = 0 if result.get('category_id') is None else result.get('category_id')
        book_info_item['status'] = result['status']
        # chapter counters/pointers start at zero; filled in as chapters arrive
        book_info_item['chapter_count'] = 0
        book_info_item['first_cid'] = 0
        book_info_item['last_cid'] = 0
        book_info_item['size'] = 0
        book_info_item['last_chapter'] = ''
        book_info_item['category_name'] = result['category']
        book_info_item['source'] = self.source
        book_info_item['updated_at'] = now
        book_info_item['created_at'] = now
        bid = self.mysqlHelper.insert_book(book_info_item)
        self.__stats.get_value('bid_list').append(bid)
        url = self.get_chapter_list_url(source_bid)
        meta = {'bid': bid, 'source_bid': source_bid}
        yield scrapy.Request(url, self.parse_chapter_list, meta=meta)

    def parse_chapter_list(self, response):
        """Request the content page of every chapter in the list."""
        if response.text == '':
            return None
        result = self.chapter_list_result(response)
        if result is None:
            return None
        bid = response.meta['bid']
        source_bid = response.meta['source_bid']
        for chapter_item in result:
            # the chapter dict itself is passed on as request meta
            meta = chapter_item
            cid = chapter_item['source_chapter_id']
            meta['bid'] = bid
            url = self.get_chapter_content_url(source_bid, cid)
            yield scrapy.Request(url, self.parse_chapter_content, meta=meta)

    def parse_chapter_content(self, response):
        """Build and yield a ChapterItem from a chapter content page."""
        if response.text == '':
            return None
        now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        result = self.chapter_content_result(response)
        meta = response.meta
        chapter_item = ChapterItem()
        chapter_item['bid'] = meta['bid']
        chapter_item['name'] = meta['name']
        chapter_item['sequence'] = meta['sequence']
        chapter_item['is_vip'] = meta['is_vip']
        chapter_item['prev_cid'] = 0
        chapter_item['next_cid'] = 0
        chapter_item['recent_update_at'] = meta['recent_update_at']
        content = formatcontent(result['content'])
        chapter_item['content'] = content
        # size counts characters with punctuation stripped
        chapter_item['size'] = len(removePunctuation(content))
        chapter_item['chapter_content_id'] = 0
        chapter_item['source_chapter_id'] = meta['source_chapter_id']
        chapter_item['created_at'] = now
        chapter_item['updated_at'] = now
        # values parsed from the content page override the list-page meta
        if result.get('size') is not None:
            chapter_item['size'] = result.get('size')
        if result.get('is_vip') is not None:
            chapter_item['is_vip'] = result.get('is_vip')
        if result.get('name') is not None:
            chapter_item['name'] = result.get('name')
        if result.get('recent_update_at') is not None:
            chapter_item['recent_update_at'] = result.get('recent_update_at')
        if result.get('source_chapter_id') is not None:
            chapter_item['source_chapter_id'] = result.get('source_chapter_id')
        yield chapter_item

    def get_start_url(self):
        """Return the URL of the source site's book-list page."""
        raise NotImplementedError

    def bid_list_result(self, response):
        """Parse the book-list response into an iterable of {'id': source_bid}."""
        raise NotImplementedError

    def get_book_info_url(self, bid):
        """Return the book-info URL for a source book id."""
        raise NotImplementedError

    def book_info_result(self, response):
        """Parse the book-info response into a dict of book fields."""
        raise NotImplementedError

    def get_chapter_list_url(self, bid):
        """Return the chapter-list URL for a source book id."""
        raise NotImplementedError

    def chapter_list_result(self, response):
        """Parse the chapter-list response into a list of chapter dicts."""
        raise NotImplementedError

    def get_chapter_content_url(self, bid, cid):
        """Return the chapter-content URL for a source book/chapter id."""
        raise NotImplementedError

    def chapter_content_result(self, response):
        """Parse the chapter-content response into a dict with 'content'."""
        raise NotImplementedError
|
|
|
|
+
|
|
|
|
+
|
|
|
|
class baseUpdateSpider(scrapy.Spider):
    """Base "update" spider: for every book flagged as needing an update,
    crawls the chapter list and fetches only chapters that come after the
    last chapter already stored for that book.

    Subclasses must set ``name``/``source`` and implement the abstract
    hooks defined at the bottom of the class.
    """
    name = ''
    allowed_domains = []
    base_url = ''
    source = ''
    source_name = ''

    def __init__(self, host, user, password, db, stats):
        scrapy.Spider.__init__(self)
        source = self.source
        self.mysqlHelper = MysqlHelper(host=host, user=user, password=password, db=db, source=source)
        self.__stats = stats
        self.__stats.set_value('spider_type', 'update')
        # per-book markers of where the newly crawled chapters start
        self.__stats.set_value('bid_list', [])
        self.__is_first = True

    @classmethod
    def from_crawler(cls, crawler):
        """Build the spider from crawler settings (MySQL connection info)."""
        settings = crawler.settings
        host = settings.get('MYSQL_HOST')
        user = settings.get('MYSQL_USER')
        password = settings.get('MYSQL_PWD')
        db = settings.get('MYSQL_DB')
        return cls(host=host, user=user, password=password, db=db, stats=crawler.stats)

    def start_requests(self):
        book_list = self.mysqlHelper.get_need_update_book_list()
        if book_list is not None:
            for book in book_list:
                url = self.get_chapter_list_url(book['source_bid'])
                meta = {'bid': book['id'], 'source_bid': book['source_bid']}
                yield scrapy.Request(url, callback=self.parse_chapter_list, meta=meta)

    def parse_chapter_list(self, response):
        """Yield content requests only for chapters newer than the last
        chapter already stored for this book."""
        if response.text == '':
            return None
        chapter_list = self.chapter_list_result(response)
        meta = response.meta
        if chapter_list is not None:
            bid = response.meta.get('bid')
            source_bid = response.meta.get('source_bid')
            last_chapter = self.mysqlHelper.get_last_cid_by_bid(bid)
            start = False
            if last_chapter is None:
                # nothing stored yet: every chapter counts as new
                start = True
                last_source_cid = 0
                last_sequence = 0
                last_chapter_id = 0
            else:
                last_source_cid = last_chapter['source_chapter_id']
                last_sequence = last_chapter['sequence']
                last_chapter_id = last_chapter['id']

            has_new_chapter = False
            for chapter_item in chapter_list:
                if not start:
                    if int(chapter_item['source_chapter_id']) == int(last_source_cid):
                        start = True
                    # BUGFIX: skip every chapter up to AND including the last
                    # stored one. Previously `continue` only ran on the match,
                    # so earlier chapters fell through and were re-crawled.
                    continue
                if not has_new_chapter:
                    # record, once per book, where the new chapters start
                    self.__stats.get_value('bid_list').append(
                        {"bid": meta['bid'], 'start': last_chapter_id})
                    has_new_chapter = True
                cid = chapter_item['source_chapter_id']
                last_sequence = last_sequence + 1
                if chapter_item['sequence'] == 0:
                    # source gave no sequence; continue from the stored one
                    chapter_item['sequence'] = last_sequence
                meta = chapter_item
                meta['bid'] = bid
                url = self.get_chapter_content_url(source_bid, cid)
                yield scrapy.Request(url, callback=self.parse_chapter_content, meta=meta)

    def parse_chapter_content(self, response):
        """Build and yield a ChapterItem from a chapter content page."""
        if response.text == '':
            return None
        now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        result = self.chapter_content_result(response)
        meta = response.meta
        chapter_item = ChapterItem()
        chapter_item['bid'] = meta['bid']
        chapter_item['name'] = meta['name']
        chapter_item['sequence'] = meta['sequence']
        chapter_item['is_vip'] = meta['is_vip']
        chapter_item['prev_cid'] = 0
        chapter_item['next_cid'] = 0
        chapter_item['recent_update_at'] = meta['recent_update_at']
        content = formatcontent(result['content'])
        # size counts characters with punctuation stripped
        chapter_item['size'] = len(removePunctuation(content))
        chapter_item['content'] = content
        chapter_item['chapter_content_id'] = 0
        chapter_item['source_chapter_id'] = meta['source_chapter_id']
        chapter_item['created_at'] = now
        chapter_item['updated_at'] = now
        # values parsed from the content page override the list-page meta
        if result.get('is_vip') is not None:
            chapter_item['is_vip'] = result.get('is_vip')
        if result.get('name') is not None:
            chapter_item['name'] = result.get('name')
        if result.get('recent_update_at') is not None:
            chapter_item['recent_update_at'] = result.get('recent_update_at')
        if result.get('source_chapter_id') is not None:
            chapter_item['source_chapter_id'] = result.get('source_chapter_id')
        yield chapter_item

    def get_chapter_list_url(self, bid):
        """Return the chapter-list URL for a source book id."""
        raise NotImplementedError

    def chapter_list_result(self, response):
        """Parse the chapter-list response into a list of chapter dicts."""
        raise NotImplementedError

    def get_chapter_content_url(self, bid, cid):
        """Return the chapter-content URL for a source book/chapter id."""
        raise NotImplementedError

    def chapter_content_result(self, response):
        """Parse the chapter-content response into a dict with 'content'."""
        raise NotImplementedError
|
|
|
|
+
|
|
|
|
+
|
|
|
|
class fixChapterSpider(scrapy.Spider):
    """Repair spider: re-crawls all chapters of the books given via the
    spider argument ``bid`` (comma-separated ids), updating chapter rows
    that already exist (matched by sequence) and inserting missing ones.
    """
    name = ''
    source = ''

    def __init__(self, host, user, password, db, bid_list, stats):
        scrapy.Spider.__init__(self)
        source = self.source
        self.mysqlHelper = MysqlHelper(host=host, user=user, password=password, db=db, source=source)
        self.__stats = stats
        self.__is_first = True
        self.bid_list = bid_list

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Build the spider; ``-a bid=1,2,3`` selects the books to fix."""
        settings = crawler.settings
        host = settings.get('MYSQL_HOST')
        user = settings.get('MYSQL_USER')
        password = settings.get('MYSQL_PWD')
        db = settings.get('MYSQL_DB')
        bid = kwargs.get('bid')
        if bid is not None:
            bid_list = bid.split(',')
        else:
            bid_list = []
        return cls(host=host, user=user, password=password, db=db, bid_list=bid_list, stats=crawler.stats)

    def start_requests(self):
        if self.bid_list is None:
            yield
            return
        for book in self.bid_list:
            info = self.mysqlHelper.get_book_info_by_id(book)
            if info is None:
                continue
            url = self.get_chapter_list_url(info['source_bid'])
            meta = {'bid': book, 'source_bid': info['source_bid']}
            yield scrapy.Request(url, callback=self.parse_chapter_list, meta=meta)

    def parse_chapter_list(self, response):
        """Request content for every chapter; chapters that already exist
        (matched by sequence) are flagged for update instead of insert."""
        if response.text == '':
            return None
        chapter_list = self.chapter_list_result(response)
        if chapter_list is not None:
            bid = response.meta.get('bid')
            source_bid = response.meta.get('source_bid')
            last_sequence = 0
            for chapter_item in chapter_list:
                last_sequence = last_sequence + 1

                if chapter_item['sequence'] == 0:
                    # source gave no sequence; use the running position
                    chapter_item['sequence'] = last_sequence
                chapter_info = self.mysqlHelper.get_cid_by_bid_sequence(bid, last_sequence)
                cid = chapter_item['source_chapter_id']

                meta = chapter_item
                if chapter_info is not None:
                    # chapter row already exists -> take the update path
                    meta['type'] = 'update'
                    meta['chapter_content_id'] = chapter_info['chapter_content_id']
                    meta['cid'] = chapter_info['id']
                meta['bid'] = bid
                url = self.get_chapter_content_url(source_bid, cid)
                yield scrapy.Request(url, callback=self.parse_chapter_content, meta=meta)

    def parse_chapter_content(self, response):
        """Update the stored chapter row in place, or yield a new
        ChapterItem when the chapter is missing."""
        if response.text == '':
            return None
        now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        result = self.chapter_content_result(response)
        meta = response.meta
        content = formatcontent(result['content'])
        # size counts characters with punctuation stripped
        meta['size'] = len(removePunctuation(content))
        meta['content'] = content
        # values parsed from the content page override the list-page meta
        if result.get('size') is not None:
            meta['size'] = result.get('size')
        if result.get('is_vip') is not None:
            meta['is_vip'] = result.get('is_vip')
        if result.get('name') is not None:
            meta['name'] = result.get('name')
        if result.get('recent_update_at') is not None:
            meta['recent_update_at'] = result.get('recent_update_at')
        if result.get('source_chapter_id') is not None:
            meta['source_chapter_id'] = result.get('source_chapter_id')
        if meta.get('type') is not None:
            # BUGFIX: persist the formatted content, consistent with the
            # insert path and the other spiders (the raw result['content']
            # was previously written despite `content` being computed).
            self.mysqlHelper.update_content(meta['chapter_content_id'], meta['name'], content)
            self.mysqlHelper.update_chapter(meta)
        else:
            chapter_item = ChapterItem()
            chapter_item['bid'] = meta['bid']
            chapter_item['name'] = meta['name']
            chapter_item['sequence'] = meta['sequence']
            chapter_item['size'] = meta['size']
            chapter_item['is_vip'] = meta['is_vip']
            chapter_item['prev_cid'] = 0
            chapter_item['next_cid'] = 0
            chapter_item['recent_update_at'] = meta['recent_update_at']
            # BUGFIX: same consistency fix as the update branch above
            chapter_item['content'] = content
            if meta.get('chapter_content_id') is not None:
                chapter_item['chapter_content_id'] = meta['chapter_content_id']
            else:
                chapter_item['chapter_content_id'] = 0
            chapter_item['source_chapter_id'] = meta['source_chapter_id']
            chapter_item['created_at'] = now
            chapter_item['updated_at'] = now
            if result.get('size') is not None:
                chapter_item['size'] = result.get('size')
            if result.get('is_vip') is not None:
                chapter_item['is_vip'] = result.get('is_vip')
            if result.get('name') is not None:
                chapter_item['name'] = result.get('name')
            if result.get('recent_update_at') is not None:
                chapter_item['recent_update_at'] = result.get('recent_update_at')
            if result.get('source_chapter_id') is not None:
                chapter_item['source_chapter_id'] = result.get('source_chapter_id')
            yield chapter_item

    def get_chapter_list_url(self, bid):
        """Return the chapter-list URL for a source book id."""
        raise NotImplementedError

    def chapter_list_result(self, response):
        """Parse the chapter-list response into a list of chapter dicts."""
        raise NotImplementedError

    def get_chapter_content_url(self, bid, cid):
        """Return the chapter-content URL for a source book/chapter id."""
        raise NotImplementedError

    def chapter_content_result(self, response):
        """Parse the chapter-content response into a dict with 'content'."""
        raise NotImplementedError
|