import time

import scrapy

from content_spider.mysqlHelper import MysqlHelper
from content_spider.items import BookInfoItem, ChapterItem
from content_spider.pipelines import formatcontent, removePunctuation
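# Spider base classes for content_spider. Four crawl modes are provided:
#   baseSpider        -- full import: book list -> book info -> chapter list
#                        -> chapter content
#   baseUpdateSpider  -- incremental update: crawl only chapters newer than
#                        the last one stored for each book
#   fixChapterSpider  -- repair pass over the chapters of specific books
#   fixBookInfoSpider -- repair pass over the metadata of specific books
# Concrete spiders subclass one of these and implement the get_*_url /
# *_result hooks, which raise NotImplementedError below.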
class baseSpider(scrapy.Spider):
    name = ''
    source = ''
    source_name = ''
    source_id = 0
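    # Note: custom_settings is evaluated while the class body executes, so
    # 'SOURCE' captures the value of `source` defined just above (here '').
    # A subclass that overrides `source` must also override custom_settings
    # for SOURCE to pick up its own value.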
    custom_settings = {
        'DOWNLOAD_DELAY': 0.01,
        'SOURCE': source,
    }
    def __init__(self, host, user, password, db, stats, settings):
        super().__init__()
        self.mysqlHelper = MysqlHelper(host=host, user=user, password=password, db=db,
                                       source=self.source, source_id=self.source_id)
        self.__stats = stats
        self.__stats.set_value('spider_type', 'add')
        self.__stats.set_value('bid_list', [])
        self.__is_first = True
    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        host = settings.get('MYSQL_HOST')
        user = settings.get('MYSQL_USER')
        password = settings.get('MYSQL_PWD')
        db = settings.get('MYSQL_DB')
        return cls(host=host, user=user, password=password, db=db,
                   stats=crawler.stats, settings=settings)
    def start_requests(self):
        yield scrapy.Request(self.get_start_url(), callback=self.parse_book_list)
    def parse_book_list(self, response):
        for item in self.bid_list_result(response):
            bid = item['id']
            # Skip books that were already imported from this source.
            if self.mysqlHelper.get_book_info_by_source(bid) is not None:
                continue
            url = self.get_book_info_url(bid)
            yield scrapy.Request(url, callback=self.parse_book_info)
    def parse_book_info(self, response):
        if response.text == '':
            return None
        now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        result = self.book_info_result(response)
        if result is None:
            return None
        category_id = 0 if result.get('category_id') is None else result.get('category_id')
        book_info_item = BookInfoItem()
        source_bid = result.get('bid')
        book_info_item['cp_bid'] = source_bid
        book_info_item['cp_name'] = self.source_name
        book_info_item['cp_id'] = self.source_id
        book_info_item['name'] = result['name']
        book_info_item['author'] = result['author']
        book_info_item['intro'] = result['intro']
        book_info_item['cover'] = result['cover']
        book_info_item['keyword'] = result['keyword']
        book_info_item['category_id'] = category_id
        book_info_item['status'] = result['status']
        book_info_item['size'] = 0
        book_info_item['category_name'] = result['category']
        book_info_item['last_chapter'] = ''
        book_info_item['chapter_count'] = 0
        book_info_item['first_cid'] = 0
        book_info_item['last_cid'] = 0
        book_info_item['channel'] = result['channel']
        book_info_item['updated_at'] = now
        book_info_item['created_at'] = now
        bid = self.mysqlHelper.insert_book(book_info_item)
        if self.__is_first:
            self.__stats.set_value('bid_start', bid)
            self.__is_first = False
        self.__stats.get_value('bid_list').append(bid)
        url = self.get_chapter_list_url(source_bid)
        meta = {'bid': bid, 'source_bid': source_bid}
        yield scrapy.Request(url, callback=self.parse_chapter_list, meta=meta)
    def parse_chapter_list(self, response):
        if response.text == '':
            return None
        result = self.chapter_list_result(response)
        bid = response.meta['bid']
        source_bid = response.meta['source_bid']
        for chapter_item in result:
            cid = chapter_item['source_chapter_id']
            meta = chapter_item
            meta['bid'] = bid
            url = self.get_chapter_content_url(source_bid, cid)
            yield scrapy.Request(url, callback=self.parse_chapter_content, meta=meta)
    def parse_chapter_content(self, response):
        if response.text == '':
            return None
        now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        result = self.chapter_content_result(response)
        meta = response.meta
        chapter_item = ChapterItem()
        chapter_item['bid'] = meta['bid']
        chapter_item['name'] = meta['name']
        chapter_item['sequence'] = meta['sequence']
        chapter_item['size'] = meta['size']
        chapter_item['is_vip'] = meta['is_vip']
        chapter_item['source_chapter_id'] = meta['source_chapter_id']
        chapter_item['recent_update_at'] = meta['recent_update_at']
        chapter_item['chapter_content_id'] = 0
        chapter_item['content'] = formatcontent(result['content'])
        # Values parsed from the content page override those from the list page.
        if result.get('size') is not None:
            chapter_item['size'] = result.get('size')
        if result.get('is_vip') is not None:
            chapter_item['is_vip'] = result.get('is_vip')
        if result.get('name') is not None:
            chapter_item['name'] = result.get('name')
        if result.get('recent_update_at') is not None:
            chapter_item['recent_update_at'] = result.get('recent_update_at')
        if result.get('source_chapter_id') is not None:
            chapter_item['source_chapter_id'] = result.get('source_chapter_id')
        chapter_item['prev_cid'] = 0
        chapter_item['next_cid'] = 0
        chapter_item['updated_at'] = now
        chapter_item['created_at'] = now
        yield chapter_item
    # Hooks that concrete spiders must implement.
    def get_start_url(self):
        raise NotImplementedError

    def bid_list_result(self, response):
        raise NotImplementedError

    def get_book_info_url(self, bid):
        raise NotImplementedError

    def book_info_result(self, response):
        raise NotImplementedError

    def get_chapter_list_url(self, bid):
        raise NotImplementedError

    def chapter_list_result(self, response):
        raise NotImplementedError

    def get_chapter_content_url(self, bid, cid):
        raise NotImplementedError

    def chapter_content_result(self, response):
        raise NotImplementedError
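# Incremental spider: for every book the database flags as needing an update,
# it walks the source's chapter list, skips everything up to the last chapter
# already stored, and crawls only the chapters after it.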
class baseUpdateSpider(scrapy.Spider):
    name = ''
    source = ''
    source_id = 0
    custom_settings = {
        'DOWNLOAD_DELAY': 0.01,
        'SOURCE': source,
    }
    def __init__(self, host, user, password, db, stats, settings):
        super().__init__()
        self.mysqlHelper = MysqlHelper(host=host, user=user, password=password, db=db,
                                       source=self.source, source_id=self.source_id)
        self.__stats = stats
        self.__stats.set_value('spider_type', 'update')
        self.__stats.set_value('bid_list', [])
        self.__is_first = True
    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        host = settings.get('MYSQL_HOST')
        user = settings.get('MYSQL_USER')
        password = settings.get('MYSQL_PWD')
        db = settings.get('MYSQL_DB')
        return cls(host=host, user=user, password=password, db=db,
                   stats=crawler.stats, settings=settings)
    def start_requests(self):
        book_list = self.mysqlHelper.get_need_update_book_list()
        if book_list is not None:
            for book in book_list:
                url = self.get_chapter_list_url(book['cp_bid'])
                meta = {'bid': book['id'], 'cp_bid': book['cp_bid']}
                yield scrapy.Request(url, callback=self.parse_chapter_list, meta=meta)
    def parse_chapter_list(self, response):
        if response.text == '':
            return None
        chapter_list = self.chapter_list_result(response)
        if chapter_list is not None:
            bid = response.meta.get('bid')
            cp_bid = response.meta.get('cp_bid')
            last_chapter = self.mysqlHelper.get_last_cid_by_bid(bid)
            start = False
            if last_chapter is None:
                # No chapters stored yet: crawl the whole list.
                start = True
                last_source_cid = ''
                last_sequence = 0
                last_chapter_id = 0
            else:
                last_source_cid = str(last_chapter['source_chapter_id'])
                last_sequence = last_chapter['sequence']
                last_chapter_id = last_chapter['id']
            has_new_chapter = False
            for chapter_item in chapter_list:
                if not start:
                    # Skip chapters up to and including the last stored one,
                    # matching by source chapter id when available, otherwise
                    # by sequence number.
                    if len(last_source_cid) > 0:
                        if str(chapter_item['source_chapter_id']) == str(last_source_cid):
                            start = True
                    else:
                        if int(chapter_item['sequence']) == last_sequence:
                            start = True
                    continue
                if not has_new_chapter:
                    # Record, once per book, where the new chapters begin.
                    self.__stats.get_value('bid_list').append(
                        {"bid": bid, 'start': last_chapter_id, 'start_sequence': last_sequence})
                    has_new_chapter = True
                cid = chapter_item['source_chapter_id']
                last_sequence = last_sequence + 1
                meta = chapter_item
                meta['bid'] = bid
                url = self.get_chapter_content_url(cp_bid, cid)
                yield scrapy.Request(url, callback=self.parse_chapter_content, meta=meta)
    def parse_chapter_content(self, response):
        if response.text == '':
            return None
        now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        result = self.chapter_content_result(response)
        meta = response.meta
        chapter_item = ChapterItem()
        chapter_item['bid'] = meta['bid']
        chapter_item['name'] = meta['name']
        chapter_item['sequence'] = meta['sequence']
        chapter_item['size'] = meta['size']
        chapter_item['is_vip'] = meta['is_vip']
        chapter_item['source_chapter_id'] = meta['source_chapter_id']
        chapter_item['chapter_content_id'] = 0
        chapter_item['content'] = formatcontent(result['content'])
        # Values parsed from the content page override those from the list page.
        if result.get('size') is not None:
            chapter_item['size'] = result.get('size')
        if result.get('is_vip') is not None:
            chapter_item['is_vip'] = result.get('is_vip')
        if result.get('name') is not None:
            chapter_item['name'] = result.get('name')
        if result.get('recent_update_at') is not None:
            chapter_item['recent_update_at'] = result.get('recent_update_at')
        if result.get('source_chapter_id') is not None:
            chapter_item['source_chapter_id'] = result.get('source_chapter_id')
        chapter_item['prev_cid'] = 0
        chapter_item['next_cid'] = 0
        chapter_item['updated_at'] = now
        chapter_item['created_at'] = now
        yield chapter_item
    # Hooks that concrete spiders must implement.
    def get_chapter_list_url(self, bid):
        raise NotImplementedError

    def chapter_list_result(self, response):
        raise NotImplementedError

    def get_chapter_content_url(self, bid, cid):
        raise NotImplementedError

    def chapter_content_result(self, response):
        raise NotImplementedError
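# Repair spider: given a comma-separated id list (scrapy crawl ... -a bid=1,2),
# it re-sequences each book's chapters, then re-crawls the whole chapter list,
# updating chapters that already exist and inserting those that do not.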
class fixChapterSpider(scrapy.Spider):
    name = ''
    source = ''
    source_id = 0
    custom_settings = {
        'DOWNLOAD_DELAY': 0.01,
        'SOURCE': source,
    }
    def __init__(self, host, user, password, db, bid_list, stats, settings):
        super().__init__()
        self.mysqlHelper = MysqlHelper(host=host, user=user, password=password, db=db,
                                       source=self.source, source_id=self.source_id)
        self.__stats = stats
        self.__stats.set_value('spider_type', 'fix')
        self.__stats.set_value('bid_list', [])
        self.__is_first = True
        self.bid_list = bid_list
    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        settings = crawler.settings
        host = settings.get('MYSQL_HOST')
        user = settings.get('MYSQL_USER')
        password = settings.get('MYSQL_PWD')
        db = settings.get('MYSQL_DB')
        bid = kwargs.get('bid')
        if bid is not None:
            bid_list = bid.split(',')
        else:
            bid_list = []
        return cls(host=host, user=user, password=password, db=db,
                   bid_list=bid_list, stats=crawler.stats, settings=settings)
    def start_requests(self):
        if not self.bid_list:
            return
        for book in self.bid_list:
            info = self.mysqlHelper.get_book_info_by_id(book)
            if info is None:
                continue
            self.mysqlHelper.re_sequence(book)
            url = self.get_chapter_list_url(info['cp_bid'])
            meta = {'bid': book, 'cp_bid': info['cp_bid']}
            yield scrapy.Request(url, callback=self.parse_chapter_list, meta=meta)
    def parse_chapter_list(self, response):
        if response.text == '':
            return None
        chapter_list = self.chapter_list_result(response)
        if chapter_list is not None:
            bid = response.meta.get('bid')
            cp_bid = response.meta.get('cp_bid')
            last_sequence = 0
            for chapter_item in chapter_list:
                last_sequence = last_sequence + 1
                chapter_info = self.mysqlHelper.get_cid_by_bid_sequence(bid, chapter_item['sequence'])
                cid = chapter_item['source_chapter_id']
                meta = chapter_item
                if chapter_info is not None:
                    # Chapter already exists: mark it for an in-place update.
                    meta['type'] = 'update'
                    meta['chapter_content_id'] = chapter_info['chapter_content_id']
                    meta['cid'] = chapter_info['id']
                meta['bid'] = bid
                url = self.get_chapter_content_url(cp_bid, cid)
                yield scrapy.Request(url, callback=self.parse_chapter_content, meta=meta)
            self.__stats.get_value('bid_list').append({'bid': bid, 'end': last_sequence})
    def parse_chapter_content(self, response):
        if response.text == '':
            return None
        now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        result = self.chapter_content_result(response)
        meta = response.meta
        data = {}
        data['bid'] = meta['bid']
        data['name'] = meta['name']
        data['size'] = meta['size']
        data['is_vip'] = meta['is_vip']
        data['sequence'] = meta['sequence']
        data['source_chapter_id'] = meta['source_chapter_id']
        data['recent_update_at'] = meta['recent_update_at']
        data['content'] = formatcontent(result['content'])
        # Values parsed from the content page override those from the list page.
        if result.get('size') is not None:
            data['size'] = result.get('size')
        if result.get('is_vip') is not None:
            data['is_vip'] = result.get('is_vip')
        if result.get('name') is not None:
            data['name'] = result.get('name')
        if result.get('recent_update_at') is not None:
            data['recent_update_at'] = result.get('recent_update_at')
        if result.get('source_chapter_id') is not None:
            data['source_chapter_id'] = result.get('source_chapter_id')
        if meta.get('type') is not None:
            # Existing chapter: rewrite its stored content and metadata in place.
            content = formatcontent(result['content'])
            data['content'] = content
            data['size'] = len(removePunctuation(content))
            self.mysqlHelper.update_content(meta['chapter_content_id'], meta['name'], result['content'])
            if meta.get('cid') is not None:
                data['cid'] = meta['cid']
                self.mysqlHelper.update_chapter(data)
        else:
            # New chapter: emit an item so the pipeline inserts it.
            data['prev_cid'] = 0
            data['next_cid'] = 0
            data['updated_at'] = now
            data['created_at'] = now
            chapter_item = ChapterItem(data)
            yield chapter_item
    # Hooks that concrete spiders must implement.
    def get_chapter_list_url(self, bid):
        raise NotImplementedError

    def chapter_list_result(self, response):
        raise NotImplementedError

    def get_chapter_content_url(self, bid, cid):
        raise NotImplementedError

    def chapter_content_result(self, response):
        raise NotImplementedError
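# Metadata repair spider: for each book id passed via -a bid=..., re-fetches
# the source's book info page and updates the stored record in place.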
class fixBookInfoSpider(scrapy.Spider):
    name = ''
    source = ''
    source_name = ''
    source_id = 0
    def __init__(self, host, user, password, db, bid_list, stats):
        super().__init__()
        self.mysqlHelper = MysqlHelper(host=host, user=user, password=password, db=db,
                                       source=self.source, source_id=self.source_id)
        self.bid_list = bid_list
    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        settings = crawler.settings
        host = settings.get('MYSQL_HOST')
        user = settings.get('MYSQL_USER')
        password = settings.get('MYSQL_PWD')
        db = settings.get('MYSQL_DB')
        bid = kwargs.get('bid')
        if bid is not None:
            bid_list = bid.split(',')
        else:
            bid_list = []
        return cls(host=host, user=user, password=password, db=db,
                   bid_list=bid_list, stats=crawler.stats)
    def start_requests(self):
        if not self.bid_list:
            return
        for book in self.bid_list:
            info = self.mysqlHelper.get_book_info_by_id(book)
            if info is None:
                continue
            url = self.get_book_info_url(info['cp_bid'])
            meta = {'bid': book, 'cp_bid': info['cp_bid']}
            yield scrapy.Request(url, callback=self.parse_book_info, meta=meta)
    def parse_book_info(self, response):
        if response.text == '':
            return None
        result = self.book_info_result(response)
        if result is None:
            return None
        result['bid'] = response.meta['bid']
        self.mysqlHelper.update_book_info(result)
    # Hooks that concrete spiders must implement.
    def get_book_info_url(self, bid):
        raise NotImplementedError

    def book_info_result(self, response):
        raise NotImplementedError
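# ---------------------------------------------------------------------------
# Illustrative sketch only: a minimal concrete spider showing how the
# baseSpider hooks fit together. Everything below is an assumption for
# demonstration purposes -- the site, URLs, and JSON field names are
# hypothetical and do not refer to any real source in this project.
import json


class exampleSpider(baseSpider):
    name = 'example'
    source = 'example'
    source_name = 'Example Source'
    source_id = 99
    custom_settings = {
        'DOWNLOAD_DELAY': 0.01,
        'SOURCE': source,
    }

    def get_start_url(self):
        # Page listing all book ids; bid_list_result parses it.
        return 'https://api.example.com/books'

    def bid_list_result(self, response):
        # Must return an iterable of dicts, each with an 'id' key
        # (hypothetical JSON shape: {"books": [{"id": 1}, ...]}).
        return json.loads(response.text)['books']

    def get_book_info_url(self, bid):
        return 'https://api.example.com/book/{}'.format(bid)

    def book_info_result(self, response):
        # Must return a dict carrying the keys parse_book_info reads:
        # bid, name, author, intro, cover, keyword, status, category,
        # channel, and optionally category_id.
        return json.loads(response.text)

    def get_chapter_list_url(self, bid):
        return 'https://api.example.com/book/{}/chapters'.format(bid)

    def chapter_list_result(self, response):
        # Each dict needs source_chapter_id, name, sequence, size, is_vip
        # and recent_update_at, since parse_chapter_content copies them.
        return json.loads(response.text)['chapters']

    def get_chapter_content_url(self, bid, cid):
        return 'https://api.example.com/book/{}/chapter/{}'.format(bid, cid)

    def chapter_content_result(self, response):
        # Must contain 'content'; size/is_vip/name/recent_update_at and
        # source_chapter_id override the chapter-list values when present.
        return json.loads(response.text)
# ---------------------------------------------------------------------------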
|