|  | @@ -0,0 +1,262 @@
 | 
	
		
			
				|  |  | +# -*- coding: utf-8 -*-
 | 
	
		
			
				|  |  | +import json
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +import scrapy
 | 
	
		
			
				|  |  | +import time
 | 
	
		
			
				|  |  | +import random
 | 
	
		
			
				|  |  | +from content_spider.mysqlHelper import MysqlHelper
 | 
	
		
			
				|  |  | +from content_spider.items import BookInfoItem, ChapterItem
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +category_list = [{'id': '1', 'name': '都市言情', 'channel_id': 2, 'category_name': '婚恋情感', 'category_id': 98},
 | 
	
		
			
				|  |  | + {'id': '2', 'name': '时空穿越', 'channel_id': 2, 'category_name': '穿越重生', 'category_id': 83},
 | 
	
		
			
				|  |  | + {'id': '3', 'name': '总裁豪门', 'channel_id': 2, 'category_name': '穿越重生', 'category_id': 83},
 | 
	
		
			
				|  |  | + {'id': '4', 'name': '玄幻仙侠', 'channel_id': 1, 'category_name': '玄幻奇幻', 'category_id': 23},
 | 
	
		
			
				|  |  | + {'id': '6', 'name': '悬疑灵异', 'channel_id': 1, 'category_name': '灵异恐怖', 'category_id': 81},
 | 
	
		
			
				|  |  | + {'id': '7', 'name': '都市异能', 'channel_id': 1, 'category_name': '现代修真', 'category_id': 68},
 | 
	
		
			
				|  |  | + {'id': '8', 'name': '历史军事', 'channel_id': 1, 'category_name': '特种军旅', 'category_id': 51},
 | 
	
		
			
				|  |  | + {'id': '9', 'name': '古代言情', 'channel_id': 2, 'category_name': '婚恋情感', 'category_id': 98},
 | 
	
		
			
				|  |  | + {'id': '10', 'name': '热血青春', 'channel_id': 1, 'category_name': '青春爱情', 'category_id': 94},
 | 
	
		
			
				|  |  | + {'id': '11', 'name': '网游竞技', 'channel_id': 1, 'category_name': '游戏竞技', 'category_id': 19},
 | 
	
		
			
				|  |  | + {'id': '12', 'name': '幻想世界', 'channel_id': 2, 'category_name': '东方玄幻', 'category_id': 96},
 | 
	
		
			
				|  |  | + {'id': '13', 'name': '社科科普', 'channel_id': 2, 'category_name': '东方玄幻', 'category_id': 96},
 | 
	
		
			
				|  |  | + {'id': '14', 'name': '经管理财', 'channel_id': 2, 'category_name': '其他作品', 'category_id': 127},
 | 
	
		
			
				|  |  | + {'id': '15', 'name': '纪实传记', 'channel_id': 2, 'category_name': '其他作品', 'category_id': 127},
 | 
	
		
			
				|  |  | + {'id': '16', 'name': '励志成功', 'channel_id': 2, 'category_name': '其他作品', 'category_id': 127},
 | 
	
		
			
				|  |  | + {'id': '17', 'name': '童话寓言', 'channel_id': 2, 'category_name': '其他作品', 'category_id': 127},
 | 
	
		
			
				|  |  | + {'id': '18', 'name': '外国名著', 'channel_id': 2, 'category_name': '其他作品', 'category_id': 127},
 | 
	
		
			
				|  |  | + {'id': '19', 'name': '古典名著', 'channel_id': 2, 'category_name': '其他作品', 'category_id': 127},
 | 
	
		
			
				|  |  | + {'id': '20', 'name': '职场商战', 'channel_id': 2, 'category_name': '其他作品', 'category_id': 127},
 | 
	
		
			
				|  |  | + {'id': '21', 'name': '当代文学', 'channel_id': 2, 'category_name': '其他作品', 'category_id': 127},
 | 
	
		
			
				|  |  | + {'id': '22', 'name': '影视娱乐', 'channel_id': 2, 'category_name': '其他作品', 'category_id': 127},
 | 
	
		
			
				|  |  | + {'id': '23', 'name': '科幻末世', 'channel_id': 2, 'category_name': '其他作品', 'category_id': 127},
 | 
	
		
			
				|  |  | + {'id': '24', 'name': '同人小说', 'channel_id': 2, 'category_name': '其他作品', 'category_id': 127},
 | 
	
		
			
				|  |  | + {'id': '25', 'name': '短篇小说', 'channel_id': 2, 'category_name': '其他作品', 'category_id': 127}]
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +class BookSpider(scrapy.Spider):
 | 
	
		
			
				|  |  | +    name = 'haoyue'
 | 
	
		
			
				|  |  | +    allowed_domains = ['www.haoyuewenxue.com']
 | 
	
		
			
				|  |  | +    source = 'zy_haoyue'
 | 
	
		
			
				|  |  | +    source_name = 'haoyue豪阅'
 | 
	
		
			
				|  |  | +    source_id  = 28
 | 
	
		
			
				|  |  | +    apikey = "jR83xjL0E5taO43MHnwKBcM8HSfYP1k4"
 | 
	
		
			
				|  |  | +    mchid = "71583342"
 | 
	
		
			
				|  |  | +    base_url = 'http://www.haoyuewenxue.com/api/{}?apikey='+apikey+'&mchid=' + mchid
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    custom_settings = {
 | 
	
		
			
				|  |  | +        'DOWNLOAD_DELAY': 0.1,
 | 
	
		
			
				|  |  | +        'SOURCE': source,
 | 
	
		
			
				|  |  | +        'LOG_FILE': 'content_spider/log/' + name + time.strftime("%Y-%m-%d", time.localtime()) + '.log'
 | 
	
		
			
				|  |  | +    }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    def __init__(self, host, user, password, db, stats, settings):
 | 
	
		
			
				|  |  | +        scrapy.Spider.__init__(self)
 | 
	
		
			
				|  |  | +        source = self.source
 | 
	
		
			
				|  |  | +        self.mysqlHelper = MysqlHelper(host=host, user=user, password=password, db=db, source=source,source_id=self.source_id)
 | 
	
		
			
				|  |  | +        self.__stats = stats
 | 
	
		
			
				|  |  | +        self.__stats.set_value('spider_type', 'add')
 | 
	
		
			
				|  |  | +        self.__stats.set_value('bid_list', [])
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    @classmethod
 | 
	
		
			
				|  |  | +    def from_crawler(cls, crawler):
 | 
	
		
			
				|  |  | +        settings = crawler.settings
 | 
	
		
			
				|  |  | +        host = settings.get('MYSQL_HOST')
 | 
	
		
			
				|  |  | +        user = settings.get('MYSQL_USER')
 | 
	
		
			
				|  |  | +        password = settings.get('MYSQL_PWD')
 | 
	
		
			
				|  |  | +        db = settings.get('MYSQL_DB')
 | 
	
		
			
				|  |  | +        return cls(host=host, user=user, password=password, db=db, stats=crawler.stats, settings=settings)
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    def start_requests(self):
 | 
	
		
			
				|  |  | +        url = self.base_url.format('getallbook')
 | 
	
		
			
				|  |  | +        yield scrapy.Request(url, callback=self.parse_book_list)
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    def parse_book_list(self, response):
 | 
	
		
			
				|  |  | +        result = json.loads(response.text)
 | 
	
		
			
				|  |  | +        for item in result['data']:
 | 
	
		
			
				|  |  | +            bid = item['id']
 | 
	
		
			
				|  |  | +            result = self.mysqlHelper.get_book_info_by_source(bid)
 | 
	
		
			
				|  |  | +            if result is not None:
 | 
	
		
			
				|  |  | +                continue
 | 
	
		
			
				|  |  | +            url = self.base_url.format('getbookdetail') + '&bookid={}'.format(bid)
 | 
	
		
			
				|  |  | +            yield scrapy.Request(url, callback=self.parse_book_info)
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    def parse_book_info(self, response):
 | 
	
		
			
				|  |  | +        if response.text == '':
 | 
	
		
			
				|  |  | +            return None
 | 
	
		
			
				|  |  | +        now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
 | 
	
		
			
				|  |  | +        result = json.loads(response.text)
 | 
	
		
			
				|  |  | +        if result is None:
 | 
	
		
			
				|  |  | +            return None
 | 
	
		
			
				|  |  | +        result = result['data']
 | 
	
		
			
				|  |  | +        book_info_item = BookInfoItem()
 | 
	
		
			
				|  |  | +        source_bid = result.get('id')
 | 
	
		
			
				|  |  | +        category = self.get_category(str(result['tid']))
 | 
	
		
			
				|  |  | +        book_info_item['cp_bid'] = source_bid
 | 
	
		
			
				|  |  | +        book_info_item['name'] = result['title']
 | 
	
		
			
				|  |  | +        book_info_item['cp_name'] = self.source_name
 | 
	
		
			
				|  |  | +        book_info_item['cp_id'] = self.source_id
 | 
	
		
			
				|  |  | +        book_info_item['author'] = result['author']
 | 
	
		
			
				|  |  | +        book_info_item['intro'] = result['description']
 | 
	
		
			
				|  |  | +        book_info_item['cover'] = result['litpic']
 | 
	
		
			
				|  |  | +        book_info_item['keyword'] = ''
 | 
	
		
			
				|  |  | +        book_info_item['category_id'] = category['category_id']
 | 
	
		
			
				|  |  | +        book_info_item['status'] = result['isover']
 | 
	
		
			
				|  |  | +        book_info_item['size'] = 0
 | 
	
		
			
				|  |  | +        book_info_item['category_name'] = category['category_name']
 | 
	
		
			
				|  |  | +        book_info_item['last_chapter'] = ''
 | 
	
		
			
				|  |  | +        book_info_item['chapter_count'] = 0
 | 
	
		
			
				|  |  | +        book_info_item['first_cid'] = 0
 | 
	
		
			
				|  |  | +        book_info_item['last_cid'] = 0
 | 
	
		
			
				|  |  | +        book_info_item['channel'] = category['channel_id']
 | 
	
		
			
				|  |  | +        book_info_item['updated_at'] = now
 | 
	
		
			
				|  |  | +        book_info_item['created_at'] = now
 | 
	
		
			
				|  |  | +        bid = self.mysqlHelper.insert_book(book_info_item)
 | 
	
		
			
				|  |  | +        self.__stats.get_value('bid_list').append(bid)
 | 
	
		
			
				|  |  | +        chapter = result['chapter'][0]
 | 
	
		
			
				|  |  | +        i = 1
 | 
	
		
			
				|  |  | +        for chapter_item in chapter['list']:
 | 
	
		
			
				|  |  | +            url = self.base_url.format('getbookchapter') + '&bookid={}&chapterid={}'.format(source_bid, chapter_item['chapterid'])
 | 
	
		
			
				|  |  | +            meta = {'bid': bid, 'source_bid': source_bid, 'sequence': i, 'cid': chapter_item['chapterid']}
 | 
	
		
			
				|  |  | +            i = i+1
 | 
	
		
			
				|  |  | +            yield scrapy.Request(url, callback=self.parse_chapter_content, meta=meta)
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    def parse_chapter_content(self, response):
 | 
	
		
			
				|  |  | +        if response.text == '':
 | 
	
		
			
				|  |  | +            return None
 | 
	
		
			
				|  |  | +        now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
 | 
	
		
			
				|  |  | +        result = json.loads(response.text)
 | 
	
		
			
				|  |  | +        result = result['data']
 | 
	
		
			
				|  |  | +        old_meta = response.meta
 | 
	
		
			
				|  |  | +        meta = dict()
 | 
	
		
			
				|  |  | +        meta['bid'] = old_meta['bid']
 | 
	
		
			
				|  |  | +        meta['name'] = result['title']
 | 
	
		
			
				|  |  | +        meta['sequence'] = old_meta['sequence']
 | 
	
		
			
				|  |  | +        meta['content'] = result['content']
 | 
	
		
			
				|  |  | +        meta['source_chapter_id'] = old_meta['cid']
 | 
	
		
			
				|  |  | +        meta['size'] = result.get('words')
 | 
	
		
			
				|  |  | +        meta['is_vip'] = 1 if old_meta['sequence'] >= 20 else 0
 | 
	
		
			
				|  |  | +        meta['recent_update_at'] = result.get('updatetime')
 | 
	
		
			
				|  |  | +        meta['prev_cid'] = 0
 | 
	
		
			
				|  |  | +        meta['next_cid'] = 0
 | 
	
		
			
				|  |  | +        meta['updated_at'] = now
 | 
	
		
			
				|  |  | +        meta['created_at'] = now
 | 
	
		
			
				|  |  | +        chapter_item = ChapterItem(meta)
 | 
	
		
			
				|  |  | +        yield chapter_item
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    def get_category(self,tid):
 | 
	
		
			
				|  |  | +        for item in category_list:
 | 
	
		
			
				|  |  | +            if str(tid) == item['id']:
 | 
	
		
			
				|  |  | +                return item
 | 
	
		
			
				|  |  | +        return category_list[0]
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +class BookupdateSpider(scrapy.Spider):
 | 
	
		
			
				|  |  | +    name = 'haoyueupdate'
 | 
	
		
			
				|  |  | +    allowed_domains = ['www.haoyuewenxue.com']
 | 
	
		
			
				|  |  | +    source = 'zy_haoyue'
 | 
	
		
			
				|  |  | +    source_name = 'haoyue豪阅'
 | 
	
		
			
				|  |  | +    source_id  = 28
 | 
	
		
			
				|  |  | +    apikey = "jR83xjL0E5taO43MHnwKBcM8HSfYP1k4"
 | 
	
		
			
				|  |  | +    mchid = "71583342"
 | 
	
		
			
				|  |  | +    base_url = 'http://www.haoyuewenxue.com/api/{}?apikey='+apikey+'&mchid=' + mchid
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    custom_settings = {
 | 
	
		
			
				|  |  | +        'DOWNLOAD_DELAY': 0.01,
 | 
	
		
			
				|  |  | +        'SOURCE': source,
 | 
	
		
			
				|  |  | +        'LOG_FILE': 'content_spider/log/' + name + time.strftime("%Y-%m-%d", time.localtime()) + '.log'
 | 
	
		
			
				|  |  | +    }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    def __init__(self, host, user, password, db, stats):
 | 
	
		
			
				|  |  | +        scrapy.Spider.__init__(self)
 | 
	
		
			
				|  |  | +        source = self.source
 | 
	
		
			
				|  |  | +        self.mysqlHelper = MysqlHelper(host=host, user=user, password=password, db=db, source=source,source_id=self.source_id)
 | 
	
		
			
				|  |  | +        self.__stats = stats
 | 
	
		
			
				|  |  | +        self.__stats.set_value('spider_type', 'update')
 | 
	
		
			
				|  |  | +        self.__stats.set_value('bid_list', [])
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    @classmethod
 | 
	
		
			
				|  |  | +    def from_crawler(cls, crawler):
 | 
	
		
			
				|  |  | +        settings = crawler.settings
 | 
	
		
			
				|  |  | +        host = settings.get('MYSQL_HOST')
 | 
	
		
			
				|  |  | +        user = settings.get('MYSQL_USER')
 | 
	
		
			
				|  |  | +        password = settings.get('MYSQL_PWD')
 | 
	
		
			
				|  |  | +        db = settings.get('MYSQL_DB')
 | 
	
		
			
				|  |  | +        return cls(host=host, user=user, password=password, db=db, stats=crawler.stats)
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    def start_requests(self):
 | 
	
		
			
				|  |  | +        book_list = self.mysqlHelper.get_need_update_book_list()
 | 
	
		
			
				|  |  | +        if book_list is not None:
 | 
	
		
			
				|  |  | +            for book in book_list:
 | 
	
		
			
				|  |  | +                url = self.base_url.format('getbookdetail') + '&bookid={}'.format(book['cp_bid'])
 | 
	
		
			
				|  |  | +                meta = {'bid': book['id'], 'source_bid': book['cp_bid']}
 | 
	
		
			
				|  |  | +                yield scrapy.Request(url, callback=self.parse_chapter_list, meta=meta)
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    def parse_chapter_list(self, response):
 | 
	
		
			
				|  |  | +        if response.text == '':
 | 
	
		
			
				|  |  | +            return None
 | 
	
		
			
				|  |  | +        now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
 | 
	
		
			
				|  |  | +        result = json.loads(response.text)
 | 
	
		
			
				|  |  | +        if result is None:
 | 
	
		
			
				|  |  | +            return None
 | 
	
		
			
				|  |  | +        result = result['data']
 | 
	
		
			
				|  |  | +        bid = response.meta.get('bid')
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        status = result['isover']
 | 
	
		
			
				|  |  | +        if int(status) == 1:
 | 
	
		
			
				|  |  | +            self.mysqlHelper.update_book_status(bid,status)
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        chapter_list = result['chapter'][0]
 | 
	
		
			
				|  |  | +        meta = response.meta
 | 
	
		
			
				|  |  | +        if chapter_list is not None:
 | 
	
		
			
				|  |  | +            source_bid = response.meta.get('source_bid')
 | 
	
		
			
				|  |  | +            last_chapter = self.mysqlHelper.get_last_cid_by_bid(bid)
 | 
	
		
			
				|  |  | +            start = False
 | 
	
		
			
				|  |  | +            if last_chapter is None:
 | 
	
		
			
				|  |  | +                start = True
 | 
	
		
			
				|  |  | +                last_source_cid = ''
 | 
	
		
			
				|  |  | +                last_chapter_id = 0
 | 
	
		
			
				|  |  | +                last_sequence = 0
 | 
	
		
			
				|  |  | +            else:
 | 
	
		
			
				|  |  | +                last_source_cid = str(last_chapter['source_chapter_id'])
 | 
	
		
			
				|  |  | +                last_chapter_id = last_chapter['id']
 | 
	
		
			
				|  |  | +                last_sequence = last_chapter['sequence']
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +            has_new_chapter = False
 | 
	
		
			
				|  |  | +            for chapter_item in chapter_list['list']:
 | 
	
		
			
				|  |  | +                if not start:
 | 
	
		
			
				|  |  | +                    if int(chapter_item['chapterid']) == int(last_source_cid):
 | 
	
		
			
				|  |  | +                        start = True
 | 
	
		
			
				|  |  | +                    continue
 | 
	
		
			
				|  |  | +                if not has_new_chapter:
 | 
	
		
			
				|  |  | +                    self.__stats.get_value('bid_list').append(
 | 
	
		
			
				|  |  | +                        {"bid": meta['bid'], 'start': last_chapter_id, 'start_sequence': last_sequence})
 | 
	
		
			
				|  |  | +                    has_new_chapter = True
 | 
	
		
			
				|  |  | +                url = self.base_url.format('getbookchapter') + '&bookid={}&chapterid={}'.\
 | 
	
		
			
				|  |  | +                    format(source_bid, chapter_item['chapterid'])
 | 
	
		
			
				|  |  | +                meta = {'bid': bid, 'source_bid': source_bid, 'sequence': chapter_item['chapterid'], 'cid': chapter_item['chapterid']}
 | 
	
		
			
				|  |  | +                yield scrapy.Request(url, callback=self.parse_chapter_content, meta=meta)
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    def parse_chapter_content(self, response):
 | 
	
		
			
				|  |  | +        if response.text == '':
 | 
	
		
			
				|  |  | +            return None
 | 
	
		
			
				|  |  | +        now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
 | 
	
		
			
				|  |  | +        result = json.loads(response.text)
 | 
	
		
			
				|  |  | +        result = result['data']
 | 
	
		
			
				|  |  | +        old_meta = response.meta
 | 
	
		
			
				|  |  | +        meta = dict()
 | 
	
		
			
				|  |  | +        meta['bid'] = old_meta['bid']
 | 
	
		
			
				|  |  | +        meta['name'] = result['title']
 | 
	
		
			
				|  |  | +        meta['sequence'] = old_meta['sequence']
 | 
	
		
			
				|  |  | +        meta['content'] = result['content']
 | 
	
		
			
				|  |  | +        meta['source_chapter_id'] = old_meta['cid']
 | 
	
		
			
				|  |  | +        meta['size'] = result.get('words')
 | 
	
		
			
				|  |  | +        meta['is_vip'] = 1 if old_meta['sequence'] >= 20 else 0
 | 
	
		
			
				|  |  | +        meta['recent_update_at'] = result.get('updatetime')
 | 
	
		
			
				|  |  | +        meta['prev_cid'] = 0
 | 
	
		
			
				|  |  | +        meta['next_cid'] = 0
 | 
	
		
			
				|  |  | +        meta['updated_at'] = now
 | 
	
		
			
				|  |  | +        meta['created_at'] = now
 | 
	
		
			
				|  |  | +        chapter_item = ChapterItem(meta)
 | 
	
		
			
				|  |  | +        yield chapter_item
 |