zhaoyang před 2 roky
rodič
revize
ec6f9b2b06

+ 1 - 1
content_spider/spiders/futian/futian.py

@@ -11,7 +11,7 @@ import json
 name = 'futian'
 name = 'futian'
 allowed_domains = ['www.futian.com']
 allowed_domains = ['www.futian.com']
 source = 'zy_futian'
 source = 'zy_futian'
-source_name = 'futian伏天	'
+source_name = 'futian伏天'
 source_id = 29
 source_id = 29
 base_url = 'http://www.futianbook.com/api/json'
 base_url = 'http://www.futianbook.com/api/json'
 apikey = 'f454124dc5ac861170e8087d67226536'
 apikey = 'f454124dc5ac861170e8087d67226536'

+ 262 - 0
content_spider/spiders/haoyue/haoyue.py

@@ -0,0 +1,262 @@
+# -*- coding: utf-8 -*-
+import json
+
+import scrapy
+import time
+import random
+from content_spider.mysqlHelper import MysqlHelper
+from content_spider.items import BookInfoItem, ChapterItem
+
+
+# Lookup table used by the spiders below: `id` / `name` identify a category
+# of the source site, the remaining keys are the local channel and category
+# that books of that source category are stored under.
+_CATEGORY_KEYS = ('id', 'name', 'channel_id', 'category_name', 'category_id')
+_CATEGORY_ROWS = (
+    ('1', '都市言情', 2, '婚恋情感', 98),
+    ('2', '时空穿越', 2, '穿越重生', 83),
+    ('3', '总裁豪门', 2, '穿越重生', 83),
+    ('4', '玄幻仙侠', 1, '玄幻奇幻', 23),
+    ('6', '悬疑灵异', 1, '灵异恐怖', 81),
+    ('7', '都市异能', 1, '现代修真', 68),
+    ('8', '历史军事', 1, '特种军旅', 51),
+    ('9', '古代言情', 2, '婚恋情感', 98),
+    ('10', '热血青春', 1, '青春爱情', 94),
+    ('11', '网游竞技', 1, '游戏竞技', 19),
+    ('12', '幻想世界', 2, '东方玄幻', 96),
+    ('13', '社科科普', 2, '东方玄幻', 96),
+    ('14', '经管理财', 2, '其他作品', 127),
+    ('15', '纪实传记', 2, '其他作品', 127),
+    ('16', '励志成功', 2, '其他作品', 127),
+    ('17', '童话寓言', 2, '其他作品', 127),
+    ('18', '外国名著', 2, '其他作品', 127),
+    ('19', '古典名著', 2, '其他作品', 127),
+    ('20', '职场商战', 2, '其他作品', 127),
+    ('21', '当代文学', 2, '其他作品', 127),
+    ('22', '影视娱乐', 2, '其他作品', 127),
+    ('23', '科幻末世', 2, '其他作品', 127),
+    ('24', '同人小说', 2, '其他作品', 127),
+    ('25', '短篇小说', 2, '其他作品', 127),
+)
+# One dict per row, keys in the order given by _CATEGORY_KEYS.
+category_list = [dict(zip(_CATEGORY_KEYS, row)) for row in _CATEGORY_ROWS]
+
+
+class BookSpider(scrapy.Spider):
+    """Import books from haoyuewenxue.com that are not stored locally yet.
+
+    Flow: request the complete book list, skip every book the database
+    already knows for this source, insert the remaining books and request
+    the content of each of their chapters.
+    """
+
+    name = 'haoyue'
+    allowed_domains = ['www.haoyuewenxue.com']
+    source = 'zy_haoyue'
+    source_name = 'haoyue豪阅'
+    source_id  = 28
+    # NOTE(review): API credentials are hard-coded here; consider moving them to settings.
+    apikey = "jR83xjL0E5taO43MHnwKBcM8HSfYP1k4"
+    mchid = "71583342"
+    # The `{}` placeholder is filled with the API method name via str.format().
+    base_url = 'http://www.haoyuewenxue.com/api/{}?apikey='+apikey+'&mchid=' + mchid
+
+    custom_settings = {
+        'DOWNLOAD_DELAY': 0.1,
+        'SOURCE': source,
+        # The date is computed once, when the class body is executed.
+        'LOG_FILE': 'content_spider/log/' + name + time.strftime("%Y-%m-%d", time.localtime()) + '.log'
+    }
+
+    def __init__(self, host, user, password, db, stats, settings):
+        """Open the MySQL helper and initialise the crawl statistics.
+
+        `settings` is accepted for interface compatibility but not used.
+        """
+        scrapy.Spider.__init__(self)
+        source = self.source
+        self.mysqlHelper = MysqlHelper(host=host, user=user, password=password, db=db, source=source,source_id=self.source_id)
+        self.__stats = stats
+        self.__stats.set_value('spider_type', 'add')
+        # Filled with the local id of every book inserted during this run.
+        self.__stats.set_value('bid_list', [])
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        """Build the spider from the crawler's MYSQL_* settings and stats collector."""
+        # NOTE(review): unlike scrapy.Spider.from_crawler this does not call
+        # _set_crawler(); confirm nothing relies on self.crawler / self.settings.
+        settings = crawler.settings
+        host = settings.get('MYSQL_HOST')
+        user = settings.get('MYSQL_USER')
+        password = settings.get('MYSQL_PWD')
+        db = settings.get('MYSQL_DB')
+        return cls(host=host, user=user, password=password, db=db, stats=crawler.stats, settings=settings)
+
+    def start_requests(self):
+        """Start the crawl with the `getallbook` listing."""
+        url = self.base_url.format('getallbook')
+        yield scrapy.Request(url, callback=self.parse_book_list)
+
+    def parse_book_list(self, response):
+        """Request the details of every listed book that is not stored yet."""
+        book_list = json.loads(response.text)
+        for item in book_list['data']:
+            bid = item['id']
+            # Books already known for this source are skipped entirely.
+            if self.mysqlHelper.get_book_info_by_source(bid) is not None:
+                continue
+            url = self.base_url.format('getbookdetail') + '&bookid={}'.format(bid)
+            yield scrapy.Request(url, callback=self.parse_book_info)
+
+    def parse_book_info(self, response):
+        """Insert the book described by `response` and request its chapters.
+
+        Yields one request per chapter; the 1-based position of the chapter
+        in the list is passed on as `sequence`.
+        """
+        if response.text == '':
+            return None
+        now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+        payload = json.loads(response.text)
+        if payload is None:
+            return None
+        data = payload.get('data')
+        if not data:
+            # Without book data nothing can be inserted; skip instead of crashing.
+            self.logger.warning('no book data in response: %s', response.url)
+            return None
+        source_bid = data.get('id')
+        category = self.get_category(str(data['tid']))
+        book_info_item = BookInfoItem()
+        book_info_item['cp_bid'] = source_bid
+        book_info_item['name'] = data['title']
+        book_info_item['cp_name'] = self.source_name
+        book_info_item['cp_id'] = self.source_id
+        book_info_item['author'] = data['author']
+        book_info_item['intro'] = data['description']
+        book_info_item['cover'] = data['litpic']
+        book_info_item['keyword'] = ''
+        book_info_item['category_id'] = category['category_id']
+        book_info_item['status'] = data['isover']
+        book_info_item['size'] = 0
+        book_info_item['category_name'] = category['category_name']
+        book_info_item['last_chapter'] = ''
+        book_info_item['chapter_count'] = 0
+        book_info_item['first_cid'] = 0
+        book_info_item['last_cid'] = 0
+        book_info_item['channel'] = category['channel_id']
+        book_info_item['updated_at'] = now
+        book_info_item['created_at'] = now
+        bid = self.mysqlHelper.insert_book(book_info_item)
+        self.__stats.get_value('bid_list').append(bid)
+
+        chapter_groups = data.get('chapter') or []
+        if not chapter_groups:
+            self.logger.warning('book %s has no chapter list', source_bid)
+            return None
+        # Only the first entry of `chapter` is read.
+        for sequence, chapter_item in enumerate(chapter_groups[0]['list'], 1):
+            source_cid = chapter_item['chapterid']
+            url = self.base_url.format('getbookchapter') + '&bookid={}&chapterid={}'.format(source_bid, source_cid)
+            meta = {'bid': bid, 'source_bid': source_bid, 'sequence': sequence, 'cid': source_cid}
+            yield scrapy.Request(url, callback=self.parse_chapter_content, meta=meta)
+
+    def parse_chapter_content(self, response):
+        """Turn a `getbookchapter` response into a ChapterItem."""
+        if response.text == '':
+            return None
+        now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+        data = json.loads(response.text)['data']
+        request_meta = response.meta
+        sequence = request_meta['sequence']
+        fields = {
+            'bid': request_meta['bid'],
+            'name': data['title'],
+            'sequence': sequence,
+            'content': data['content'],
+            'source_chapter_id': request_meta['cid'],
+            'size': data.get('words'),
+            # Chapters from position 20 onwards are flagged as VIP.
+            'is_vip': 1 if sequence >= 20 else 0,
+            'recent_update_at': data.get('updatetime'),
+            'prev_cid': 0,
+            'next_cid': 0,
+            'updated_at': now,
+            'created_at': now,
+        }
+        yield ChapterItem(fields)
+
+    def get_category(self,tid):
+        """Return the `category_list` entry whose id equals `tid`.
+
+        Falls back to the first entry when no id matches.
+        """
+        wanted = str(tid)
+        for item in category_list:
+            if item['id'] == wanted:
+                return item
+        return category_list[0]
+
+
+
+class BookupdateSpider(scrapy.Spider):
+    """Fetch chapters published after the last stored chapter of known books.
+
+    For every book returned by `get_need_update_book_list()` the chapter list
+    is requested and each chapter that follows the last stored one is
+    downloaded.
+    """
+
+    name = 'haoyueupdate'
+    allowed_domains = ['www.haoyuewenxue.com']
+    source = 'zy_haoyue'
+    source_name = 'haoyue豪阅'
+    source_id  = 28
+    # NOTE(review): API credentials are hard-coded here; consider moving them to settings.
+    apikey = "jR83xjL0E5taO43MHnwKBcM8HSfYP1k4"
+    mchid = "71583342"
+    # The `{}` placeholder is filled with the API method name via str.format().
+    base_url = 'http://www.haoyuewenxue.com/api/{}?apikey='+apikey+'&mchid=' + mchid
+
+    custom_settings = {
+        'DOWNLOAD_DELAY': 0.01,
+        'SOURCE': source,
+        # The date is computed once, when the class body is executed.
+        'LOG_FILE': 'content_spider/log/' + name + time.strftime("%Y-%m-%d", time.localtime()) + '.log'
+    }
+
+    def __init__(self, host, user, password, db, stats):
+        """Open the MySQL helper and initialise the crawl statistics."""
+        scrapy.Spider.__init__(self)
+        source = self.source
+        self.mysqlHelper = MysqlHelper(host=host, user=user, password=password, db=db, source=source,source_id=self.source_id)
+        self.__stats = stats
+        self.__stats.set_value('spider_type', 'update')
+        # Filled with one entry per book that received new chapters.
+        self.__stats.set_value('bid_list', [])
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        """Build the spider from the crawler's MYSQL_* settings and stats collector."""
+        # NOTE(review): unlike scrapy.Spider.from_crawler this does not call
+        # _set_crawler(); confirm nothing relies on self.crawler / self.settings.
+        settings = crawler.settings
+        host = settings.get('MYSQL_HOST')
+        user = settings.get('MYSQL_USER')
+        password = settings.get('MYSQL_PWD')
+        db = settings.get('MYSQL_DB')
+        return cls(host=host, user=user, password=password, db=db, stats=crawler.stats)
+
+    def start_requests(self):
+        """Request the book details of every book that needs an update."""
+        book_list = self.mysqlHelper.get_need_update_book_list()
+        if book_list is not None:
+            for book in book_list:
+                url = self.base_url.format('getbookdetail') + '&bookid={}'.format(book['cp_bid'])
+                meta = {'bid': book['id'], 'source_bid': book['cp_bid']}
+                yield scrapy.Request(url, callback=self.parse_chapter_list, meta=meta)
+
+    def parse_chapter_list(self, response):
+        """Request every chapter that follows the last stored chapter.
+
+        New chapters are numbered `last_sequence + 1`, `last_sequence + 2`, ...
+        so that `sequence` stays a position (as in BookSpider) rather than the
+        source chapter id.
+        """
+        if response.text == '':
+            return None
+        payload = json.loads(response.text)
+        if payload is None:
+            return None
+        data = payload.get('data')
+        if not data:
+            self.logger.warning('no book data in response: %s', response.url)
+            return None
+        bid = response.meta.get('bid')
+        source_bid = response.meta.get('source_bid')
+
+        status = data['isover']
+        if int(status) == 1:
+            self.mysqlHelper.update_book_status(bid,status)
+
+        # Only the first entry of `chapter` is read.
+        chapter_groups = data.get('chapter') or []
+        if not chapter_groups or chapter_groups[0] is None:
+            return None
+        chapter_list = chapter_groups[0]
+
+        last_chapter = self.mysqlHelper.get_last_cid_by_bid(bid)
+        if last_chapter is None:
+            # Nothing stored yet: every listed chapter is new.
+            started = True
+            last_source_cid = None
+            last_chapter_id = 0
+            last_sequence = 0
+        else:
+            started = False
+            last_source_cid = int(last_chapter['source_chapter_id'])
+            last_chapter_id = last_chapter['id']
+            last_sequence = last_chapter['sequence']
+
+        sequence = last_sequence
+        has_new_chapter = False
+        for chapter_item in chapter_list['list']:
+            source_cid = chapter_item['chapterid']
+            if not started:
+                # Skip up to and including the chapter that is already stored.
+                if int(source_cid) == last_source_cid:
+                    started = True
+                continue
+            if not has_new_chapter:
+                # Recorded once per book, on its first new chapter.
+                self.__stats.get_value('bid_list').append(
+                    {"bid": bid, 'start': last_chapter_id, 'start_sequence': last_sequence})
+                has_new_chapter = True
+            sequence += 1
+            url = self.base_url.format('getbookchapter') + '&bookid={}&chapterid={}'.\
+                format(source_bid, source_cid)
+            meta = {'bid': bid, 'source_bid': source_bid, 'sequence': sequence, 'cid': source_cid}
+            yield scrapy.Request(url, callback=self.parse_chapter_content, meta=meta)
+
+    def parse_chapter_content(self, response):
+        """Turn a `getbookchapter` response into a ChapterItem."""
+        if response.text == '':
+            return None
+        now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+        data = json.loads(response.text)['data']
+        request_meta = response.meta
+        sequence = request_meta['sequence']
+        fields = {
+            'bid': request_meta['bid'],
+            'name': data['title'],
+            'sequence': sequence,
+            'content': data['content'],
+            'source_chapter_id': request_meta['cid'],
+            'size': data.get('words'),
+            # Chapters from position 20 onwards are flagged as VIP.
+            'is_vip': 1 if sequence >= 20 else 0,
+            'recent_update_at': data.get('updatetime'),
+            'prev_cid': 0,
+            'next_cid': 0,
+            'updated_at': now,
+            'created_at': now,
+        }
+        yield ChapterItem(fields)

binární
content_spider/spiders/haoyue/网站API文档.doc


binární
content_spider/spiders/haoyue/豪阅文学网书单.xlsx


+ 237 - 1
content_spider/temp_test.py

@@ -88,5 +88,241 @@ for  item in category_list:
         result.append(item2)
         result.append(item2)
 
 
 
 
-print(result)
+
+# Raw source category records. The first three carry only id / classname /
+# pid / level; all later ones additionally have keyword, description and
+# litpic set to None.
+_SOURCE_CLASSES = (
+    ("1", "都市言情"), ("2", "时空穿越"), ("3", "总裁豪门"), ("4", "玄幻仙侠"),
+    ("6", "悬疑灵异"), ("7", "都市异能"), ("8", "历史军事"), ("9", "古代言情"),
+    ("10", "热血青春"), ("11", "网游竞技"), ("12", "幻想世界"), ("13", "社科科普"),
+    ("14", "经管理财"), ("15", "纪实传记"), ("16", "励志成功"), ("17", "童话寓言"),
+    ("18", "外国名著"), ("19", "古典名著"), ("20", "职场商战"), ("21", "当代文学"),
+    ("22", "影视娱乐"), ("23", "科幻末世"), ("24", "同人小说"), ("25", "短篇小说"),
+)
+
+judian = []
+for class_id, class_name in _SOURCE_CLASSES:
+    record = {"id": class_id, "classname": class_name, "pid": "0"}
+    if class_id not in ("1", "2", "3"):
+        record["keyword"] = None
+        record["description"] = None
+        record["litpic"] = None
+    record["level"] = 0
+    judian.append(record)
+
+# Target (channel_id, category_name, category_id) per source category id;
+# every id that is not listed falls back to _DEFAULT_TARGET.
+_DEFAULT_TARGET = (2, "其他作品", 127)
+_TARGETS = {
+    "1": (2, "婚恋情感", 98),
+    "9": (2, "婚恋情感", 98),
+    "2": (2, "穿越重生", 83),
+    "3": (2, "穿越重生", 83),
+    "4": (1, "玄幻奇幻", 23),
+    "6": (1, "灵异恐怖", 81),
+    "7": (1, "现代修真", 68),
+    "8": (1, "特种军旅", 51),
+    "10": (1, "青春爱情", 94),
+    "11": (1, "游戏竞技", 19),
+    "12": (2, "东方玄幻", 96),
+    "13": (2, "东方玄幻", 96),
+}
+
+jr = []
+for uitem in judian:
+    channel_id, category_name, category_id = _TARGETS.get(uitem['id'], _DEFAULT_TARGET)
+    jr.append({
+        "id": uitem['id'],
+        "name": uitem['classname'],
+        'channel_id': channel_id,
+        'category_name': category_name,
+        'category_id': category_id,
+    })
+
+print(jr)
+