zhaoyang 2 年 前
コミット
913634cdb7

+ 0 - 0
content_spider/spiders/wandu/__init__.py


+ 171 - 0
content_spider/spiders/wandu/wandu.py

@@ -0,0 +1,171 @@
+# -*- coding: utf-8 -*-
+
+from content_spider.baseSpider import baseSpider
+from content_spider.baseSpider import baseUpdateSpider
+from content_spider.baseSpider import fixChapterSpider
+from content_spider.baseSpider import baseUpdateBookStatusSpider
+from content_spider.Util import md5
+import time
+import json
+
+
+# Base spider name; the spider classes in this file derive their own names
+# and log-file names from it.
+name = 'wandu'
+allowed_domains = ['api.wandu.cn']
+# Source identifiers; re-exported as class attributes on wanduProcess.
+source = 'zy_wandu'
+source_name = '万读'
+source_id = 44
+# App id appended as the ``appid`` query parameter of every URL built from base_url.
+appid = 'wdca9d79a7b66c39eb'
+# URL template: ``{}`` is filled with the endpoint name (e.g. 'getwandubooklist').
+base_url = 'https://api.wandu.cn/open/search/{}?appid=' + appid 
+
+
+# Category mapping table used by get_category(): 'id' is compared with the
+# book's tag_id, and book_info_result() reads 'category_id' and
+# 'category_name' from the matched entry. The first entry is also what
+# get_category() returns when no 'id' matches.
+category = [{'id': 1, 'name': '都市言情', 'channel_id': 2, 'category_id': 88, 'category_name': '豪门总裁'},
+ {'id': 2, 'name': '古代言情', 'channel_id': 2, 'category_id': 83, 'category_name': '穿越重生'},
+ {'id': 4, 'name': '青春校园', 'channel_id': 2, 'category_id': 104, 'category_name': '青春校园'},
+ {'id': 5, 'name': '总裁豪门', 'channel_id': 2, 'category_id': 88, 'category_name': '豪门总裁'},
+ {'id': 15, 'name': '灵异惊悚', 'channel_id': 2, 'category_id': 114, 'category_name': '恐怖惊悚'},
+ {'id': 29, 'name': '现代言情', 'channel_id': 2, 'category_id': 88, 'category_name': '豪门总裁'},
+ {'id': 30, 'name': '幻想言情', 'channel_id': 2, 'category_id': 110, 'category_name': '上古蛮荒'},
+ {'id': 31, 'name': '魔幻异界', 'channel_id': 2, 'category_id': 110, 'category_name': '上古蛮荒'},
+ {'id': 32, 'name': '仙侠情缘', 'channel_id': 2, 'category_id': 97, 'category_name': '古典仙侠'},
+ {'id': 33, 'name': '推理悬疑', 'channel_id': 2, 'category_id': 113, 'category_name': '悬疑探险'},
+ {'id': 34, 'name': '次元同人', 'channel_id': 2, 'category_id': 107, 'category_name': '其他'},
+ {'id': 36, 'name': '游戏竞技', 'channel_id': 2, 'category_id': 119, 'category_name': '游戏'},
+ {'id': 37, 'name': '短篇其他', 'channel_id': 2, 'category_id': 107, 'category_name': '其他'},
+ {'id': 38, 'name': '穿越重生', 'channel_id': 2, 'category_id': 83, 'category_name': '穿越重生'},
+ {'id': 39, 'name': '女尊女强', 'channel_id': 2, 'category_id': 123, 'category_name': '女尊王朝'},
+ {'id': 40, 'name': '轻松爆笑', 'channel_id': 2, 'category_id': 107, 'category_name': '其他'},
+ {'id': 42, 'name': '纯爱', 'channel_id': 2, 'category_id': 107, 'category_name': '其他'},
+ {'id': 6, 'name': '悬疑惊悚', 'channel_id': 1, 'category_id': 81, 'category_name': '灵异恐怖'},
+ {'id': 7, 'name': '婚恋生活', 'channel_id': 1, 'category_id': 54, 'category_name': '都市爱情'},
+ {'id': 9, 'name': '热血青春', 'channel_id': 1, 'category_id': 94, 'category_name': '青春爱情'},
+ {'id': 10, 'name': '游戏竞技', 'channel_id': 1, 'category_id': 19, 'category_name': '游戏竞技'},
+ {'id': 11, 'name': '都市情感', 'channel_id': 1, 'category_id': 54, 'category_name': '都市爱情'},
+ {'id': 12, 'name': '官场仕途', 'channel_id': 1, 'category_id': 55, 'category_name': '官场沉浮'},
+ {'id': 13, 'name': '武侠仙侠', 'channel_id': 1, 'category_id': 21, 'category_name': '武侠仙侠'},
+ {'id': 14, 'name': '都市生活', 'channel_id': 1, 'category_id': 54, 'category_name': '都市爱情'},
+ {'id': 16, 'name': '乡村生活', 'channel_id': 1, 'category_id': 57, 'category_name': '乡土风情'},
+ {'id': 17, 'name': '修真玄幻', 'channel_id': 1, 'category_id': 23, 'category_name': '玄幻奇幻'},
+ {'id': 18, 'name': '军事历史', 'channel_id': 1, 'category_id': 14, 'category_name': '历史穿越'},
+ {'id': 19, 'name': '都市异能', 'channel_id': 1, 'category_id': 68, 'category_name': '现代修真'},
+ {'id': 20, 'name': '玄幻奇幻', 'channel_id': 1, 'category_id': 23, 'category_name': '玄幻奇幻'},
+ {'id': 24, 'name': '科幻未来', 'channel_id': 1, 'category_id': 127, 'category_name': '其他作品'},
+ {'id': 25, 'name': '都市 娱乐', 'channel_id': 1, 'category_id': 54, 'category_name': '都市爱情'},
+ {'id': 26, 'name': '次元同人', 'channel_id': 1, 'category_id': 127, 'category_name': '其他作品'},
+ {'id': 27, 'name': '短篇其他', 'channel_id': 1, 'category_id': 127, 'category_name': '其他作品'},
+ {'id': 28, 'name': '现实反思', 'channel_id': 1, 'category_id': 127, 'category_name': '其他作品'},
+ {'id': 41, 'name': '轻松爆笑', 'channel_id': 1, 'category_id': 127, 'category_name': '其他作品'}]
+
+
+def get_category(tag_id):
+    """Return the entry of ``category`` whose ``id`` equals ``tag_id``.
+
+    :param tag_id: tag id from the book-info payload; any value ``int()``
+        accepts (int or numeric string).
+    :return: the matching mapping dict, or ``category[0]`` when the id is
+        unknown, missing (``None``) or not numeric.
+    """
+    try:
+        wanted = int(tag_id)
+    except (TypeError, ValueError):
+        # A missing or malformed tag gets the same default as an unknown id
+        # instead of aborting the whole book.
+        return category[0]
+    for item in category:
+        if item['id'] == wanted:
+            return item
+    return category[0]
+
+
+class wanduProcess(object):
+    """URL building and JSON parsing for the Wandu API (api.wandu.cn).
+
+    Mixed into the spider classes in this file. Every ``*_url`` method
+    returns a request URL and every ``*_result`` method converts the
+    matching JSON response into plain dicts/lists. How the base spider
+    classes invoke these hooks is not visible in this file.
+    """
+    name = name
+    allowed_domains = allowed_domains
+    source = source
+    source_name = source_name
+    source_id = source_id
+
+    def get_start_url(self):
+        """Return the book-list endpoint URL."""
+        # https://api.wandu.cn/open/search/getwandubooklist?appid=xxx
+        return base_url.format('getwandubooklist')
+
+    def bid_list_result(self, response):
+        """Parse the book-list response into ``[{'id': novel_id}, ...]``.
+
+        Returns an empty list when the payload is null or has no ``data``.
+        """
+        result = json.loads(response.text)
+        if result is None or result.get('data') is None:
+            return []
+        return [{'id': item['novel_id']} for item in result['data']]
+
+    def get_book_info_url(self, bid):
+        """Return the book-info endpoint URL for book ``bid``."""
+        # https://api.wandu.cn/open/search/getwandubookinfo?appid=xxx&bookid=xx
+        return base_url.format('getwandubookinfo') + '&bookid={}'.format(bid)
+
+    def book_info_result(self, response):
+        """Parse the book-info response into the book dict.
+
+        ``category`` and ``category_id`` come from the mapping entry that
+        ``get_category`` selects for the payload's ``tag_id``. Raises
+        ``KeyError`` / ``TypeError`` when the payload lacks ``data`` or one
+        of the fields read below.
+        """
+        result = json.loads(response.text)['data']
+        category_info = get_category(result['tag_id'])
+        # NOTE(review): 'channel' is taken from the API payload, not from
+        # category_info['channel_id'] -- confirm this is intended.
+        return {
+            'bid': result['novel_id'],
+            'name': result['novel_name'],
+            'author': result['author_name'],
+            'intro': result['summary'],
+            'cover': result['pic'],
+            'keyword': '',
+            'status': result['is_done'],
+            'category': category_info['category_name'],
+            'category_id': category_info['category_id'],
+            'channel': result['channel_id'],
+        }
+
+    def get_chapter_list_url(self, bid):
+        """Return the chapter-list endpoint URL for book ``bid``."""
+        # https://api.wandu.cn/open/search/getwanduchapterlist?appid=xxx&bookid=xx
+        # per_num=10000 is sent with every chapter-list request.
+        return base_url.format('getwanduchapterlist') + '&bookid={}&per_num=10000'.format(bid)
+
+    def chapter_list_result(self, response):
+        """Parse the chapter-list response into a list of chapter dicts.
+
+        Returns an empty list when the payload is null or has no ``data``.
+        """
+        result = json.loads(response.text)
+        if result is None or result.get('data') is None:
+            return []
+        return [
+            {
+                'source_chapter_id': chapter_item['paragraph_id'],
+                'name': chapter_item['title'],
+                'sequence': chapter_item['order_num'],
+                'is_vip': chapter_item['is_vip'],
+                'size': chapter_item['word_num'],
+                'recent_update_at': chapter_item['update_time'],
+            }
+            for chapter_item in result['data']['lists']
+        ]
+
+    def get_chapter_content_url(self, bid, cid):
+        """Return the chapter-content endpoint URL for chapter ``cid`` of book ``bid``."""
+        # https://api.wandu.cn/open/search/getwanduchaptercontent?appid=xxx&bookid=xx&chapterid=xx
+        return base_url.format('getwanduchaptercontent') + '&bookid={}&chapterid={}'.format(bid, cid)
+
+    def chapter_content_result(self, response):
+        """Parse the chapter-content response into ``{'content': text}``.
+
+        Returns ``{'content': ''}`` when the payload is null or has no
+        ``data``, the same guard the list parsers above apply.
+        """
+        result = json.loads(response.text)
+        data = result.get('data') if result is not None else None
+        if data is None:
+            return {'content': ''}
+        return {'content': data['content']}
+
+    
+class wanduSpider(wanduProcess, baseSpider):
+    """Spider named ``wandu``: wanduProcess hooks combined with baseSpider."""
+    name = name
+
+    # The log file name is the spider name plus the local date at the moment
+    # this class body is executed (import time).
+    custom_settings = dict(
+        DOWNLOAD_DELAY=0.1,
+        SOURCE=source,
+        LOG_FILE=''.join(['content_spider/log/', name, time.strftime("%Y-%m-%d"), '.log']),
+    )
+
+
+class wanduUpdateSpider(wanduProcess, baseUpdateSpider):
+    """Spider named ``wanduupdate``: wanduProcess hooks combined with baseUpdateSpider."""
+    name = name + "update"
+
+    # ``name`` below resolves to the class-level value assigned just above,
+    # so the log file is named after this spider, not the module constant.
+    custom_settings = dict(
+        DOWNLOAD_DELAY=0.1,
+        SOURCE=source,
+        LOG_FILE=''.join(['content_spider/log/', name, time.strftime("%Y-%m-%d"), '.log']),
+    )
+
+
+    
+class wanduFixSpider(wanduProcess, fixChapterSpider):
+    """Spider named ``wandufix``: wanduProcess hooks combined with fixChapterSpider."""
+    name = name + 'fix'
+
+    # ``name`` below resolves to the class-level value assigned just above,
+    # so the log file is named after this spider, not the module constant.
+    custom_settings = dict(
+        DOWNLOAD_DELAY=0.1,
+        SOURCE=source,
+        LOG_FILE=''.join(['content_spider/log/', name, time.strftime("%Y-%m-%d"), '.log']),
+    )
+
+
+class wanduBookInfoSpider(wanduProcess, baseUpdateBookStatusSpider):
+    """Spider named ``wandubookinfo``: wanduProcess hooks combined with baseUpdateBookStatusSpider."""
+    name = name + "bookinfo"
+
+    # ``name`` below resolves to the class-level value assigned just above,
+    # so the log file is named after this spider, not the module constant.
+    custom_settings = dict(
+        DOWNLOAD_DELAY=0.1,
+        SOURCE=source,
+        LOG_FILE=''.join(['content_spider/log/', name, time.strftime("%Y-%m-%d"), '.log']),
+    )

BIN
content_spider/spiders/wandu/wandu_api_docs_v1.1.pdf


BIN
content_spider/spiders/wandu/万读分类映射.xlsx


+ 75 - 63
content_spider/temp_test.py

@@ -13,92 +13,104 @@ from xml.dom.minidom import parseString
 import time
 import xlrd
 
-df = xlrd.open_workbook("./1.xls")
+# Raw string: same path value, but the backslashes are no longer parsed as
+# (invalid) escape sequences such as \p, \z and \c.
+df = xlrd.open_workbook(r"D:\project\zhiyu_content_spider\content_spider/12.xls")
 table=df.sheets()[0]
 
 
 result = []
 
-for i in range(0,table.nrows):
+for i in range(1,table.nrows):
     row = table.row_values(i)
     if len(row) <= 0:
         break
-    c_id = int(row[2])
+    c_id = int(row[0])
 
-    name = row[3]
+    name = row[1]
     item = {"id":c_id,"name":name}
-    if c_id in [19,20,21,22,23,24,101]:
-        item['channel_id'] = 1
-        item['category_id'] = 23
-        item['category_name'] = '玄幻奇幻'
-    if c_id in [26,27,29,30]:
-        item['channel_id'] = 1
-        item['category_id'] = 21
-        item['category_name'] = '武侠仙侠'
-    if c_id in [14,15,16,17,18]:
-        item['channel_id'] = 1
-        item['category_id'] = 54
-        item['category_name'] = '都市爱情'
-    if c_id in [31,32,102]:
-        item['channel_id'] = 1
-        item['category_id'] = 51
-        item['category_name'] = '特种军旅'
-    if c_id in [5,38,103,104,105]:
-        item['channel_id'] = 1
-        item['category_id'] = 22
-        item['category_name'] = '西方玄幻'
-    if c_id in [25,34,35,36]:
-        item['channel_id'] = 1
-        item['category_id'] = 19
-        item['category_name'] = '游戏竞技'
-    if c_id in [1000,1002,1004,1006,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2020]:
-        item['channel_id'] = 1
-        item['category_id'] = 127
-        item['category_name'] = '其他作品'
-    if c_id in [48,49,50,51,53,81]:
+    if c_id in [1,5,29]:
         item['channel_id'] = 2
-        item['category_id'] = 98
-        item['category_name'] = '婚恋情感'
-    if c_id in [55,56,57,58]:
+        item['category_id'] = 88
+        item['category_name'] = '豪门总裁'
+    if c_id in [2,38]:
         item['channel_id'] = 2
-        item['category_id'] = 104
-        item['category_name'] = '青春校园'
-    if c_id in [55,56,57,58]:
+        item['category_id'] = 83
+        item['category_name'] = '穿越重生'
+    if c_id in [4]:
         item['channel_id'] = 2
         item['category_id'] = 104
         item['category_name'] = '青春校园'
-    if c_id == 59:
+    if c_id in [15]:
         item['channel_id'] = 2
-        item['category_id'] = 123
-        item['category_name'] = '女尊王朝'
-    if c_id == 60 or c_id == 62:
+        item['category_id'] = 114
+        item['category_name'] = '恐怖惊悚'
+    if c_id in [30,31]:
         item['channel_id'] = 2
-        item['category_id'] = 120
-        item['category_name'] = '宫斗宅斗'
-    if c_id == 61 or c_id == 80 or c_id in [68,69,70,71,120]:
+        item['category_id'] = 110
+        item['category_name'] = '上古蛮荒'
+    if c_id in [32]:
         item['channel_id'] = 2
-        item['category_id'] = 83
-        item['category_name'] = '穿越重生'
-    if c_id in [63,64,65,66,67,2014]:
+        item['category_id'] = 97
+        item['category_name'] = '古典仙侠'
+    if c_id in [33]:
         item['channel_id'] = 2
-        item['category_id'] = 96
-        item['category_name'] = '东方玄幻'
-    if c_id in [72,73,74,75,2018]:
+        item['category_id'] = 113
+        item['category_name'] = '悬疑探险'
+    if c_id in [34,37,40,42]:
+        item['channel_id'] = 2
+        item['category_id'] = 107
+        item['category_name'] = '其他'
+    if c_id in [36]:
         item['channel_id'] = 2
         item['category_id'] = 119
         item['category_name'] = '游戏'
-    if c_id in [76,77,78,79]:
+    if c_id in [39]:
         item['channel_id'] = 2
-        item['category_id'] = 106
-        item['category_name'] = '耽美同人'
-    if c_id == 2012 or c_id == 2013 or c_id == 2016:
-        item['channel_id'] = 2
-        item['category_id'] = 103
-        item['category_name'] = '青春纯爱'
-    if c_id in [1001,1003,1005,1007,2015,2017]:
-        item['channel_id'] = 2
-        item['category_id'] = 107
-        item['category_name'] = '其他'
+        item['category_id'] = 123
+        item['category_name'] = '女尊王朝'
+    if c_id in [6]:
+        item['channel_id'] = 1
+        item['category_id'] = 81
+        item['category_name'] = '灵异恐怖'
+    if c_id in [7,11,14,25]:
+        item['channel_id'] = 1
+        item['category_id'] = 54
+        item['category_name'] = '都市爱情'
+    if c_id in [9]:
+        item['channel_id'] = 1
+        item['category_id'] = 94
+        item['category_name'] = '青春爱情'
+    if c_id in [10]:
+        item['channel_id'] = 1
+        item['category_id'] = 19
+        item['category_name'] = '游戏竞技'
+    if c_id in [12]:
+        item['channel_id'] = 1
+        item['category_id'] = 55
+        item['category_name'] = '官场沉浮'
+    if c_id in [13]:
+        item['channel_id'] = 1
+        item['category_id'] = 21
+        item['category_name'] = '武侠仙侠'
+    if c_id in [16]:
+        item['channel_id'] = 1
+        item['category_id'] = 57
+        item['category_name'] = '乡土风情'
+    if c_id in [17,20]:
+        item['channel_id'] = 1
+        item['category_id'] = 23
+        item['category_name'] = '玄幻奇幻'
+    if c_id in [18]:
+        item['channel_id'] = 1
+        item['category_id'] = 14
+        item['category_name'] = '历史穿越'
+    if c_id in [19]:
+        item['channel_id'] = 1
+        item['category_id'] = 68
+        item['category_name'] = '现代修真'
+    if c_id in [24,26,27,28,41]:
+        item['channel_id'] = 1
+        item['category_id'] = 127
+        item['category_name'] = '其他作品'
     result.append(item)