# -*- coding: utf-8 -*-
import json
import time

import scrapy

from content_spider.mysqlHelper import MysqlHelper
from content_spider.items import BookInfoItem, ChapterItem

# Maps the source site's category id ('id') to the internal channel and
# category ids used when storing a book; get_category() falls back to the
# first entry for unknown ids.
category_list = [
    {'id': '1', 'name': '都市言情', 'channel_id': 2, 'category_name': '婚恋情感', 'category_id': 98},
    {'id': '2', 'name': '时空穿越', 'channel_id': 2, 'category_name': '穿越重生', 'category_id': 83},
    {'id': '3', 'name': '总裁豪门', 'channel_id': 2, 'category_name': '穿越重生', 'category_id': 83},
    {'id': '4', 'name': '玄幻仙侠', 'channel_id': 1, 'category_name': '玄幻奇幻', 'category_id': 23},
    {'id': '6', 'name': '悬疑灵异', 'channel_id': 1, 'category_name': '灵异恐怖', 'category_id': 81},
    {'id': '7', 'name': '都市异能', 'channel_id': 1, 'category_name': '现代修真', 'category_id': 68},
    {'id': '8', 'name': '历史军事', 'channel_id': 1, 'category_name': '特种军旅', 'category_id': 51},
    {'id': '9', 'name': '古代言情', 'channel_id': 2, 'category_name': '婚恋情感', 'category_id': 98},
    {'id': '10', 'name': '热血青春', 'channel_id': 1, 'category_name': '青春爱情', 'category_id': 94},
    {'id': '11', 'name': '网游竞技', 'channel_id': 1, 'category_name': '游戏竞技', 'category_id': 19},
    {'id': '12', 'name': '幻想世界', 'channel_id': 2, 'category_name': '东方玄幻', 'category_id': 96},
    {'id': '13', 'name': '社科科普', 'channel_id': 2, 'category_name': '东方玄幻', 'category_id': 96},
    {'id': '14', 'name': '经管理财', 'channel_id': 2, 'category_name': '其他作品', 'category_id': 127},
    {'id': '15', 'name': '纪实传记', 'channel_id': 2, 'category_name': '其他作品', 'category_id': 127},
    {'id': '16', 'name': '励志成功', 'channel_id': 2, 'category_name': '其他作品', 'category_id': 127},
    {'id': '17', 'name': '童话寓言', 'channel_id': 2, 'category_name': '其他作品', 'category_id': 127},
    {'id': '18', 'name': '外国名著', 'channel_id': 2, 'category_name': '其他作品', 'category_id': 127},
    {'id': '19', 'name': '古典名著', 'channel_id': 2, 'category_name': '其他作品', 'category_id': 127},
    {'id': '20', 'name': '职场商战', 'channel_id': 2, 'category_name': '其他作品', 'category_id': 127},
    {'id': '21', 'name': '当代文学', 'channel_id': 2, 'category_name': '其他作品', 'category_id': 127},
    {'id': '22', 'name': '影视娱乐', 'channel_id': 2, 'category_name': '其他作品', 'category_id': 127},
    {'id': '23', 'name': '科幻末世', 'channel_id': 2, 'category_name': '其他作品', 'category_id': 127},
    {'id': '24', 'name': '同人小说', 'channel_id': 2, 'category_name': '其他作品', 'category_id': 127},
    {'id': '25', 'name': '短篇小说', 'channel_id': 2, 'category_name': '其他作品', 'category_id': 127},
]


class BookSpider(scrapy.Spider):
    name = 'haoyue'
    allowed_domains = ['www.haoyuewenxue.com']
    source = 'zy_haoyue'
    source_name = 'haoyue豪阅'
    source_id = 28
    apikey = 'jR83xjL0E5taO43MHnwKBcM8HSfYP1k4'
    mchid = '71583342'
    base_url = 'http://www.haoyuewenxue.com/api/{}?apikey=' + apikey + '&mchid=' + mchid
    custom_settings = {
        'DOWNLOAD_DELAY': 0.1,
        'SOURCE': source,
        'LOG_FILE': 'content_spider/log/' + name + time.strftime('%Y-%m-%d', time.localtime()) + '.log',
    }

    def __init__(self, host, user, password, db, stats, settings):
        scrapy.Spider.__init__(self)
        self.mysqlHelper = MysqlHelper(host=host, user=user, password=password, db=db,
                                       source=self.source, source_id=self.source_id)
        self.__stats = stats
        self.__stats.set_value('spider_type', 'add')
        self.__stats.set_value('bid_list', [])

    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        host = settings.get('MYSQL_HOST')
        user = settings.get('MYSQL_USER')
        password = settings.get('MYSQL_PWD')
        db = settings.get('MYSQL_DB')
        return cls(host=host, user=user, password=password, db=db, stats=crawler.stats, settings=settings)

    def start_requests(self):
        url = self.base_url.format('getallbook')
        yield scrapy.Request(url, callback=self.parse_book_list)

    def parse_book_list(self, response):
        result = json.loads(response.text)
        for item in result['data']:
            bid = item['id']
            # Skip books already imported from this source.
            book_info = self.mysqlHelper.get_book_info_by_source(bid)
            if book_info is not None:
                continue
            url = self.base_url.format('getbookdetail') + '&bookid={}'.format(bid)
            yield scrapy.Request(url, callback=self.parse_book_info)

    def parse_book_info(self, response):
        if response.text == '':
            return None
        now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
        result = json.loads(response.text)
        if result is None:
            return None
        result = result['data']
        book_info_item = BookInfoItem()
        source_bid = result.get('id')
        category = self.get_category(str(result['tid']))
        book_info_item['cp_bid'] = source_bid
        book_info_item['name'] = result['title']
        book_info_item['cp_name'] = self.source_name
        book_info_item['cp_id'] = self.source_id
        book_info_item['author'] = result['author']
        book_info_item['intro'] = result['description']
        book_info_item['cover'] = result['litpic']
        book_info_item['keyword'] = ''
        book_info_item['category_id'] = category['category_id']
        book_info_item['status'] = result['isover']
        book_info_item['size'] = 0
        book_info_item['category_name'] = category['category_name']
        book_info_item['last_chapter'] = ''
        book_info_item['chapter_count'] = 0
        book_info_item['first_cid'] = 0
        book_info_item['last_cid'] = 0
        book_info_item['channel'] = category['channel_id']
        book_info_item['updated_at'] = now
        book_info_item['created_at'] = now
        bid = self.mysqlHelper.insert_book(book_info_item)
        self.__stats.get_value('bid_list').append(bid)
        # The detail response nests the chapter list inside a single volume.
        chapter = result['chapter'][0]
        for i, chapter_item in enumerate(chapter['list'], start=1):
            url = self.base_url.format('getbookchapter') + '&bookid={}&chapterid={}'.format(
                source_bid, chapter_item['chapterid'])
            meta = {'bid': bid, 'source_bid': source_bid, 'sequence': i, 'cid': chapter_item['chapterid']}
            yield scrapy.Request(url, callback=self.parse_chapter_content, meta=meta)

    def parse_chapter_content(self, response):
        if response.text == '':
            return None
        now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
        result = json.loads(response.text)
        result = result['data']
        old_meta = response.meta
        meta = dict()
        meta['bid'] = old_meta['bid']
        meta['name'] = result['title']
        meta['sequence'] = old_meta['sequence']
        meta['content'] = result['content']
        meta['source_chapter_id'] = old_meta['cid']
        meta['size'] = result.get('words')
        # Chapters from the 20th onwards are treated as paid (VIP) chapters.
        meta['is_vip'] = 1 if old_meta['sequence'] >= 20 else 0
        meta['recent_update_at'] = result.get('updatetime')
        meta['prev_cid'] = 0
        meta['next_cid'] = 0
        meta['updated_at'] = now
        meta['created_at'] = now
        chapter_item = ChapterItem(meta)
        yield chapter_item

    def get_category(self, tid):
        # Look up the source category id; fall back to the first entry
        # when the id is unknown.
        for item in category_list:
            if str(tid) == item['id']:
                return item
        return category_list[0]
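

# BookupdateSpider re-crawls books that are already in the database and
# fetches only chapters newer than the last stored one; it reuses the same
# API credentials and chapter handling as BookSpider above.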
class BookupdateSpider(scrapy.Spider):
    name = 'haoyueupdate'
    allowed_domains = ['www.haoyuewenxue.com']
    source = 'zy_haoyue'
    source_name = 'haoyue豪阅'
    source_id = 28
    apikey = 'jR83xjL0E5taO43MHnwKBcM8HSfYP1k4'
    mchid = '71583342'
    base_url = 'http://www.haoyuewenxue.com/api/{}?apikey=' + apikey + '&mchid=' + mchid
    custom_settings = {
        'DOWNLOAD_DELAY': 0.01,
        'SOURCE': source,
        'LOG_FILE': 'content_spider/log/' + name + time.strftime('%Y-%m-%d', time.localtime()) + '.log',
    }

    def __init__(self, host, user, password, db, stats):
        scrapy.Spider.__init__(self)
        self.mysqlHelper = MysqlHelper(host=host, user=user, password=password, db=db,
                                       source=self.source, source_id=self.source_id)
        self.__stats = stats
        self.__stats.set_value('spider_type', 'update')
        self.__stats.set_value('bid_list', [])

    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        host = settings.get('MYSQL_HOST')
        user = settings.get('MYSQL_USER')
        password = settings.get('MYSQL_PWD')
        db = settings.get('MYSQL_DB')
        return cls(host=host, user=user, password=password, db=db, stats=crawler.stats)

    def start_requests(self):
        # Only books flagged as needing an update are re-crawled.
        book_list = self.mysqlHelper.get_need_update_book_list()
        if book_list is not None:
            for book in book_list:
                url = self.base_url.format('getbookdetail') + '&bookid={}'.format(book['cp_bid'])
                meta = {'bid': book['id'], 'source_bid': book['cp_bid']}
                yield scrapy.Request(url, callback=self.parse_chapter_list, meta=meta)

    def parse_chapter_list(self, response):
        if response.text == '':
            return None
        result = json.loads(response.text)
        if result is None:
            return None
        result = result['data']
        bid = response.meta.get('bid')
        status = result['isover']
        # Mark the book finished once the source flags it as complete.
        if int(status) == 1:
            self.mysqlHelper.update_book_status(bid, status)
        chapter_list = result['chapter'][0]
        if chapter_list is not None:
            source_bid = response.meta.get('source_bid')
            last_chapter = self.mysqlHelper.get_last_cid_by_bid(bid)
            start = False
            if last_chapter is None:
                # Nothing stored yet: crawl the whole chapter list.
                start = True
                last_source_cid = ''
                last_chapter_id = 0
                last_sequence = 0
            else:
                last_source_cid = str(last_chapter['source_chapter_id'])
                last_chapter_id = last_chapter['id']
                last_sequence = last_chapter['sequence']
            has_new_chapter = False
            for chapter_item in chapter_list['list']:
                # Skip everything up to and including the last stored chapter.
                if not start:
                    if int(chapter_item['chapterid']) == int(last_source_cid):
                        start = True
                    continue
                if not has_new_chapter:
                    # Record once per book where this update run starts.
                    self.__stats.get_value('bid_list').append(
                        {'bid': bid, 'start': last_chapter_id, 'start_sequence': last_sequence})
                    has_new_chapter = True
                url = self.base_url.format('getbookchapter') + '&bookid={}&chapterid={}'.format(
                    source_bid, chapter_item['chapterid'])
                meta = {'bid': bid, 'source_bid': source_bid,
                        'sequence': chapter_item['chapterid'], 'cid': chapter_item['chapterid']}
                yield scrapy.Request(url, callback=self.parse_chapter_content, meta=meta)

    def parse_chapter_content(self, response):
        if response.text == '':
            return None
        now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
        result = json.loads(response.text)
        result = result['data']
        old_meta = response.meta
        meta = dict()
        meta['bid'] = old_meta['bid']
        meta['name'] = result['title']
        # Here 'sequence' carries the source chapter id, as set in parse_chapter_list.
        meta['sequence'] = old_meta['sequence']
        meta['content'] = result['content']
        meta['source_chapter_id'] = old_meta['cid']
        meta['size'] = result.get('words')
        meta['is_vip'] = 1 if old_meta['sequence'] >= 20 else 0
        meta['recent_update_at'] = result.get('updatetime')
        meta['prev_cid'] = 0
        meta['next_cid'] = 0
        meta['updated_at'] = now
        meta['created_at'] = now
        chapter_item = ChapterItem(meta)
        yield chapter_item
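

# Minimal sketch of running these spiders programmatically instead of via
# `scrapy crawl haoyue` / `scrapy crawl haoyueupdate`. It assumes the Scrapy
# project settings define MYSQL_HOST, MYSQL_USER, MYSQL_PWD and MYSQL_DB,
# which from_crawler reads above.
if __name__ == '__main__':
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    process = CrawlerProcess(get_project_settings())
    process.crawl(BookSpider)  # full import; swap in BookupdateSpider for incremental updates
    process.start()  # blocks until crawling is finished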