# -*- coding: utf-8 -*-
"""Base spider classes for ydyspider: a full-crawl spider (baseSpider), an
incremental update spider (baseUpdateSpider), and a chapter repair spider
(fixChapterSpider), plus small hashing/signing helpers."""
import hashlib
import random
import time

import scrapy

from ydyspider.items import BookInfoItem, ChapterItem
from ydyspider.mysqlHelper import MysqlHelper
from ydyspider.pipelines import formatcontent, removePunctuation

def md5(token):
    """Return the hex MD5 digest of a UTF-8 string."""
    m = hashlib.md5()
    m.update(token.encode('utf-8'))
    return m.hexdigest()


def sign(param, key):
    """Sort the params by key, build "k1=v1&k2=v2&...&key=<key>", and
    return the uppercase MD5 digest of that string."""
    param = sorted(param.items(), key=lambda x: x[0])
    string = ''
    for item in param:
        string = string + '{}={}&'.format(str(item[0]), str(item[1]))
    string = string + 'key={}'.format(key)
    return md5(string).upper()


def sha1(token):
    """Return the hex SHA-1 digest of a UTF-8 string."""
    m = hashlib.sha1()
    m.update(token.encode('utf-8'))
    return m.hexdigest()


def random_str(slen=10):
    """Return a random alphanumeric string of length ``slen``."""
    seed = "1234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
    return ''.join(random.choice(seed) for _ in range(slen))
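
# A quick sketch of what sign() produces (the params and key here are made
# up for illustration):
#
#   sign({'b': 2, 'a': 1}, 'secret')
#   # == md5('a=1&b=2&key=secret').upper()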


class baseSpider(scrapy.Spider):
    """Full-crawl spider: book list -> book info -> chapter list -> chapter
    content. Source-specific subclasses implement the get_*_url()/*_result()
    hooks declared at the bottom of the class."""
    name = ''
    allowed_domains = []
    base_url = ''
    source = ''
    source_name = ''

    def __init__(self, host, user, password, db, stats):
        scrapy.Spider.__init__(self)
        source = self.source
        self.mysqlHelper = MysqlHelper(host=host, user=user, password=password, db=db, source=source)
        self.__stats = stats
        self.__source = self.source
        self.__stats.set_value('bid_list', [])
        self.__stats.set_value('spider_type', 'add')

    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        host = settings.get('MYSQL_HOST')
        user = settings.get('MYSQL_USER')
        password = settings.get('MYSQL_PWD')
        db = settings.get('MYSQL_DB')
        return cls(host=host, user=user, password=password, db=db, stats=crawler.stats)
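
    # The four settings read by from_crawler() come from the project's
    # settings.py; a sketch of the expected entries (values here are
    # placeholders, not the project's real credentials):
    #
    #   MYSQL_HOST = '127.0.0.1'
    #   MYSQL_USER = 'spider'
    #   MYSQL_PWD = 'secret'
    #   MYSQL_DB = 'novel'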

    def start_requests(self):
        yield scrapy.Request(self.get_start_url(), callback=self.parse_book_list)

    def parse_book_list(self, response):
        result = self.bid_list_result(response)
        for item in result:
            bid = item['id']
            # Skip books already imported from this source.
            book_info = self.mysqlHelper.get_book_info_by_source(bid)
            if book_info is not None:
                continue
            url = self.get_book_info_url(bid)
            yield scrapy.Request(url, callback=self.parse_book_info)

    def parse_book_info(self, response):
        if response.text == '':
            return None
        now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        result = self.book_info_result(response)
        if result is None:
            return None
        source_bid = result.get('bid')
        book_info_item = BookInfoItem()
        book_info_item['source_bid'] = source_bid
        book_info_item['name'] = result['name']
        book_info_item['author'] = result['author']
        book_info_item['intro'] = result['intro']
        book_info_item['cover'] = result['cover']
        book_info_item['keyword'] = result['keyword']
        book_info_item['category_id'] = 0 if result.get('category_id') is None else result.get('category_id')
        book_info_item['status'] = result['status']
        book_info_item['chapter_count'] = 0 if result.get('chapter_count') is None else result.get('chapter_count')
        book_info_item['first_cid'] = 0
        book_info_item['last_cid'] = 0
        book_info_item['size'] = 0 if result.get('size') is None else result.get('size')
        book_info_item['last_chapter'] = '' if result.get('last_chapter') is None else result.get('last_chapter')
        book_info_item['category_name'] = result['category_name']
        book_info_item['source_name'] = self.source
        book_info_item['gender'] = 0 if result.get('gender') is None else result.get('gender')
        book_info_item['updated_at'] = now
        book_info_item['created_at'] = now
        bid = self.mysqlHelper.insert_book(book_info_item)
        # Track newly inserted books in the crawler stats.
        self.__stats.get_value('bid_list').append(bid)
        url = self.get_chapter_list_url(source_bid)
        meta = {'bid': bid, 'source_bid': source_bid}
        yield scrapy.Request(url, self.parse_chapter_list, meta=meta)

    def parse_chapter_list(self, response):
        if response.text == '':
            return None
        result = self.chapter_list_result(response)
        if result is None:
            return None
        bid = response.meta['bid']
        source_bid = response.meta['source_bid']
        for chapter_item in result:
            meta = chapter_item
            cid = chapter_item['source_chapter_id']
            meta['bid'] = bid
            url = self.get_chapter_content_url(source_bid, cid)
            yield scrapy.Request(url, self.parse_chapter_content, meta=meta)

    def parse_chapter_content(self, response):
        if response.text == '':
            return None
        now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        result = self.chapter_content_result(response)
        if result is None:
            return None
        meta = response.meta
        chapter_item = ChapterItem()
        chapter_item['bid'] = meta['bid']
        chapter_item['name'] = meta['name']
        chapter_item['sequence'] = meta['sequence']
        chapter_item['is_vip'] = meta['is_vip']
        chapter_item['prev_cid'] = 0
        chapter_item['next_cid'] = 0
        chapter_item['recent_update_at'] = meta['recent_update_at']
        content = formatcontent(result['content'])
        chapter_item['content'] = content
        # Chapter size is measured on the content with punctuation stripped.
        chapter_item['size'] = len(removePunctuation(content))
        chapter_item['chapter_content_id'] = 0
        chapter_item['source_chapter_id'] = meta['source_chapter_id']
        chapter_item['created_at'] = now
        chapter_item['updated_at'] = now
        # Fields returned by the source override the values carried in meta.
        if result.get('size') is not None:
            chapter_item['size'] = result.get('size')
        if result.get('is_vip') is not None:
            chapter_item['is_vip'] = result.get('is_vip')
        if result.get('name') is not None:
            chapter_item['name'] = result.get('name')
        if result.get('recent_update_at') is not None:
            chapter_item['recent_update_at'] = result.get('recent_update_at')
        if result.get('source_chapter_id') is not None:
            chapter_item['source_chapter_id'] = result.get('source_chapter_id')
        yield chapter_item

    def get_start_url(self):
        raise NotImplementedError

    def bid_list_result(self, response):
        raise NotImplementedError

    def get_book_info_url(self, bid):
        raise NotImplementedError

    def book_info_result(self, response):
        raise NotImplementedError

    def get_chapter_list_url(self, bid):
        raise NotImplementedError

    def chapter_list_result(self, response):
        raise NotImplementedError

    def get_chapter_content_url(self, bid, cid):
        raise NotImplementedError

    def chapter_content_result(self, response):
        raise NotImplementedError
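

# A minimal subclass sketch (the class name and URLs below are hypothetical,
# not part of this project). A concrete source spider only fills in the URL
# builders and response parsers:
#
#   class exampleSpider(baseSpider):
#       name = 'example'
#       allowed_domains = ['api.example.com']
#       base_url = 'https://api.example.com'
#       source = 'example'
#       source_name = 'Example'
#
#       def get_start_url(self):
#           return self.base_url + '/book/list'
#
#       def bid_list_result(self, response):
#           import json
#           return [{'id': b['bid']} for b in json.loads(response.text)['data']]
#
# ...and likewise for the remaining get_*_url()/*_result() pairs.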


class baseUpdateSpider(scrapy.Spider):
    """Incremental spider: for books already in the database, crawls only
    the chapters published after the last stored one."""
    name = ''
    allowed_domains = []
    base_url = ''
    source = ''
    source_name = ''

    def __init__(self, host, user, password, db, stats):
        scrapy.Spider.__init__(self)
        source = self.source
        self.mysqlHelper = MysqlHelper(host=host, user=user, password=password, db=db, source=source)
        self.__stats = stats
        self.__stats.set_value('spider_type', 'update')
        self.__stats.set_value('bid_list', [])
        self.__is_first = True

    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        host = settings.get('MYSQL_HOST')
        user = settings.get('MYSQL_USER')
        password = settings.get('MYSQL_PWD')
        db = settings.get('MYSQL_DB')
        return cls(host=host, user=user, password=password, db=db, stats=crawler.stats)

    def start_requests(self):
        book_list = self.mysqlHelper.get_need_update_book_list()
        if book_list is not None:
            for book in book_list:
                url = self.get_chapter_list_url(book['source_bid'])
                meta = {'bid': book['id'], 'source_bid': book['source_bid']}
                yield scrapy.Request(url, callback=self.parse_chapter_list, meta=meta)

    def parse_chapter_list(self, response):
        if response.text == '':
            return None
        chapter_list = self.chapter_list_result(response)
        meta = response.meta
        if chapter_list is not None:
            bid = response.meta.get('bid')
            source_bid = response.meta.get('source_bid')
            last_chapter = self.mysqlHelper.get_last_cid_by_bid(bid)
            start = False
            if last_chapter is None:
                # Nothing stored yet: crawl the whole chapter list.
                start = True
                last_source_cid = 0
                last_sequence = 0
                last_chapter_id = 0
            else:
                last_source_cid = last_chapter['source_chapter_id']
                last_sequence = last_chapter['sequence']
                last_chapter_id = last_chapter['id']
            has_new_chapter = False
            for chapter_item in chapter_list:
                if not start:
                    # Skip source chapters up to and including the last one
                    # already stored for this book.
                    if int(chapter_item['source_chapter_id']) == int(last_source_cid):
                        start = True
                    continue
                if not has_new_chapter:
                    # Record, once per book, where the new chapters begin.
                    self.__stats.get_value('bid_list').append(
                        {"bid": meta['bid'], 'start': last_chapter_id})
                    has_new_chapter = True
                cid = chapter_item['source_chapter_id']
                last_sequence = last_sequence + 1
                if chapter_item['sequence'] == 0:
                    chapter_item['sequence'] = last_sequence
                meta = chapter_item
                meta['bid'] = bid
                url = self.get_chapter_content_url(source_bid, cid)
                yield scrapy.Request(url, callback=self.parse_chapter_content, meta=meta)

    def parse_chapter_content(self, response):
        if response.text == '':
            return None
        now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        result = self.chapter_content_result(response)
        if result is None:
            return None
        meta = response.meta
        chapter_item = ChapterItem()
        chapter_item['bid'] = meta['bid']
        chapter_item['name'] = meta['name']
        chapter_item['sequence'] = meta['sequence']
        chapter_item['is_vip'] = meta['is_vip']
        chapter_item['prev_cid'] = 0
        chapter_item['next_cid'] = 0
        chapter_item['recent_update_at'] = meta['recent_update_at']
        content = formatcontent(result['content'])
        chapter_item['size'] = len(removePunctuation(content))
        chapter_item['content'] = content
        chapter_item['chapter_content_id'] = 0
        chapter_item['source_chapter_id'] = meta['source_chapter_id']
        chapter_item['created_at'] = now
        chapter_item['updated_at'] = now
        # Fields returned by the source override the values carried in meta.
        if result.get('is_vip') is not None:
            chapter_item['is_vip'] = result.get('is_vip')
        if result.get('name') is not None:
            chapter_item['name'] = result.get('name')
        if result.get('recent_update_at') is not None:
            chapter_item['recent_update_at'] = result.get('recent_update_at')
        if result.get('source_chapter_id') is not None:
            chapter_item['source_chapter_id'] = result.get('source_chapter_id')
        yield chapter_item

    def get_chapter_list_url(self, bid):
        raise NotImplementedError

    def chapter_list_result(self, response):
        raise NotImplementedError

    def get_chapter_content_url(self, bid, cid):
        raise NotImplementedError

    def chapter_content_result(self, response):
        raise NotImplementedError
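

# A worked sketch of the resume logic above (chapter ids here are made up):
# with a stored last chapter whose source_chapter_id is 105 and a source list
# [103, 104, 105, 106, 107], entries are skipped up to and including 105,
# chapters 106 and 107 are fetched, and one entry of the form
# {'bid': <book id>, 'start': <db id of chapter 105>} is appended to the
# 'bid_list' stat so downstream jobs can locate the newly added range.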


class fixChapterSpider(scrapy.Spider):
    """Repair spider: re-crawls the chapters of an explicit list of book ids,
    updating chapters that already exist and inserting the missing ones."""
    name = ''
    source = ''

    def __init__(self, host, user, password, db, bid_list, stats):
        scrapy.Spider.__init__(self)
        source = self.source
        self.mysqlHelper = MysqlHelper(host=host, user=user, password=password, db=db, source=source)
        self.__stats = stats
        self.__is_first = True
        self.bid_list = bid_list

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        settings = crawler.settings
        host = settings.get('MYSQL_HOST')
        user = settings.get('MYSQL_USER')
        password = settings.get('MYSQL_PWD')
        db = settings.get('MYSQL_DB')
        bid = kwargs.get('bid')
        if bid is not None:
            bid_list = bid.split(',')
        else:
            bid_list = []
        return cls(host=host, user=user, password=password, db=db, bid_list=bid_list, stats=crawler.stats)

    def start_requests(self):
        if not self.bid_list:
            # Nothing to do without an explicit bid list.
            return
        for book in self.bid_list:
            info = self.mysqlHelper.get_book_info_by_id(book)
            if info is None:
                continue
            url = self.get_chapter_list_url(info['source_bid'])
            meta = {'bid': book, 'source_bid': info['source_bid']}
            yield scrapy.Request(url, callback=self.parse_chapter_list, meta=meta)

    def parse_chapter_list(self, response):
        if response.text == '':
            return None
        chapter_list = self.chapter_list_result(response)
        if chapter_list is not None:
            bid = response.meta.get('bid')
            source_bid = response.meta.get('source_bid')
            last_sequence = 0
            for chapter_item in chapter_list:
                last_sequence = last_sequence + 1
                if chapter_item['sequence'] == 0:
                    chapter_item['sequence'] = last_sequence
                # If a chapter already exists at this sequence, mark the
                # request so the content handler updates it in place.
                chapter_info = self.mysqlHelper.get_cid_by_bid_sequence(bid, last_sequence)
                cid = chapter_item['source_chapter_id']
                meta = chapter_item
                if chapter_info is not None:
                    meta['type'] = 'update'
                    meta['chapter_content_id'] = chapter_info['chapter_content_id']
                    meta['cid'] = chapter_info['id']
                meta['bid'] = bid
                url = self.get_chapter_content_url(source_bid, cid)
                yield scrapy.Request(url, callback=self.parse_chapter_content, meta=meta)

    def parse_chapter_content(self, response):
        if response.text == '':
            return None
        now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        result = self.chapter_content_result(response)
        if result is None:
            return None
        meta = response.meta
        content = formatcontent(result['content'])
        meta['size'] = len(removePunctuation(content))
        meta['content'] = content
        # Fields returned by the source override the values carried in meta.
        if result.get('size') is not None:
            meta['size'] = result.get('size')
        if result.get('is_vip') is not None:
            meta['is_vip'] = result.get('is_vip')
        if result.get('name') is not None:
            meta['name'] = result.get('name')
        if result.get('recent_update_at') is not None:
            meta['recent_update_at'] = result.get('recent_update_at')
        if result.get('source_chapter_id') is not None:
            meta['source_chapter_id'] = result.get('source_chapter_id')
        if meta.get('type') is not None:
            # Existing chapter: overwrite its stored content and metadata,
            # storing the formatted content for consistency with inserts.
            self.mysqlHelper.update_content(meta['chapter_content_id'], meta['name'], content)
            self.mysqlHelper.update_chapter(meta)
        else:
            # New chapter: yield a ChapterItem for the pipeline to insert.
            chapter_item = ChapterItem()
            chapter_item['bid'] = meta['bid']
            chapter_item['name'] = meta['name']
            chapter_item['sequence'] = meta['sequence']
            chapter_item['size'] = meta['size']
            chapter_item['is_vip'] = meta['is_vip']
            chapter_item['prev_cid'] = 0
            chapter_item['next_cid'] = 0
            chapter_item['recent_update_at'] = meta['recent_update_at']
            chapter_item['content'] = content
            if meta.get('chapter_content_id') is not None:
                chapter_item['chapter_content_id'] = meta['chapter_content_id']
            else:
                chapter_item['chapter_content_id'] = 0
            chapter_item['source_chapter_id'] = meta['source_chapter_id']
            chapter_item['created_at'] = now
            chapter_item['updated_at'] = now
            if result.get('size') is not None:
                chapter_item['size'] = result.get('size')
            if result.get('is_vip') is not None:
                chapter_item['is_vip'] = result.get('is_vip')
            if result.get('name') is not None:
                chapter_item['name'] = result.get('name')
            if result.get('recent_update_at') is not None:
                chapter_item['recent_update_at'] = result.get('recent_update_at')
            if result.get('source_chapter_id') is not None:
                chapter_item['source_chapter_id'] = result.get('source_chapter_id')
            yield chapter_item

    def get_chapter_list_url(self, bid):
        raise NotImplementedError

    def chapter_list_result(self, response):
        raise NotImplementedError

    def get_chapter_content_url(self, bid, cid):
        raise NotImplementedError

    def chapter_content_result(self, response):
        raise NotImplementedError
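

# Usage sketch (the spider name is whatever a concrete subclass sets as
# ``name``; the ``bid`` argument shape follows from from_crawler() above):
#
#   scrapy crawl <fix-spider-name> -a bid=1,2,3
#
# The comma-separated ids arrive as the ``bid`` spider argument and are split
# into ``bid_list``; chapters found by (bid, sequence) are updated in place,
# the rest are yielded as new ChapterItem rows.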