# Source file: baseSpider.py (18 KB)
  1. import scrapy
  2. import time
  3. import random
  4. from content_spider.mysqlHelper import MysqlHelper
  5. from content_spider.items import BookInfoItem, ChapterItem
  6. from content_spider.pipelines import formatcontent, removePunctuation
  7. from content_spider.Util import my_log
  class baseSpider(scrapy.Spider):
      """Base spider that discovers books from a source and stores new ones.

      Crawl flow: start URL -> book list -> book info -> chapter list ->
      chapter content. Books for which ``get_book_info_by_source`` already
      returns a row are skipped.

      Subclasses set ``name``, ``source``, ``source_name`` and ``source_id``
      and implement the ``get_*_url`` / ``*_result`` hooks defined at the
      bottom of the class.
      """

      name = ''
      source = ''
      source_name = ''
      source_id = 0
      # NOTE: 'SOURCE' is evaluated here, at class-definition time, so it holds
      # this class's ``source`` value (''). A subclass that overrides ``source``
      # does not change this dict unless it also overrides ``custom_settings``.
      custom_settings = {
          'DOWNLOAD_DELAY': 0.01,
          'SOURCE': source,
      }

      def __init__(self, host, user, password, db, stats, settings):
          """Create the MySQL helper and reset the crawl statistics.

          :param host, user, password, db: MySQL connection parameters.
          :param stats: stats collector; receives ``spider_type`` and ``bid_list``.
          :param settings: accepted for interface compatibility; not used here.
          """
          scrapy.Spider.__init__(self)
          self.mysqlHelper = MysqlHelper(host=host, user=user, password=password, db=db,
                                         source=self.source, source_id=self.source_id)
          self.__stats = stats
          self.__stats.set_value('spider_type', 'add')
          self.__stats.set_value('bid_list', [])
          # True until the first book has been inserted (see parse_book_info).
          self.__is_first = True

      @classmethod
      def from_crawler(cls, crawler, *args, **kwargs):
          """Build the spider from the crawler's MYSQL_* settings.

          Extra spider arguments are accepted and ignored so the signature
          matches the fix spiders in this module.
          """
          settings = crawler.settings
          return cls(host=settings.get('MYSQL_HOST'),
                     user=settings.get('MYSQL_USER'),
                     password=settings.get('MYSQL_PWD'),
                     db=settings.get('MYSQL_DB'),
                     stats=crawler.stats,
                     settings=settings)

      def start_requests(self):
          """Request the book-list page returned by ``get_start_url``."""
          yield scrapy.Request(self.get_start_url(), callback=self.parse_book_list)

      def parse_book_list(self, response):
          """Yield a book-info request for every listed book not yet stored."""
          book_list = self.bid_list_result(response)
          if book_list is None:
              return
          for item in book_list:
              bid = item['id']
              # Skip books the database already knows for this source.
              if self.mysqlHelper.get_book_info_by_source(bid) is not None:
                  continue
              yield scrapy.Request(self.get_book_info_url(bid), callback=self.parse_book_info)

      def parse_book_info(self, response):
          """Insert the parsed book and request its chapter list.

          Does nothing when the response body is empty or the subclass parser
          returns ``None``.
          """
          if response.text == '':
              return
          result = self.book_info_result(response)
          if result is None:
              return
          now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
          category_id = result.get('category_id')
          if category_id is None:
              category_id = 0
          source_bid = result.get('bid')

          book_info_item = BookInfoItem()
          book_info_item['cp_bid'] = source_bid
          book_info_item['cp_name'] = self.source_name
          book_info_item['cp_id'] = self.source_id
          # Fields copied verbatim from the parsed result (all required keys).
          for key in ('name', 'author', 'intro', 'cover', 'keyword', 'status', 'channel'):
              book_info_item[key] = result[key]
          book_info_item['category_id'] = category_id
          book_info_item['category_name'] = result['category']
          # Chapter-derived fields start out empty/zero at insert time.
          book_info_item['size'] = 0
          book_info_item['last_chapter'] = ''
          book_info_item['chapter_count'] = 0
          book_info_item['first_cid'] = 0
          book_info_item['last_cid'] = 0
          book_info_item['updated_at'] = now
          book_info_item['created_at'] = now

          bid = self.mysqlHelper.insert_book(book_info_item)
          if self.__is_first:
              # Record the id of the first book inserted during this run.
              self.__stats.set_value('bid_start', bid)
              self.__is_first = False
          self.__stats.get_value('bid_list').append(bid)

          meta = {'bid': bid, 'source_bid': source_bid}
          yield scrapy.Request(self.get_chapter_list_url(source_bid),
                               callback=self.parse_chapter_list, meta=meta)

      def parse_chapter_list(self, response):
          """Yield one chapter-content request per chapter of the book."""
          if response.text == '':
              return
          chapter_list = self.chapter_list_result(response)
          if chapter_list is None:
              return
          bid = response.meta['bid']
          source_bid = response.meta['source_bid']
          for chapter_item in chapter_list:
              # The chapter dict itself becomes the request meta, extended
              # with the local book id.
              meta = chapter_item
              meta['bid'] = bid
              url = self.get_chapter_content_url(source_bid, chapter_item['source_chapter_id'])
              yield scrapy.Request(url, callback=self.parse_chapter_content, meta=meta)

      def parse_chapter_content(self, response):
          """Build and yield a ``ChapterItem`` from request meta and parsed content.

          Values present (not ``None``) in the parsed result override the
          ones carried in the request meta.
          """
          if response.text == '':
              return
          now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
          result = self.chapter_content_result(response)
          meta = response.meta

          chapter_item = ChapterItem()
          for key in ('bid', 'name', 'sequence', 'size', 'is_vip',
                      'source_chapter_id', 'recent_update_at'):
              chapter_item[key] = meta[key]
          chapter_item['chapter_content_id'] = 0
          chapter_item['content'] = formatcontent(result['content'])
          for key in ('size', 'is_vip', 'name', 'recent_update_at', 'source_chapter_id'):
              value = result.get(key)
              if value is not None:
                  chapter_item[key] = value
          chapter_item['prev_cid'] = 0
          chapter_item['next_cid'] = 0
          chapter_item['updated_at'] = now
          chapter_item['created_at'] = now
          yield chapter_item

      # ---- hooks to be implemented by source-specific subclasses ----

      def get_start_url(self):
          """Return the URL of the book-list page."""
          raise NotImplementedError

      def bid_list_result(self, response):
          """Return an iterable of mappings, each with an ``'id'`` key."""
          raise NotImplementedError

      def get_book_info_url(self, bid):
          """Return the book-info URL for the source book id ``bid``."""
          raise NotImplementedError

      def book_info_result(self, response):
          """Return the parsed book as a mapping, or ``None`` to skip it.

          Keys read by ``parse_book_info``: ``bid``, ``name``, ``author``,
          ``intro``, ``cover``, ``keyword``, ``status``, ``category``,
          ``channel`` and optionally ``category_id``.
          """
          raise NotImplementedError

      def get_chapter_list_url(self, bid):
          """Return the chapter-list URL for the source book id ``bid``."""
          raise NotImplementedError

      def chapter_list_result(self, response):
          """Return an iterable of chapter dicts (used as request meta).

          Keys read later: ``source_chapter_id``, ``name``, ``sequence``,
          ``size``, ``is_vip`` and ``recent_update_at``.
          """
          raise NotImplementedError

      def get_chapter_content_url(self, bid, cid):
          """Return the content URL for source book ``bid`` and chapter ``cid``."""
          raise NotImplementedError

      def chapter_content_result(self, response):
          """Return a mapping with ``'content'`` plus optional override fields."""
          raise NotImplementedError
  class baseUpdateSpider(scrapy.Spider):
      """Base spider that fetches chapters added since the last crawl.

      For every book returned by ``get_need_update_book_list`` the chapter
      list is re-read and only the chapters after the last stored one are
      requested.

      Subclasses set ``name``, ``source`` and ``source_id`` and implement the
      hooks defined at the bottom of the class.
      """

      name = ''
      source = ''
      source_id = 0
      # NOTE: 'SOURCE' is evaluated here, at class-definition time, so it holds
      # this class's ``source`` value (''). A subclass that overrides ``source``
      # does not change this dict unless it also overrides ``custom_settings``.
      custom_settings = {
          'DOWNLOAD_DELAY': 0.01,
          'SOURCE': source,
      }

      def __init__(self, host, user, password, db, stats, settings):
          """Create the MySQL helper and reset the crawl statistics.

          :param host, user, password, db: MySQL connection parameters.
          :param stats: stats collector; receives ``spider_type`` and ``bid_list``.
          :param settings: accepted for interface compatibility; not used here.
          """
          scrapy.Spider.__init__(self)
          self.mysqlHelper = MysqlHelper(host=host, user=user, password=password, db=db,
                                         source=self.source, source_id=self.source_id)
          self.__stats = stats
          self.__stats.set_value('spider_type', 'update')
          self.__stats.set_value('bid_list', [])
          self.__is_first = True

      @classmethod
      def from_crawler(cls, crawler, *args, **kwargs):
          """Build the spider from the crawler's MYSQL_* settings.

          Extra spider arguments are accepted and ignored so the signature
          matches the fix spiders in this module.
          """
          settings = crawler.settings
          return cls(host=settings.get('MYSQL_HOST'),
                     user=settings.get('MYSQL_USER'),
                     password=settings.get('MYSQL_PWD'),
                     db=settings.get('MYSQL_DB'),
                     stats=crawler.stats,
                     settings=settings)

      def start_requests(self):
          """Request the chapter list of every book that needs an update."""
          book_list = self.mysqlHelper.get_need_update_book_list()
          if book_list is None:
              return
          for book in book_list:
              cp_bid = book['cp_bid']
              # The chapter-list URL is built from the source-side book id,
              # the same value that is forwarded in the request meta.
              url = self.get_chapter_list_url(cp_bid)
              meta = {'bid': book['id'], 'cp_bid': cp_bid}
              yield scrapy.Request(url, callback=self.parse_chapter_list, meta=meta)

      def parse_chapter_list(self, response):
          """Yield content requests for chapters after the last stored one.

          The last stored chapter is located in the fresh list by its
          ``source_chapter_id`` or, when that id is empty, by its
          ``sequence``. When the book has no stored chapter, every listed
          chapter is requested.
          """
          if response.text == '':
              return
          chapter_list = self.chapter_list_result(response)
          if chapter_list is None:
              return
          bid = response.meta.get('bid')
          cp_bid = response.meta.get('cp_bid')

          last_chapter = self.mysqlHelper.get_last_cid_by_bid(bid)
          if last_chapter is None:
              # Nothing stored yet: every chapter in the list is new.
              start = True
              last_source_cid = ''
              last_sequence = 0
              last_chapter_id = 0
          else:
              start = False
              last_source_cid = str(last_chapter['source_chapter_id'])
              last_sequence = last_chapter['sequence']
              last_chapter_id = last_chapter['id']

          has_new_chapter = False
          for chapter_item in chapter_list:
              if not start:
                  # Still looking for the last stored chapter; the matching
                  # chapter itself is skipped as well (hence the continue).
                  if last_source_cid:
                      start = str(chapter_item['source_chapter_id']) == last_source_cid
                  else:
                      start = int(chapter_item['sequence']) == last_sequence
                  continue
              if not has_new_chapter:
                  # Record once per book where the newly added chapters begin.
                  self.__stats.get_value('bid_list').append(
                      {"bid": bid, 'start': last_chapter_id, 'start_sequence': last_sequence})
                  has_new_chapter = True
              # The chapter dict itself becomes the request meta, extended
              # with the local book id.
              meta = chapter_item
              meta['bid'] = bid
              url = self.get_chapter_content_url(cp_bid, chapter_item['source_chapter_id'])
              yield scrapy.Request(url, callback=self.parse_chapter_content, meta=meta)

      def parse_chapter_content(self, response):
          """Build and yield a ``ChapterItem`` from request meta and parsed content.

          Values present (not ``None``) in the parsed result override the
          ones carried in the request meta.
          """
          if response.text == '':
              return
          now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
          result = self.chapter_content_result(response)
          meta = response.meta

          chapter_item = ChapterItem()
          # NOTE(review): unlike baseSpider, 'recent_update_at' is not copied
          # from meta here; it is only set when the parsed result provides it.
          for key in ('bid', 'name', 'sequence', 'size', 'is_vip', 'source_chapter_id'):
              chapter_item[key] = meta[key]
          chapter_item['chapter_content_id'] = 0
          chapter_item['content'] = formatcontent(result['content'])
          for key in ('size', 'is_vip', 'name', 'recent_update_at', 'source_chapter_id'):
              value = result.get(key)
              if value is not None:
                  chapter_item[key] = value
          chapter_item['prev_cid'] = 0
          chapter_item['next_cid'] = 0
          chapter_item['updated_at'] = now
          chapter_item['created_at'] = now
          yield chapter_item

      # ---- hooks to be implemented by source-specific subclasses ----

      def get_chapter_list_url(self, bid):
          """Return the chapter-list URL for the source book id ``bid``."""
          raise NotImplementedError

      def chapter_list_result(self, response):
          """Return an iterable of chapter dicts (used as request meta), or ``None``.

          Keys read later: ``source_chapter_id``, ``sequence``, ``name``,
          ``size`` and ``is_vip``.
          """
          raise NotImplementedError

      def get_chapter_content_url(self, bid, cid):
          """Return the content URL for source book ``bid`` and chapter ``cid``."""
          raise NotImplementedError

      def chapter_content_result(self, response):
          """Return a mapping with ``'content'`` plus optional override fields."""
          raise NotImplementedError
  class fixChapterSpider(scrapy.Spider):
      """Base spider that re-crawls the chapters of explicitly listed books.

      The book ids come from the ``bid`` spider argument (comma-separated).
      Chapters that already exist for a (book, sequence) pair are updated in
      place through the MySQL helper; the others are yielded as new
      ``ChapterItem`` objects.

      Subclasses set ``name``, ``source`` and ``source_id`` and implement the
      hooks defined at the bottom of the class.
      """

      name = ''
      source = ''
      source_id = 0
      # NOTE: 'SOURCE' is evaluated here, at class-definition time, so it holds
      # this class's ``source`` value (''). A subclass that overrides ``source``
      # does not change this dict unless it also overrides ``custom_settings``.
      custom_settings = {
          'DOWNLOAD_DELAY': 0.01,
          'SOURCE': source,
      }

      def __init__(self, host, user, password, db, bid_list, stats, settings):
          """Create the MySQL helper and reset the crawl statistics.

          :param host, user, password, db: MySQL connection parameters.
          :param bid_list: local book ids whose chapters are re-crawled.
          :param stats: stats collector; receives ``spider_type`` and ``bid_list``.
          :param settings: accepted for interface compatibility; not used here.
          """
          scrapy.Spider.__init__(self)
          self.mysqlHelper = MysqlHelper(host=host, user=user, password=password, db=db,
                                         source=self.source, source_id=self.source_id)
          self.__stats = stats
          self.__stats.set_value('spider_type', 'fix')
          self.__stats.set_value('bid_list', [])
          self.__is_first = True
          self.bid_list = bid_list

      @classmethod
      def from_crawler(cls, crawler, *args, **kwargs):
          """Build the spider from MYSQL_* settings and the ``bid`` argument.

          ``bid`` is a comma-separated string of book ids; when it is absent
          the spider gets an empty list.
          """
          settings = crawler.settings
          bid = kwargs.get('bid')
          bid_list = bid.split(',') if bid is not None else []
          return cls(host=settings.get('MYSQL_HOST'),
                     user=settings.get('MYSQL_USER'),
                     password=settings.get('MYSQL_PWD'),
                     db=settings.get('MYSQL_DB'),
                     bid_list=bid_list,
                     stats=crawler.stats,
                     settings=settings)

      def start_requests(self):
          """Re-sequence each known book and request its chapter list."""
          # Return without emitting anything when there is no work; a bare
          # ``yield`` here would hand ``None`` to Scrapy as a start request.
          if not self.bid_list:
              return
          for book in self.bid_list:
              info = self.mysqlHelper.get_book_info_by_id(book)
              if info is None:
                  continue
              self.mysqlHelper.re_sequence(book)
              cp_bid = info['cp_bid']
              meta = {'bid': book, 'cp_bid': cp_bid}
              yield scrapy.Request(self.get_chapter_list_url(cp_bid),
                                   callback=self.parse_chapter_list, meta=meta)

      def parse_chapter_list(self, response):
          """Yield a content request per chapter, flagging already stored ones.

          Once the whole list has been consumed, the book id and the number
          of listed chapters are appended to the ``bid_list`` stat.
          """
          if response.text == '':
              return
          chapter_list = self.chapter_list_result(response)
          if chapter_list is None:
              return
          bid = response.meta.get('bid')
          cp_bid = response.meta.get('cp_bid')
          last_sequence = 0
          for chapter_item in chapter_list:
              last_sequence += 1
              chapter_info = self.mysqlHelper.get_cid_by_bid_sequence(bid, chapter_item['sequence'])
              # The chapter dict itself becomes the request meta.
              meta = chapter_item
              if chapter_info is not None:
                  # A row already exists for this sequence: mark it for update.
                  meta['type'] = 'update'
                  meta['chapter_content_id'] = chapter_info['chapter_content_id']
                  meta['cid'] = chapter_info['id']
              meta['bid'] = bid
              url = self.get_chapter_content_url(cp_bid, chapter_item['source_chapter_id'])
              yield scrapy.Request(url, callback=self.parse_chapter_content, meta=meta)
          self.__stats.get_value('bid_list').append({'bid': bid, 'end': last_sequence})

      def parse_chapter_content(self, response):
          """Update an existing chapter in place or yield a new ``ChapterItem``.

          Values present (not ``None``) in the parsed result override the
          ones carried in the request meta.
          """
          if response.text == '':
              return
          now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
          result = self.chapter_content_result(response)
          meta = response.meta
          content = formatcontent(result['content'])

          data = {
              'bid': meta['bid'],
              'name': meta['name'],
              'size': meta['size'],
              'is_vip': meta['is_vip'],
              'sequence': meta['sequence'],
              'source_chapter_id': meta['source_chapter_id'],
              'recent_update_at': meta['recent_update_at'],
              'content': content,
          }
          for key in ('size', 'is_vip', 'name', 'recent_update_at', 'source_chapter_id'):
              value = result.get(key)
              if value is not None:
                  data[key] = value

          if meta.get('type') is not None:
              # Existing chapter: the size is recomputed from the formatted
              # text and replaces any size taken from meta or the result.
              data['size'] = len(removePunctuation(content))
              # NOTE(review): update_content receives the raw result content
              # and the meta name, not the formatted content / overridden
              # name kept in ``data`` - confirm this is intended.
              self.mysqlHelper.update_content(meta['chapter_content_id'], meta['name'], result['content'])
          if meta.get('cid') is not None:
              data['cid'] = meta['cid']
              self.mysqlHelper.update_chapter(data)
          else:
              # No stored chapter for this sequence: emit a new item.
              data['prev_cid'] = 0
              data['next_cid'] = 0
              data['updated_at'] = now
              data['created_at'] = now
              yield ChapterItem(data)

      # ---- hooks to be implemented by source-specific subclasses ----

      def get_chapter_list_url(self, bid):
          """Return the chapter-list URL for the source book id ``bid``."""
          raise NotImplementedError

      def chapter_list_result(self, response):
          """Return an iterable of chapter dicts (used as request meta), or ``None``.

          Keys read later: ``source_chapter_id``, ``sequence``, ``name``,
          ``size``, ``is_vip`` and ``recent_update_at``.
          """
          raise NotImplementedError

      def get_chapter_content_url(self, bid, cid):
          """Return the content URL for source book ``bid`` and chapter ``cid``."""
          raise NotImplementedError

      def chapter_content_result(self, response):
          """Return a mapping with ``'content'`` plus optional override fields."""
          raise NotImplementedError
  class fixBookInfoSpider(scrapy.Spider):
      """Base spider that refreshes the stored info of explicitly listed books.

      The book ids come from the ``bid`` spider argument (comma-separated).
      Subclasses set ``name``, ``source``, ``source_name`` and ``source_id``
      and implement ``get_book_info_url`` and ``book_info_result``.
      """

      name = ''
      source = ''
      source_name = ''
      source_id = 0

      def __init__(self, host, user, password, db, bid_list, stats):
          """Create the MySQL helper and remember which books to refresh.

          :param host, user, password, db: MySQL connection parameters.
          :param bid_list: local book ids whose info is re-crawled.
          :param stats: accepted for interface compatibility; not used here.
          """
          scrapy.Spider.__init__(self)
          self.mysqlHelper = MysqlHelper(host=host, user=user, password=password, db=db,
                                         source=self.source, source_id=self.source_id)
          self.bid_list = bid_list

      @classmethod
      def from_crawler(cls, crawler, *args, **kwargs):
          """Build the spider from MYSQL_* settings and the ``bid`` argument.

          ``bid`` is a comma-separated string of book ids; when it is absent
          the spider gets an empty list.
          """
          settings = crawler.settings
          bid = kwargs.get('bid')
          bid_list = bid.split(',') if bid is not None else []
          return cls(host=settings.get('MYSQL_HOST'),
                     user=settings.get('MYSQL_USER'),
                     password=settings.get('MYSQL_PWD'),
                     db=settings.get('MYSQL_DB'),
                     bid_list=bid_list,
                     stats=crawler.stats)

      def start_requests(self):
          """Request the book-info page of every listed book found in the database."""
          # Return without emitting anything when there is no work; a bare
          # ``yield`` here would hand ``None`` to Scrapy as a start request.
          if not self.bid_list:
              return
          for book in self.bid_list:
              info = self.mysqlHelper.get_book_info_by_id(book)
              if info is None:
                  continue
              cp_bid = info['cp_bid']
              meta = {'bid': book, 'cp_bid': cp_bid}
              yield scrapy.Request(self.get_book_info_url(cp_bid),
                                   callback=self.parse_book_info, meta=meta)

      def parse_book_info(self, response):
          """Write the freshly parsed book info back through the MySQL helper.

          Produces no requests or items; it only calls ``update_book_info``.
          """
          if response.text == '':
              return
          result = self.book_info_result(response)
          if result is None:
              # This bare ``yield`` is the only yield in the method and is
              # what makes the callback a generator, so callers always get
              # an iterable back. It emits ``None``.
              yield
              return
          result['bid'] = response.meta['bid']
          self.mysqlHelper.update_book_info(result)

      # ---- hooks to be implemented by source-specific subclasses ----

      def get_book_info_url(self, bid):
          """Return the book-info URL for the source book id ``bid``."""
          raise NotImplementedError

      def book_info_result(self, response):
          """Return the parsed book info as a mutable mapping, or ``None``."""
          raise NotImplementedError