zycontentSpider.py

# -*- coding: utf-8 -*-
import scrapy
from ydyspider.mysql import msyqlHelper
import json
import time


class zycontentSpider(scrapy.Spider):
    name = 'zycontent'
    allowed_domains = ['60.204.150.173']
    query = '?channel_name=zhuishuyun&channel_key=123456'
    base_url = 'http://60.204.150.173:8094/api/output'
    custom_settings = {
        'DOWNLOAD_DELAY': 0.01,
        # One log file per day, e.g. ydyspider/log/zycontent2024-01-01.log
        'LOG_FILE': 'ydyspider/log/' + name + time.strftime("%Y-%m-%d", time.localtime()) + '.log'
    }

    def start_requests(self):
        # Collect the local ids of newly imported books in the crawler stats.
        self.crawler.stats.set_value('bid_list', [])
        # `bid` and `cp_name` are optional spider arguments (`-a name=value`).
        if hasattr(self, 'bid'):
            param = self.bid
        else:
            param = None
        if hasattr(self, 'cp_name'):
            cp_name = '' if self.cp_name is None or self.cp_name == 'zycontent' else self.cp_name
        else:
            cp_name = 'zycontent'
        if param is not None:
            # Explicit book ids: fetch each book detail directly.
            bid_list = param.split(',')
            for bid in bid_list:
                url = self.base_url + '/bookdetail/{}'.format(bid) + self.query
                yield scrapy.Request(url, callback=self.parse2, meta={"zhiyu_book_id": bid, "i": 0})
        else:
            # No ids given: start from the provider's full book list.
            url = self.base_url + '/booklist' + self.query + '&cp_name={}'.format(cp_name)
            self.logger.info(url)
            yield scrapy.Request(url, callback=self.parse1)

    def parse1(self, response):
        res = self.json_encode(response.text)
        book_list = res.get('data')
        mysql = msyqlHelper()
        for book_item in book_list:
            bid = book_item['bid']
            # Only request books that have not been imported yet.
            zhiyu_book = mysql.getZyBook(bid)
            if zhiyu_book is None:
                url = self.base_url + '/bookdetail/{}'.format(bid) + self.query
                yield scrapy.Request(url, callback=self.parse2, meta={"zhiyu_book_id": bid, "i": 0})
        mysql.close()

    def parse2(self, response):
        mysql = msyqlHelper()
        res = self.json_encode(response.text)
        if res['code'] == 10000:
            data = dict()
            data['zhiyu_book_id'] = res['data']['bid']
            data['source_name'] = 'zy_content'
            data['name'] = res['data']['book_name']
            data['author'] = res['data']['author']
            data['intro'] = res['data']['Introduction']
            data['cover'] = res['data']['cover']
            data['category_name'] = res['data']['category_name']
            data['category_id'] = res['data']['category_id']
            data['status'] = res['data']['status']
            data['sequence'] = response.meta['i']
            bid = mysql.insertZyBook(data)
            # Record the local id of every imported book in the crawler stats.
            self.crawler.stats.get_value('bid_list').append(bid)
            mysql.close()
            url = self.base_url + '/chapterlist/{}'.format(res['data']['bid']) + self.query
            yield scrapy.Request(url, meta={"bid": bid, "book_id": res['data']['bid']}, callback=self.parse3)

    def parse3(self, response):
        res = self.json_encode(response.text)
        if res['code'] == 10000:
            for chapter in res['data']:
                chapter['bid'] = response.meta['bid']
                chapter['source_chapter_id'] = chapter['chapter_id']
                url = self.base_url + '/chaptercontent/{}/chapterid/{}'.format(response.meta['book_id'], chapter['chapter_id']) + self.query
                yield scrapy.Request(url, meta=chapter, callback=self.parse4)

    def parse4(self, response):
        res = self.json_encode(response.text)
        if res['code'] == 10000:
            mysql = msyqlHelper()
            # Chapter metadata was passed along from parse3 via request meta.
            meta = response.meta
            data = dict()
            data['bid'] = meta['bid']
            data['name'] = res['data']['chapter_name']
            data['sequence'] = meta['sequence']
            data['size'] = meta['size']
            data['is_vip'] = meta['is_vip']
            data['prev_cid'] = 0
            data['next_cid'] = 0
            data['recent_update_at'] = meta['updated_at']
            data['content'] = res['data']['chapter_content']
            data['ly_chapter_id'] = meta['source_chapter_id']
            # Method name kept as defined in ydyspider.mysql.
            mysql.inseraAll(data)
            mysql.close()

    def json_encode(self, jsonstr):
        # Despite the name, this decodes a JSON string into a Python object.
        return json.loads(jsonstr)
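
A note on how the spider is launched (not part of the file above): Scrapy exposes `-a name=value` command-line arguments as spider attributes, which is what the `hasattr(self, 'bid')` and `hasattr(self, 'cp_name')` checks in start_requests rely on. So `scrapy crawl zycontent` walks the full /booklist feed, while something like `scrapy crawl zycontent -a bid=1001,1002` (the ids here are made up) imports only the listed books via /bookdetail.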
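
The spider imports msyqlHelper from ydyspider.mysql, which is not shown here. Purely as a hedged sketch, the outline below fills in the four methods the spider calls (getZyBook, insertZyBook, inseraAll, close) using pymysql; the connection settings, table names, and column names are assumptions for illustration, not the project's actual schema.

import pymysql


class msyqlHelper(object):
    """Hypothetical database helper; tables and columns are illustrative."""

    def __init__(self):
        # Connection settings are placeholders.
        self.conn = pymysql.connect(host='127.0.0.1', user='root', password='',
                                    db='ydyspider', charset='utf8mb4',
                                    cursorclass=pymysql.cursors.DictCursor)

    def getZyBook(self, bid):
        # Return the already-imported book row for a source bid, or None.
        with self.conn.cursor() as cursor:
            cursor.execute('SELECT id FROM books WHERE zhiyu_book_id = %s', (bid,))
            return cursor.fetchone()

    def insertZyBook(self, data):
        # Insert a book record and return its new primary key.
        sql = 'INSERT INTO books ({}) VALUES ({})'.format(
            ', '.join(data.keys()), ', '.join(['%s'] * len(data)))
        with self.conn.cursor() as cursor:
            cursor.execute(sql, list(data.values()))
            new_id = cursor.lastrowid
        self.conn.commit()
        return new_id

    def inseraAll(self, data):
        # Insert a chapter record (method name kept as the spider spells it).
        sql = 'INSERT INTO chapters ({}) VALUES ({})'.format(
            ', '.join(data.keys()), ', '.join(['%s'] * len(data)))
        with self.conn.cursor() as cursor:
            cursor.execute(sql, list(data.values()))
        self.conn.commit()

    def close(self):
        self.conn.close()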