# -*- coding: utf-8 -*-
import json
import time

import scrapy

from ydyspider.mysql import msyqlHelper


class zycontentSpider(scrapy.Spider):
    """Crawl books, chapter lists and chapter contents from the zy_content
    API and store them through the project's MySQL helper."""
    name = 'zycontent'
    allowed_domains = ['60.204.150.173']
    query = '?channel_name=zhuishuyun&channel_key=123456'
    base_url = 'http://60.204.150.173:8094/api/output'
    custom_settings = {
        'DOWNLOAD_DELAY': 0.01,
        'LOG_FILE': 'ydyspider/log/' + name + time.strftime("%Y-%m-%d", time.localtime()) + '.log',
    }

    def start_requests(self):
        # Track the ids of books inserted during this run in the crawler stats.
        self.crawler.stats.set_value('bid_list', [])
        # "-a bid=..." limits the crawl to a comma-separated list of book ids.
        param = self.bid if hasattr(self, 'bid') else None
        if hasattr(self, 'cp_name'):
            cp_name = '' if self.cp_name is None or self.cp_name == 'zycontent' else self.cp_name
        else:
            cp_name = 'zycontent'
        if param is not None:
            for bid in param.split(','):
                url = self.base_url + '/bookdetail/{}'.format(bid) + self.query
                yield scrapy.Request(url, callback=self.parse2, meta={"zhiyu_book_id": bid, "i": 0})
        else:
            url = self.base_url + '/booklist' + self.query + '&cp_name={}'.format(cp_name)
            self.logger.info(url)
            yield scrapy.Request(url, callback=self.parse1)

    def parse1(self, response):
        """Parse the book list and request details for books not stored yet."""
        res = self.json_encode(response.text)
        book_list = res.get('data')
        mysql = msyqlHelper()
        for book_item in book_list:
            bid = book_item['bid']
            # Only fetch books that are not already in the local database.
            if mysql.getZyBook(bid) is None:
                url = self.base_url + '/bookdetail/{}'.format(bid) + self.query
                yield scrapy.Request(url, callback=self.parse2, meta={"zhiyu_book_id": bid, "i": 0})
        mysql.close()

    def parse2(self, response):
        """Store the book detail row, then request the book's chapter list."""
        res = self.json_encode(response.text)
        if res['code'] == 10000:
            data = dict()
            data['zhiyu_book_id'] = res['data']['bid']
            data['source_name'] = 'zy_content'
            data['name'] = res['data']['book_name']
            data['author'] = res['data']['author']
            data['intro'] = res['data']['Introduction']
            data['cover'] = res['data']['cover']
            data['category_name'] = res['data']['category_name']
            data['category_id'] = res['data']['category_id']
            data['status'] = res['data']['status']
            data['sequence'] = response.meta['i']
            mysql = msyqlHelper()
            bid = mysql.insertZyBook(data)
            mysql.close()
            self.crawler.stats.get_value('bid_list').append(bid)
            url = self.base_url + '/chapterlist/{}'.format(res['data']['bid']) + self.query
            yield scrapy.Request(url, meta={"bid": bid, "book_id": res['data']['bid']}, callback=self.parse3)

    def parse3(self, response):
        """Walk the chapter list and request each chapter's content."""
        res = self.json_encode(response.text)
        if res['code'] == 10000:
            for chapter in res['data']:
                # Carry the local book id and source chapter id along in meta.
                chapter['bid'] = response.meta['bid']
                chapter['source_chapter_id'] = chapter['chapter_id']
                url = self.base_url + '/chaptercontent/{}/chapterid/{}'.format(
                    response.meta['book_id'], chapter['chapter_id']) + self.query
                yield scrapy.Request(url, meta=chapter, callback=self.parse4)

    def parse4(self, response):
        """Store a single chapter row."""
        res = self.json_encode(response.text)
        if res['code'] == 10000:
            meta = response.meta
            data = dict()
            data['bid'] = meta['bid']
            data['name'] = res['data']['chapter_name']
            data['sequence'] = meta['sequence']
            data['size'] = meta['size']
            data['is_vip'] = meta['is_vip']
            # prev/next chapter ids default to 0 here.
            data['prev_cid'] = 0
            data['next_cid'] = 0
            data['recent_update_at'] = meta['updated_at']
            data['content'] = res['data']['chapter_content']
            data['ly_chapter_id'] = meta['source_chapter_id']
            mysql = msyqlHelper()
            mysql.inseraAll(data)
            mysql.close()

    def json_encode(self, jsonstr):
        # Despite its name, this decodes a JSON string into Python objects.
        return json.loads(jsonstr)
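
# A minimal usage sketch, assuming the standard Scrapy CLI and that this spider
# lives inside the ydyspider project (the book ids below are placeholders):
#
#   scrapy crawl zycontent                      # crawl the whole /booklist feed
#   scrapy crawl zycontent -a bid=101,102       # crawl only the given book ids
#   scrapy crawl zycontent -a cp_name=some_cp   # filter the book list by cp_name
#
# "-a" arguments become spider attributes, which is what the hasattr() checks
# in start_requests() look for.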