commit b54a37fde6 (tgz, 2 years ago)

+ 11 - 0
scrapy.cfg

@@ -0,0 +1,11 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.org/en/latest/deploy.html
+
+[settings]
+default = ydyspider.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = ydyspider
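
Once the url line under [deploy] is uncommented and points at a running scrapyd instance, the project can presumably be pushed with scrapyd-client:

    scrapyd-deploy -p ydyspider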

+ 5 - 0
ydyspider/.gitignore

@@ -0,0 +1,5 @@
+.idea/
+*.pyc
+*/__pycache__
+*/*.pyc
+.vscode/

+ 0 - 0
ydyspider/__init__.py


+ 428 - 0
ydyspider/baseSpider.py

@@ -0,0 +1,428 @@
+# -*- coding: utf-8 -*-
+
+import time
+import scrapy
+from ydyspider.items import BookInfoItem, ChapterItem
+from ydyspider.mysqlHelper import MysqlHelper
+import hashlib
+import random
+
+from ydyspider.pipelines import formatcontent, removePunctuation
+
+
+def md5(token):
+    m = hashlib.md5()
+    m.update(token.encode('utf-8'))
+    return m.hexdigest()
+
+
+def sign(param, key):
+    param = sorted(param.items(), key=lambda x: x[0])
+    string = ''
+    for item in param:
+        string = string + '{}={}&'.format(str(item[0]), str(item[1]))
+    string = string + 'key={}'.format(key)
+    return md5(string).upper()
+
+
+def sha1(token):
+    m = hashlib.sha1()
+    m.update(token.encode('utf-8'))
+    return m.hexdigest()
+
+
+def random_str(slen=10):
+    seed = "1234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
+    sa = []
+    for i in range(slen):
+        sa.append(random.choice(seed))
+    return ''.join(sa)
+
+
+class baseSpider(scrapy.Spider):
+    name = ''
+    allowed_domains = []
+    base_url = ''
+    source = ''
+    source_name = ''
+
+    def __init__(self, host, user, password, db, stats):
+        scrapy.Spider.__init__(self)
+        source = self.source
+        self.mysqlHelper = MysqlHelper(host=host, user=user, password=password, db=db, source=source)
+        self.__stats = stats
+        self.__source = self.source
+        self.__stats.set_value('bid_list', [])
+        self.__stats.set_value('spider_type', 'add')
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        settings = crawler.settings
+        host = settings.get('MYSQL_HOST')
+        user = settings.get('MYSQL_USER')
+        password = settings.get('MYSQL_PWD')
+        db = settings.get('MYSQL_DB')
+
+        return cls(host=host, user=user, password=password, db=db, stats=crawler.stats)
+
+    def start_requests(self):
+        yield scrapy.Request(self.get_start_url(), callback=self.parse_book_list)
+
+    def parse_book_list(self, response):
+        result = self.bid_list_result(response)
+        for item in result:
+            bid = item['id']
+            exist = self.mysqlHelper.get_book_info_by_source(bid)
+            if exist is not None:
+                continue
+            url = self.get_book_info_url(bid)
+            yield scrapy.Request(url, callback=self.parse_book_info)
+
+    def parse_book_info(self, response):
+        if response.text == '':
+            return None
+        now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+        result = self.book_info_result(response)
+        if result is None:
+            return None
+        source_bid = result.get('bid')
+        book_info_item = BookInfoItem()
+        book_info_item['source_bid'] = source_bid
+        book_info_item['name'] = result['name']
+        book_info_item['author'] = result['author']
+        book_info_item['intro'] = result['intro']
+        book_info_item['cover'] = result['cover']
+        book_info_item['keyword'] = result['keyword']
+        book_info_item['category_id'] = 0 if result.get('category_id') is None else result.get('category_id')
+        book_info_item['status'] = result['status']
+        book_info_item['chapter_count'] = 0 if result.get('chapter_count') is None else result.get('chapter_count')
+        book_info_item['first_cid'] = 0
+        book_info_item['last_cid'] = 0
+        book_info_item['size'] = 0 if result.get('size') is None else result.get('size')
+        book_info_item['last_chapter'] = '' if result.get('last_chapter') is None else result.get('last_chapter')
+        book_info_item['category_name'] = result['category_name']
+        book_info_item['source_name'] = self.source
+        book_info_item['gender'] = 0 if result.get('gender') is None else result.get('gender')
+        book_info_item['updated_at'] = now
+        book_info_item['created_at'] = now
+        bid = self.mysqlHelper.insert_book(book_info_item)
+        self.__stats.get_value('bid_list').append(bid)
+        url = self.get_chapter_list_url(source_bid)
+        meta = {'bid': bid, 'source_bid': source_bid}
+        yield scrapy.Request(url, self.parse_chapter_list, meta=meta)
+
+    def parse_chapter_list(self, response):
+        if response.text == '':
+            return None
+        result = self.chapter_list_result(response)
+        if result is None:
+            return None
+        bid = response.meta['bid']
+        source_bid = response.meta['source_bid']
+        for chapter_item in result:
+            meta = chapter_item
+            cid = chapter_item['source_chapter_id']
+            meta['bid'] = bid
+            url = self.get_chapter_content_url(source_bid, cid)
+            yield scrapy.Request(url, self.parse_chapter_content, meta=meta)
+
+    def parse_chapter_content(self, response):
+        if response.text == '':
+            return None
+        now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+        result = self.chapter_content_result(response)
+        meta = response.meta
+        chapter_item = ChapterItem()
+        chapter_item['bid'] = meta['bid']
+        chapter_item['name'] = meta['name']
+        chapter_item['sequence'] = meta['sequence']
+        chapter_item['is_vip'] = meta['is_vip']
+        chapter_item['prev_cid'] = 0
+        chapter_item['next_cid'] = 0
+        chapter_item['recent_update_at'] = meta['recent_update_at']
+        content = formatcontent(result['content'])
+        chapter_item['content'] = content
+        chapter_item['size'] = len(removePunctuation(content))
+        chapter_item['chapter_content_id'] = 0
+        chapter_item['source_chapter_id'] = meta['source_chapter_id']
+        chapter_item['created_at'] = now
+        chapter_item['updated_at'] = now
+        if result.get('size') is not None:
+            chapter_item['size'] = result.get('size')
+        if result.get('is_vip') is not None:
+            chapter_item['is_vip'] = result.get('is_vip')
+        if result.get('name') is not None:
+            chapter_item['name'] = result.get('name')
+        if result.get('recent_update_at') is not None:
+            chapter_item['recent_update_at'] = result.get('recent_update_at')
+        if result.get('source_chapter_id') is not None:
+            chapter_item['source_chapter_id'] = result.get('source_chapter_id')
+        yield chapter_item
+
+    def get_start_url(self):
+        raise NotImplementedError
+
+    def bid_list_result(self, response):
+        raise NotImplementedError
+
+    def get_book_info_url(self, bid):
+        raise NotImplementedError
+
+    def book_info_result(self, response):
+        raise NotImplementedError
+
+    def get_chapter_list_url(self, bid):
+        raise NotImplementedError
+
+    def chapter_list_result(self, response):
+        raise NotImplementedError
+
+    def get_chapter_content_url(self, bid, cid):
+        raise NotImplementedError
+
+    def chapter_content_result(self, response):
+        raise NotImplementedError
+
+
+class baseUpdateSpider(scrapy.Spider):
+    name = ''
+    allowed_domains = []
+    base_url = ''
+    source = ''
+    source_name = ''
+
+    def __init__(self, host, user, password, db, stats):
+        scrapy.Spider.__init__(self)
+        source = self.source
+        self.mysqlHelper = MysqlHelper(host=host, user=user, password=password, db=db, source=source)
+        self.__stats = stats
+        self.__stats.set_value('spider_type', 'update')
+        self.__stats.set_value('bid_list', [])
+        self.__is_first = True
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        settings = crawler.settings
+        host = settings.get('MYSQL_HOST')
+        user = settings.get('MYSQL_USER')
+        password = settings.get('MYSQL_PWD')
+        db = settings.get('MYSQL_DB')
+        return cls(host=host, user=user, password=password, db=db, stats=crawler.stats)
+
+    def start_requests(self):
+        book_list = self.mysqlHelper.get_need_update_book_list()
+        if book_list is not None:
+            for book in book_list:
+                url = self.get_chapter_list_url(book['source_bid'])
+                meta = {'bid': book['id'], 'source_bid': book['source_bid']}
+                yield scrapy.Request(url, callback=self.parse_chapter_list, meta=meta)
+
+    def parse_chapter_list(self, response):
+        if response.text == '':
+            return None
+        chapter_list = self.chapter_list_result(response)
+        meta = response.meta
+        if chapter_list is not None:
+            bid = response.meta.get('bid')
+            source_bid = response.meta.get('source_bid')
+            last_chapter = self.mysqlHelper.get_last_cid_by_bid(bid)
+            start = False
+            if last_chapter is None:
+                start = True
+                last_source_cid = 0
+                last_sequence = 0
+                last_chapter_id = 0
+            else:
+                last_source_cid = last_chapter['source_chapter_id']
+                last_sequence = last_chapter['sequence']
+                last_chapter_id = last_chapter['id']
+
+            has_new_chapter = False
+            for chapter_item in chapter_list:
+                if not start:
+                    if int(chapter_item['source_chapter_id']) == int(last_source_cid):
+                        start = True
+                    continue
+                if not has_new_chapter:
+                    self.__stats.get_value('bid_list').append(
+                        {"bid": meta['bid'], 'start': last_chapter_id})
+                    has_new_chapter = True
+                cid = chapter_item['source_chapter_id']
+                last_sequence = last_sequence + 1
+                if chapter_item['sequence'] == 0:
+                    chapter_item['sequence'] = last_sequence
+                meta = chapter_item
+                meta['bid'] = bid
+                url = self.get_chapter_content_url(source_bid, cid)
+                yield scrapy.Request(url, callback=self.parse_chapter_content, meta=meta)
+
+    def parse_chapter_content(self, response):
+        if response.text == '':
+            return None
+        now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+        result = self.chapter_content_result(response)
+        meta = response.meta
+        chapter_item = ChapterItem()
+        chapter_item['bid'] = meta['bid']
+        chapter_item['name'] = meta['name']
+        chapter_item['sequence'] = meta['sequence']
+        chapter_item['is_vip'] = meta['is_vip']
+        chapter_item['prev_cid'] = 0
+        chapter_item['next_cid'] = 0
+        chapter_item['recent_update_at'] = meta['recent_update_at']
+        content = formatcontent(result['content'])
+        chapter_item['size'] = len(removePunctuation(content))
+        chapter_item['content'] = content
+        chapter_item['chapter_content_id'] = 0
+        chapter_item['source_chapter_id'] = meta['source_chapter_id']
+        chapter_item['created_at'] = now
+        chapter_item['updated_at'] = now
+        if result.get('is_vip') is not None:
+            chapter_item['is_vip'] = result.get('is_vip')
+        if result.get('name') is not None:
+            chapter_item['name'] = result.get('name')
+        if result.get('recent_update_at') is not None:
+            chapter_item['recent_update_at'] = result.get('recent_update_at')
+        if result.get('source_chapter_id') is not None:
+            chapter_item['source_chapter_id'] = result.get('source_chapter_id')
+        yield chapter_item
+
+    def get_chapter_list_url(self, bid):
+        raise NotImplementedError
+
+    def chapter_list_result(self, response):
+        raise NotImplementedError
+
+    def get_chapter_content_url(self, bid, cid):
+        raise NotImplementedError
+
+    def chapter_content_result(self, response):
+        raise NotImplementedError
+
+
+class fixChapterSpider(scrapy.Spider):
+    name = ''
+    source = ''
+
+    def __init__(self, host, user, password, db, bid_list, stats):
+        scrapy.Spider.__init__(self)
+        source = self.source
+        self.mysqlHelper = MysqlHelper(host=host, user=user, password=password, db=db, source=source)
+        self.__stats = stats
+        self.__is_first = True
+        self.bid_list = bid_list
+
+    @classmethod
+    def from_crawler(cls, crawler, *args, **kwargs):
+        settings = crawler.settings
+        host = settings.get('MYSQL_HOST')
+        user = settings.get('MYSQL_USER')
+        password = settings.get('MYSQL_PWD')
+        db = settings.get('MYSQL_DB')
+        bid = kwargs.get('bid')
+        if bid is not None:
+            bid_list = bid.split(',')
+        else:
+            bid_list = []
+        return cls(host=host, user=user, password=password, db=db, bid_list=bid_list, stats=crawler.stats)
+
+    def start_requests(self):
+        if not self.bid_list:
+            return
+        for book in self.bid_list:
+            info = self.mysqlHelper.get_book_info_by_id(book)
+            if info is None:
+                continue
+            url = self.get_chapter_list_url(info['source_bid'])
+            meta = {'bid': book, 'source_bid': info['source_bid']}
+            yield scrapy.Request(url, callback=self.parse_chapter_list, meta=meta)
+
+    def parse_chapter_list(self, response):
+        if response.text == '':
+            return None
+        chapter_list = self.chapter_list_result(response)
+        if chapter_list is not None:
+            bid = response.meta.get('bid')
+            source_bid = response.meta.get('source_bid')
+            last_sequence = 0
+            for chapter_item in chapter_list:
+                last_sequence = last_sequence + 1
+
+                if chapter_item['sequence'] == 0:
+                    chapter_item['sequence'] = last_sequence
+                chapter_info = self.mysqlHelper.get_cid_by_bid_sequence(bid, last_sequence)
+                cid = chapter_item['source_chapter_id']
+
+                meta = chapter_item
+                if chapter_info is not None:
+                    meta['type'] = 'update'
+                    meta['chapter_content_id'] = chapter_info['chapter_content_id']
+                    meta['cid'] = chapter_info['id']
+                meta['bid'] = bid
+                url = self.get_chapter_content_url(source_bid, cid)
+                yield scrapy.Request(url, callback=self.parse_chapter_content, meta=meta)
+
+    def parse_chapter_content(self, response):
+        if response.text == '':
+            return None
+        now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+        result = self.chapter_content_result(response)
+        meta = response.meta
+        content = formatcontent(result['content'])
+        meta['size'] = len(removePunctuation(content))
+        meta['content'] = content
+        if result.get('size') is not None:
+            meta['size'] = result.get('size')
+        if result.get('is_vip') is not None:
+            meta['is_vip'] = result.get('is_vip')
+        if result.get('name') is not None:
+            meta['name'] = result.get('name')
+        if result.get('recent_update_at') is not None:
+            meta['recent_update_at'] = result.get('recent_update_at')
+        if result.get('source_chapter_id') is not None:
+            meta['source_chapter_id'] = result.get('source_chapter_id')
+        if meta.get('type') is not None:
+            self.mysqlHelper.update_content(meta['chapter_content_id'], meta['name'], meta['content'])
+            self.mysqlHelper.update_chapter(meta)
+        else:
+            chapter_item = ChapterItem()
+            chapter_item['bid'] = meta['bid']
+            chapter_item['name'] = meta['name']
+            chapter_item['sequence'] = meta['sequence']
+            chapter_item['size'] = meta['size']
+            chapter_item['is_vip'] = meta['is_vip']
+            chapter_item['prev_cid'] = 0
+            chapter_item['next_cid'] = 0
+            chapter_item['recent_update_at'] = meta['recent_update_at']
+            chapter_item['content'] = meta['content']
+            if meta.get('chapter_content_id') is not None:
+                chapter_item['chapter_content_id'] = meta['chapter_content_id']
+            else:
+                chapter_item['chapter_content_id'] = 0
+            chapter_item['source_chapter_id'] = meta['source_chapter_id']
+            chapter_item['created_at'] = now
+            chapter_item['updated_at'] = now
+            if result.get('size') is not None:
+                chapter_item['size'] = result.get('size')
+            if result.get('is_vip') is not None:
+                chapter_item['is_vip'] = result.get('is_vip')
+            if result.get('name') is not None:
+                chapter_item['name'] = result.get('name')
+            if result.get('recent_update_at') is not None:
+                chapter_item['recent_update_at'] = result.get('recent_update_at')
+            if result.get('source_chapter_id') is not None:
+                chapter_item['source_chapter_id'] = result.get('source_chapter_id')
+            yield chapter_item
+
+    def get_chapter_list_url(self, bid):
+        raise NotImplementedError
+
+    def chapter_list_result(self, response):
+        raise NotImplementedError
+
+    def get_chapter_content_url(self, bid, cid):
+        raise NotImplementedError
+
+    def chapter_content_result(self, response):
+        raise NotImplementedError
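
A concrete spider built on baseSpider only supplies the URL builders and response parsers above; everything else (dedup against MySQL, item assembly, stats bookkeeping) is inherited. A minimal sketch against a hypothetical JSON source (the endpoint paths and field names here are illustrative, not a real upstream):

    # -*- coding: utf-8 -*-
    import json

    from ydyspider.baseSpider import baseSpider


    class exampleSpider(baseSpider):
        name = 'example'                      # hypothetical source, for illustration only
        allowed_domains = ['api.example.com']
        base_url = 'http://api.example.com'
        source = 'example'
        source_name = 'Example'

        def get_start_url(self):
            return self.base_url + '/booklist'

        def bid_list_result(self, response):
            # expected shape: [{'id': ...}, ...]
            return json.loads(response.text)['data']

        def get_book_info_url(self, bid):
            return self.base_url + '/bookInfo/{}'.format(bid)

        def book_info_result(self, response):
            # must carry at least bid, name, author, intro, cover, keyword,
            # status and category_name (see parse_book_info above)
            return json.loads(response.text)['data']

        def get_chapter_list_url(self, bid):
            return self.base_url + '/chapterlist/{}'.format(bid)

        def chapter_list_result(self, response):
            return json.loads(response.text)['data']

        def get_chapter_content_url(self, bid, cid):
            return self.base_url + '/chapterContent/{}/{}'.format(bid, cid)

        def chapter_content_result(self, response):
            return json.loads(response.text)['data']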

+ 51 - 0
ydyspider/items.py

@@ -0,0 +1,51 @@
+# -*- coding: utf-8 -*-
+
+# Define here the models for your scraped items
+#
+# See documentation in:
+# http://doc.scrapy.org/en/latest/topics/items.html
+
+import scrapy
+
+
+class YdyspiderItem(scrapy.Item):
+    # define the fields for your item here like:
+    # name = scrapy.Field()
+    pass
+
+
+class BookInfoItem(scrapy.Item):
+    source_bid = scrapy.Field()
+    source_name = scrapy.Field()
+    name = scrapy.Field()
+    author = scrapy.Field()
+    intro = scrapy.Field()
+    cover = scrapy.Field()
+    keyword = scrapy.Field()
+    category_id = scrapy.Field()
+    status = scrapy.Field()
+    chapter_count = scrapy.Field()
+    first_cid = scrapy.Field()
+    last_cid = scrapy.Field()
+    size = scrapy.Field()
+    last_chapter = scrapy.Field()
+    category_name = scrapy.Field()
+    gender = scrapy.Field()
+    updated_at = scrapy.Field()
+    created_at = scrapy.Field()
+
+
+class ChapterItem(scrapy.Item):
+    bid = scrapy.Field()
+    name = scrapy.Field()
+    sequence = scrapy.Field()
+    size = scrapy.Field()
+    is_vip = scrapy.Field()
+    prev_cid = scrapy.Field()
+    next_cid = scrapy.Field()
+    recent_update_at = scrapy.Field()
+    content = scrapy.Field()
+    chapter_content_id = scrapy.Field()
+    source_chapter_id = scrapy.Field()
+    created_at = scrapy.Field()
+    updated_at = scrapy.Field()

+ 0 - 0
ydyspider/log/log.log


+ 56 - 0
ydyspider/middlewares.py

@@ -0,0 +1,56 @@
+# -*- coding: utf-8 -*-
+
+# Define here the models for your spider middleware
+#
+# See documentation in:
+# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
+
+from scrapy import signals
+
+
+class YdyspiderSpiderMiddleware(object):
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the spider middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_spider_input(self, response, spider):
+        # Called for each response that goes through the spider
+        # middleware and into the spider.
+
+        # Should return None or raise an exception.
+        return None
+
+    def process_spider_output(self, response, result, spider):
+        # Called with the results returned from the Spider, after
+        # it has processed the response.
+
+        # Must return an iterable of Request, dict or Item objects.
+        for i in result:
+            yield i
+
+    def process_spider_exception(self, response, exception, spider):
+        # Called when a spider or process_spider_input() method
+        # (from other spider middleware) raises an exception.
+
+        # Should return either None or an iterable of Response, dict
+        # or Item objects.
+        pass
+
+    def process_start_requests(self, start_requests, spider):
+        # Called with the start requests of the spider, and works
+        # similarly to the process_spider_output() method, except
+        # that it doesn’t have a response associated.
+
+        # Must return only requests (not items).
+        for r in start_requests:
+            yield r
+
+    def spider_opened(self, spider):
+        spider.logger.info('Spider opened: %s' % spider.name)

+ 101 - 0
ydyspider/mysql.py

@@ -0,0 +1,101 @@
+# -*- coding: utf-8 -*-
+import time
+
+import pymysql.cursors
+
+
+class msyqlHelper(object):
+	def __init__(self):
+		#self.conn = pymysql.connect(host='rm-bp1sc28q8w1slr0l4.mysql.rds.aliyuncs.com',user='zhuishuyun',password='Zhuishu!zwkj2066',db='yueduyun',charset='utf8mb4',cursorclass=pymysql.cursors.DictCursor)
+		self.conn = pymysql.connect(host='172.16.115.166',user='zsy_online',password='zsyBeid!2511',db='yueduyun',charset='utf8mb4',cursorclass=pymysql.cursors.DictCursor)
+		self.encoding = 'utf-8'
+	
+	def insertbook(self,data):
+		now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+		sql = "insert into books (ly_bid,name,author,intro,cover,category_name,category_id,status,sequence,chapter_count,first_cid,last_cid,size,last_chapter,`created_at`,`updated_at`) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
+		id = 0
+		with self.conn.cursor() as cursor:
+			res = cursor.execute(sql,(data['ly_bid'],data['name'],data['author'],data['intro'],data['cover'],data['category_name'],data['category_id'],data['status'],data['sequence'],'0','0','0','0','0',now,now))
+			id = int(cursor.lastrowid)
+		self.conn.commit()
+		return id
+
+
+	def insertbookV2(self,data):
+		now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+		sql = "insert into books (ly_bid,name,author,intro,cover,category_name,category_id,status,sequence,chapter_count,first_cid,last_cid,size,last_chapter,`created_at`,`updated_at`,source_name) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
+		id = 0
+		with self.conn.cursor() as cursor:
+			res = cursor.execute(sql,(data['ly_bid'],data['name'],data['author'],data['intro'],data['cover'],data['category_name'],data['category_id'],data['status'],data['sequence'],'0','0','0','0','0',now,now,data['source_name']))
+			id = int(cursor.lastrowid)
+		self.conn.commit()
+		return id
+
+	def insertZwBook(self, data):
+		now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+		sql = "insert into books (zw_id,name,author,intro,cover,category_name,category_id,status,sequence,chapter_count,first_cid,last_cid,size,last_chapter,`created_at`,`updated_at`,source_name) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
+		with self.conn.cursor() as cursor:
+			res = cursor.execute(sql,(data['zw_bid'],data['name'],data['author'],data['intro'],data['cover'],data['category_name'],data['category_id'],data['status'],data['sequence'],'0','0','0','0','0',now,now,data['source_name']))
+			id = int(cursor.lastrowid)
+		self.conn.commit()
+		return id
+	
+	def insertZyBook(self, data):
+		now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+		sql = "insert into books (zhiyu_book_id,name,author,intro,cover,category_name,category_id,status,sequence,chapter_count,first_cid,last_cid,size,last_chapter,`created_at`,`updated_at`,source_name) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
+		with self.conn.cursor() as cursor:
+			res = cursor.execute(sql,(data['zhiyu_book_id'],data['name'],data['author'],data['intro'],data['cover'],data['category_name'],data['category_id'],data['status'],data['sequence'],'0','0','0','0','0',now,now,data['source_name']))
+			id = int(cursor.lastrowid)
+		self.conn.commit()
+		return id
+
+	def inseraAll(self,data):
+		now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+		sql = "INSERT INTO `chapters` (`bid`, `name`,`sequence`,`size`,`is_vip`,`prev_cid`,`next_cid`,`recent_update_at`,`created_at`,`updated_at`,`content`,`ly_chapter_id`) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
+		with self.conn.cursor() as cursor:
+			cursor.execute(sql,(data['bid'],data['name'],data['sequence'],data['size'],data['is_vip'],data['prev_cid'],data['next_cid'],now,now,now,data['content'],data['ly_chapter_id']))
+			id = int(cursor.lastrowid)
+		self.conn.commit()
+		return id
+
+	def selectbylyid(self,id):
+		with self.conn.cursor() as cursor:
+			sql = "select ly_bid from books where ly_bid=%s"
+			cursor.execute(sql, (id,))
+			result = cursor.fetchone()
+		self.conn.commit()
+		return result
+	
+	def getUncaompleteBook(self):
+		sql = "select id,ly_bid from books where status=0"
+		result = None
+		with self.conn.cursor() as cursor:
+			cursor.execute(sql)
+			result = cursor.fetchall()
+		self.conn.commit()
+		return result
+
+	def getChapterByBidAndName(self,bid,name):
+		sql = "select id from chapters where bid=%s and name=%s"
+		result = None
+		with self.conn.cursor() as cursor:
+			cursor.execute(sql,(bid,name))
+			result = cursor.fetchone()
+		self.conn.commit()
+		return result
+
+	def close(self):
+		self.conn.close()
+
+	def getByZwSource(self, bid):
+		sql = 'select id from books where zw_id=%s'
+		with self.conn.cursor() as cursor:
+			cursor.execute(sql, (bid, ))
+			result = cursor.fetchone()
+		self.conn.commit()
+		return result
+
+	def getLianshang(self):
+		sql = "SELECT zw_id FROM books a JOIN book_configs b on a.id = b.bid WHERE b.cp_source = 'lianshang' order by zw_id desc LIMIT 1"
+		with self.conn.cursor() as cursor:
+			cursor.execute(sql)
+			result = cursor.fetchone()
+		self.conn.commit()
+		return result
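
This is the legacy helper used by the older spiders (7lou, ydy, yunduyun, zwcontent*, zycontent); connection details are hardcoded in __init__. A rough usage sketch (the id is illustrative):

    from ydyspider.mysql import msyqlHelper

    mysql = msyqlHelper()                 # connects with the credentials baked into __init__
    if mysql.selectbylyid(123) is None:   # 123 is an illustrative ly_bid
        print('book not imported yet')
    mysql.close()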

+ 141 - 0
ydyspider/mysqlHelper.py

@@ -0,0 +1,141 @@
+# -*- coding: utf-8 -*-
+import time
+
+import pymysql.cursors
+
+
+class MysqlHelper(object):
+    def __init__(self, host, user, password, db, source):
+        self.__conn = pymysql.connect(host=host, user=user, password=password, db=db, charset='utf8mb4',
+                                      cursorclass=pymysql.cursors.DictCursor)
+        self.source = source
+
+    def get_connection(self):
+        return self.__conn
+
+    def get_book_info_by_source(self, source_bid):
+        sql = 'select id from books where source_bid=%s and source=%s'
+        with self.__conn.cursor() as cursor:
+            cursor.execute(sql, (source_bid, self.source))
+            result = cursor.fetchone()
+        self.__conn.commit()
+        return result
+
+    def get_need_update_book_list(self):
+        sql = 'select id,source_bid from books where source=%s and `status` = 0'
+        with self.__conn.cursor() as cursor:
+            cursor.execute(sql, (self.source, ))
+            result = cursor.fetchall()
+        self.__conn.commit()
+        return result
+
+    def get_last_cid_by_bid(self, bid):
+        sql = "select id,bid,`name`,sequence,source_chapter_id from chapters where bid = %s" \
+              " order by sequence desc limit 1"
+        with self.__conn.cursor() as cursor:
+            cursor.execute(sql, (int(bid), ))
+            result = cursor.fetchone()
+        self.__conn.commit()
+        return result
+
+    def insert_book(self, item):
+        sql = '''
+        insert into books(source_bid, `name`,author, intro, cover ,keyword , category_id,status,
+        chapter_count,first_cid,last_cid,`size`,last_chapter,category_name,source,updated_at,created_at)
+        values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
+        '''
+        with self.__conn.cursor() as cursor:
+            cursor.execute(sql, (item.get('source_bid'),
+                                 item.get('name'),
+                                 item.get('author'),
+                                 item.get('intro'),
+                                 item.get('cover'),
+                                 item.get('keyword'),
+                                 item.get('category_id'),
+                                 item.get('status'),
+                                 item.get('chapter_count'),
+                                 item.get('first_cid'),
+                                 item.get('last_cid'),
+                                 item.get('size'),
+                                 item.get('last_chapter'),
+                                 item.get('category_name'),
+                                 item.get('source_name'),  # books.source column; the spiders store self.source here
+                                 item.get('updated_at'),
+                                 item.get('created_at')
+                                 ))
+            bid = int(cursor.lastrowid)
+        self.__conn.commit()
+        return bid
+
+    def insert_chapter(self, item):
+        chapter_content_id = self.insert_content(item)
+        sql = "INSERT INTO `chapters` (`bid`, `name`,`sequence`,`size`,`is_vip`,`prev_cid`,`next_cid`,`recent_update_at`,`created_at`,`updated_at`,`chapter_content_id`,source_chapter_id) " \
+              "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
+        with self.__conn.cursor() as cursor:
+            cursor.execute(sql, (
+                item['bid'], item['name'], item['sequence'], item['size'], item['is_vip'], item['prev_cid'],
+                item['next_cid'], item['recent_update_at'], item['created_at'], item['updated_at'], chapter_content_id,
+                item['source_chapter_id']))
+            cid = int(cursor.lastrowid)
+        self.__conn.commit()
+        return cid
+
+    def insert_content(self, item):
+        sql = "insert into chapter_contents (chapter_name,content,created_at,updated_at) values (%s,%s,%s,%s)"
+        with self.__conn.cursor() as cursor:
+            cursor.execute(sql, (
+                item['name'], item['content'], item['created_at'], item['updated_at']))
+            content_id = int(cursor.lastrowid)
+        self.__conn.commit()
+        return content_id
+
+    def get_book_list(self):
+        sql = "select id,source_bid from books where source=%s"
+        with self.__conn.cursor() as cursor:
+            cursor.execute(sql, (self.source,))
+            result = cursor.fetchall()
+        self.__conn.commit()
+        return result
+
+    def get_chapter_info_by_source_cid(self, bid, source_chapter_id):
+        sql = 'select id from chapters where bid=%s and source_chapter_id=%s'
+        with self.__conn.cursor() as cursor:
+            cursor.execute(sql, (bid, source_chapter_id))
+            result = cursor.fetchone()
+        self.__conn.commit()
+        return result
+
+    def get_book_info_by_id(self, bid):
+        sql = 'select source_bid from books where id=%s and source=%s'
+        with self.__conn.cursor() as cursor:
+            cursor.execute(sql, (int(bid), self.source))
+            result = cursor.fetchone()
+        self.__conn.commit()
+        return result
+
+    def get_cid_by_bid_sequence(self, bid, sequence):
+        sql = "select id,chapter_content_id from chapters where  bid = %s and sequence=%s"
+        with self.__conn.cursor() as cursor:
+            cursor.execute(sql, (int(bid), int(sequence)))
+            result = cursor.fetchone()
+        self.__conn.commit()
+        return result
+
+    def update_content(self, content_id, chapter_name, content):
+        now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+        sql = 'update chapter_contents set chapter_name=%s,content=%s,updated_at=%s  where id=%s'
+        with self.__conn.cursor() as cursor:
+            cursor.execute(sql, (
+                chapter_name, content, now, int(content_id)))
+        self.__conn.commit()
+
+    def update_chapter(self, item):
+        now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+        sql = 'update chapters set `name`=%s,`sequence`=%s,`size`=%s,`is_vip`=%s,' \
+              'updated_at=%s,`source_chapter_id`=%s where id = %s'
+        with self.__conn.cursor() as cursor:
+            cid = int(item['cid'])
+            cursor.execute(sql, (
+                item['name'], item['sequence'], item['size'], item['is_vip'], now,
+                item['source_chapter_id'], cid))
+        self.__conn.commit()
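
The newer helper takes its connection parameters from the caller (the base spiders pass them in from the Scrapy settings) and scopes its queries by source. A small sketch, with placeholder credentials:

    from ydyspider.mysqlHelper import MysqlHelper

    helper = MysqlHelper(host='127.0.0.1', user='spider', password='secret',
                         db='yueduyun', source='zw_content')
    if helper.get_book_info_by_source(42) is None:   # 42 is an illustrative source_bid
        print('book 42 has not been imported from this source yet')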

+ 49 - 0
ydyspider/pipelines.py

@@ -0,0 +1,49 @@
+# -*- coding: utf-8 -*-
+
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import os
+import re
+
+
+def formatcontent(content):
+    content = content.replace(' ', '')
+    content = content.replace('<p>', '')
+    content = content.replace('</p>', "\r\n")
+    content = content.splitlines()
+    content = map(lambda s: s.strip(), content)
+    content = filter(lambda s: s != '', content)
+    content = '\r\n'.join(content)
+    return content.strip()
+
+
+def removePunctuation(text):
+    punctuation = '!,;:?"\'、,;!”“。?,'
+    text = re.sub(r'[{}]+'.format(punctuation), ' ', text)
+    return text.strip().replace('\r\n', '').replace('\n', '').replace('\r', '')
+
+
+class YdyspiderPipeline(object):
+
+    def __init__(self, stats):
+        self.__stats = stats
+
+    def process_item(self, item, spider):
+        return item
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        stats = crawler.stats
+        return cls(
+            stats=stats
+        )
+
+    def close_spider(self, spider):
+        bid_list = self.__stats.get_value('bid_list')
+        if bid_list is not None:
+            for bid in bid_list:
+                command = '/usr/local/php/bin/php /home/www/zhuishuyun_wap/artisan book:afs %s ' % bid
+                os.system(command)
+
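
formatcontent strips <p> markup and blank lines and rejoins the text with \r\n; removePunctuation is used to compute chapter size without punctuation. A quick check of what they do:

    print(formatcontent('<p>第一章</p><p></p><p>正文内容。</p>'))
    # -> '第一章\r\n正文内容。'
    print(len(removePunctuation('正文内容。')))
    # -> 4 (the full stop is replaced by a space and stripped)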

+ 90 - 0
ydyspider/settings.py

@@ -0,0 +1,90 @@
+# -*- coding: utf-8 -*-
+
+# Scrapy settings for ydyspider project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used. You can find more settings consulting the documentation:
+#
+#     http://doc.scrapy.org/en/latest/topics/settings.html
+#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
+#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
+
+BOT_NAME = 'ydyspider'
+
+SPIDER_MODULES = ['ydyspider.spiders']
+NEWSPIDER_MODULE = 'ydyspider.spiders'
+
+
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+#USER_AGENT = 'ydyspider (+http://www.yourdomain.com)'
+
+# Obey robots.txt rules
+ROBOTSTXT_OBEY = False
+
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+#CONCURRENT_REQUESTS = 32
+
+# Configure a delay for requests for the same website (default: 0)
+# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+#DOWNLOAD_DELAY = 3
+# The download delay setting will honor only one of:
+#CONCURRENT_REQUESTS_PER_DOMAIN = 16
+#CONCURRENT_REQUESTS_PER_IP = 16
+
+# Disable cookies (enabled by default)
+#COOKIES_ENABLED = False
+
+# Disable Telnet Console (enabled by default)
+#TELNETCONSOLE_ENABLED = False
+
+# Override the default request headers:
+DEFAULT_REQUEST_HEADERS = {
+   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+   'Accept-Language': 'en',
+}
+
+# Enable or disable spider middlewares
+# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
+#SPIDER_MIDDLEWARES = {
+#    'ydyspider.middlewares.YdyspiderSpiderMiddleware': 543,
+#}
+
+# Enable or disable downloader middlewares
+# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
+#DOWNLOADER_MIDDLEWARES = {
+#    'ydyspider.middlewares.MyCustomDownloaderMiddleware': 543,
+#}
+
+# Enable or disable extensions
+# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
+#EXTENSIONS = {
+#    'scrapy.extensions.telnet.TelnetConsole': None,
+#}
+
+# Configure item pipelines
+# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
+ITEM_PIPELINES = {
+    'ydyspider.pipelines.YdyspiderPipeline': 300,
+}
+
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
+#AUTOTHROTTLE_ENABLED = True
+# The initial download delay
+#AUTOTHROTTLE_START_DELAY = 5
+# The maximum download delay to be set in case of high latencies
+#AUTOTHROTTLE_MAX_DELAY = 60
+# The average number of requests Scrapy should be sending in parallel to
+# each remote server
+#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# Enable showing throttling stats for every response received:
+#AUTOTHROTTLE_DEBUG = False
+
+# Enable and configure HTTP caching (disabled by default)
+# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+#HTTPCACHE_ENABLED = True
+#HTTPCACHE_EXPIRATION_SECS = 0
+#HTTPCACHE_DIR = 'httpcache'
+#HTTPCACHE_IGNORE_HTTP_CODES = []
+#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
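
Note that baseSpider, baseUpdateSpider and fixChapterSpider read MYSQL_HOST, MYSQL_USER, MYSQL_PWD and MYSQL_DB via crawler.settings, but this file does not define them; they presumably come from a local settings override or -s options on the command line. The expected keys, with placeholder values:

    MYSQL_HOST = '127.0.0.1'
    MYSQL_USER = 'spider'
    MYSQL_PWD = 'secret'
    MYSQL_DB = 'yueduyun'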

+ 75 - 0
ydyspider/spiders/7lou.py

@@ -0,0 +1,75 @@
+# -*- coding: utf-8 -*-
+import scrapy
+from ydyspider.mysql import msyqlHelper
+import json
+
+
+class A7louSpider(scrapy.Spider):
+	name = '7lou'
+	allowed_domains = ['zwapi.ycsd.cn']
+	base_url = 'http://zwapi.ycsd.cn/api/book'
+
+	def start_requests(self):
+		url = self.base_url + '/booklist?cp=zw-7lou'
+		yield scrapy.Request(url, callback=self.parse)
+
+	def parse(self, response):
+		result = self.json_encode(response.text)
+		mysql = msyqlHelper()
+		for item in result['data']:
+			exist = mysql.getByZwSource(item['bid'])
+			if exist is not None:
+				continue
+			url = self.base_url + '/bookInfo/{}'.format(item['bid'])
+			yield scrapy.Request(url, callback=self.parse2, meta={"zw_id": item['bid'], "i":0})
+
+	def parse2(self, response):
+		mysql = msyqlHelper()
+		res = response.text
+		res = self.json_encode(res)
+		data = dict()
+		data['zw_bid'] = res['data']['bid']
+		data['name'] = res['data']['name']
+		data['author'] = res['data']['author']
+		data['intro'] = res['data']['intro']
+		data['cover'] = res['data']['cover']
+		data['category_name'] = res['data']['category_name']
+		data['category_id'] = 0
+		data['status'] = res['data']['status']
+		data['sequence'] = response.meta['i']
+		bid = mysql.insertZwBook(data)
+		mysql.close()
+		url = self.base_url + '/chapterlist/{}'.format(res['data']['bid'])
+		yield scrapy.Request(url, meta={"bid": bid, "book_id": res['data']['bid']}, callback=self.parse3)
+	
+	def parse3(self, response):
+		res = response.text
+		res = self.json_encode(res)
+		if res['code'] == 0:
+			for chapter in res['data']:
+				chapter['bid'] = response.meta['bid']
+				chapter['source_chapter_id'] = chapter['chapter_id']
+				url = self.base_url + '/chapterContent/{}/{}'.format(response.meta['book_id'], chapter['chapter_id'])
+				yield scrapy.Request(url, meta=chapter, callback=self.parse4)
+
+	def parse4(self, response):
+		res = response.text
+		res = self.json_encode(res)
+		if res['code'] == 0:
+			mysql = msyqlHelper()
+			meta = response.meta
+			data = dict()
+			data['bid'] = meta['bid']
+			data['name'] = res['data']['chapter_name']
+			data['sequence'] = meta['sequence']
+			data['size'] = meta['size']
+			data['is_vip'] = meta['is_vip']
+			data['prev_cid'] = 0
+			data['next_cid'] = 0
+			data['recent_update_at'] = meta['updated_at']
+			data['content'] = res['data']['content']
+			data['ly_chapter_id'] = meta['source_chapter_id']
+			mysql.inseraAll(data)
+			mysql.close()
+	
+	def json_encode(self,jsonstr):
+		return json.loads(jsonstr)

+ 4 - 0
ydyspider/spiders/__init__.py

@@ -0,0 +1,4 @@
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.

+ 87 - 0
ydyspider/spiders/wyy.py

@@ -0,0 +1,87 @@
+# -*- coding: utf-8 -*-
+import scrapy
+import json
+
+class WyySpider(scrapy.Spider):
+    name = 'wyy'
+    allowed_domains = ['m.emeixs.com']
+    # start_urls is repurposed as spider parameters: [bookid, {"bookid": ..., "allpage": ...}]
+    start_urls = ['10916', {"bookid": "10916", "allpage": 1}]
+
+    cookies = {
+            "CRC":"41ac5d8f45826eacd7d609e65ef11ce9",
+            "OPENID":"o2rtIwVmtewF74MDgUS2bjtw5r8w",
+            "VALIDON":"1546056916",
+            "admin_id":"91",
+            "pen_name":'Don',
+            "portrait":"http%3A%2F%2Fthirdwx.qlogo.cn%2Fmmopen%2FuchmtWQh7iarv9fB1SPnCCaTibra2HjCEIXsrFEp8bnoeNhialwORg1EHhyOoNicYIzzhhib4YrBPYKZialOTAtWBeyw%2F132",
+            'prid':'0',
+            'shell':'766bc63e269cc8ae07b22ece476e1134',
+            'subscribe':'1',
+            'uid':'2',
+            'user_id':'252323197',
+            'user_name':'we20181227',
+            'PHPSESSID':'n6manrv5a8gq2ai1boremc2tr0',
+            'getuserinfo':'1'
+        }
+
+    headers = {
+                "User-Agent":"Mozilla/5.0 (Linux; Android 8.0; MI 6 Build/OPR1.170623.027; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/57.0.2987.132 MQQBrowser/6.2 TBS/044405 Mobile Safari/537.36 MMWEBID/223 MicroMessenger/6.7.3.1360(0x2607033D) NetType/WIFI Language/zh_CN Process/tools",
+                "Origin":"https://m.emeixs.com",
+                "Content-Type":"application/x-www-form-urlencoded; charset=UTF-8",
+                "X-Requested-With":"XMLHttpRequest",
+                "Referer":"https://m.emeixs.com/chapter/12514/0/fromaid/91.html"
+        }
+            
+    def start_requests(self):
+        url = 'https://m.emeixs.com/Moreinfo/nextchapter/fromaid/91.html'
+        meta = {"bid":self.start_urls[0]}
+        for i in range(self.start_urls[1]['allpage']):
+            page = i+1
+            body = 'bookid={bookid}&page={page}&paixu={paixu}'.format(bookid=self.start_urls[1]['bookid'], page=page, paixu='asc')
+            yield scrapy.Request(url, headers=self.headers, callback=self.parselist, meta=meta, cookies=self.cookies, method='POST', body=body)
+            
+    def parselist(self, response):
+        result = response.xpath('//a')
+        bookid = self.start_urls[0]
+        i = 1
+        for a in result:
+            href = a.xpath('@href').extract_first()
+            a_list = href.split('/')
+            cid = a_list[3]
+            meta = {'sequence': cid}
+            i = i + 1
+            if i > 3:
+                break
+            cid = i + 1
+            # alternate endpoint kept for reference:
+            # https://m.emeixs.com/ChapterContent/collection/fromaid/91.html?bookid=...&num=...
+            url = 'https://m.emeixs.com/ChapterContent/content/fromaid/91.html?bookid={bookid}&num={sequence}'.format(bookid=bookid, sequence=cid)
+            self.logger.info('url is: ' + url)
+            yield scrapy.Request(url, headers=self.headers, meta=meta, callback=self.parsecontent, cookies=self.cookies)
+
+    def parsecontent(self, response):
+        self.logger.info(response.text)
+
+    def parse2(self, response):
+        self.logger.info(response.text)
+
+    def json_decode(self, jsonstr):
+        return json.loads(jsonstr)

+ 71 - 0
ydyspider/spiders/ycsd.py

@@ -0,0 +1,71 @@
+# -*- coding: utf-8 -*-
+import scrapy
+from ydyspider.mysql import msyqlHelper
+import json
+import time
+
+
+class YcsdSpider(scrapy.Spider):
+    name = 'ycsd'
+    allowed_domains = ['cp.yqsd.cn']
+    base_url = 'http://cp.yqsd.cn/cp/booksource/?mcp=zhuishuyun&'
+
+    def start_requests(self):
+        bid_t = self.bid
+        bid_list = bid_t.split(',')
+        for ids in bid_list:
+            url = self.base_url + 'method=bookinfo&bid={}'.format(ids)
+            yield scrapy.Request(url, callback=self.parse2, meta={"ly_bid": ids, "i": 0})
+
+    def parse2(self, response):
+        mysql = msyqlHelper()
+        res = response.text
+        res = self.json_encode(res)
+        data = dict()
+        data['ly_bid'] = res['data']['book_id']
+        data['source_name'] = 'ycsd'
+        data['name'] = res['data']['book_name']
+        data['author'] = res['data']['book_author']
+        data['intro'] = res['data']['introduction']
+        data['cover'] = res['data']['cover_url']
+        data['category_name'] = res['data']['book_category_name']
+        data['category_id'] = res['data']['book_category_id']
+        data['status'] = res['data']['book_state']
+        data['sequence'] = response.meta['i']
+        bid = mysql.insertbookV2(data)
+        mysql.close()
+        url = self.base_url + 'method=chapterList&bid={}'.format(res['data']['book_id'])
+        yield scrapy.Request(url, meta={"bid": bid, "book_id": res['data']['book_id']}, callback=self.parse3)
+
+    def parse3(self, response):
+        res = response.text
+        res = self.json_encode(res)
+        for chapter in res['data']:
+            chapter['bid'] = response.meta['bid']
+            bid = response.meta['book_id']
+            url = self.base_url + 'method=chapter&bid={}&cid={}'.format(bid, chapter['chapter_id'])
+            yield scrapy.Request(url, meta=chapter, callback=self.parse4)
+
+    def parse4(self, response):
+        res = response.text
+        res = self.json_encode(res)
+        mysql = msyqlHelper()
+        meta = response.meta
+        data = dict()
+        data['bid'] = meta['bid']
+        data['name'] = meta['chapter_name']
+        data['sequence'] = meta['chapter_order_number'] + 1
+        data['size'] = len(res['data']['chapter_content'])
+        data['is_vip'] = meta['chapter_need_pay']
+        data['prev_cid'] = 0
+        data['next_cid'] = 0
+        data['recent_update_at'] = time.strftime("%Y-%m-%d %H:%M:%S",
+                                                 time.localtime(meta['chapter_last_update_time']))
+        data['content'] = res['data']['chapter_content']
+        data['ly_chapter_id'] = res['data']['chapter_id']
+        mysql.inseraAll(data)
+        mysql.close()
+
+    def json_encode(self, jsonstr):
+        return json.loads(jsonstr)
+
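
ycsd takes the source book ids through the bid spider argument (read as self.bid in start_requests), so a run presumably looks like this (ids illustrative):

    scrapy crawl ycsd -a bid=101,102,103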

+ 72 - 0
ydyspider/spiders/ydy.py

@@ -0,0 +1,72 @@
+# -*- coding: utf-8 -*-
+import scrapy
+from ydyspider.mysql import msyqlHelper
+import json
+import time
+
+class YunduyunSpider(scrapy.Spider):
+	name = 'ydy'
+	allowed_domains = ['leyuee.com']
+	start_urls = ['http://www.leyuee.com/services/zwfx.aspx?method=booklist&token=sefaf23h7face']
+
+	def start_requests(self):
+		self.crawler.stats.set_value('bid_list', [])
+		bid_t = self.bid
+		bid_list = bid_t.split(',')
+		for ids in bid_list:
+			yield scrapy.Request("http://www.leyuee.com/services/zwfx.aspx?method=bookinfo&token=sefaf23h7face&bid=%s" % ids, callback=self.parse2, meta={"ly_bid": ids, "i": 0})
+
+
+	def parse2(self,response):
+		mysql = msyqlHelper()
+		res = response.text
+		res = self.json_encode(res)
+		data = dict()
+		data['ly_bid'] = res['data']['book_id']
+		data['name'] = res['data']['book_name']
+		data['author'] = res['data']['book_author']
+		data['intro'] = res['data']['introduction']
+		data['cover'] = res['data']['cover_url']
+		data['category_name'] = res['data']['book_tags']
+		data['category_id'] = res['data']['book_category_id']
+		data['status'] = res['data']['book_state']
+		data['sequence'] = response.meta['i']
+		bid = mysql.insertbook(data)
+		self.crawler.stats.get_value('bid_list').append(bid)
+		mysql.close()
+		self.logger.info(data)
+		yield scrapy.Request("http://www.leyuee.com/services/zwfx.aspx?method=chapterlist&bid=%s&token=sefaf23h7face" % res['data']['book_id'],meta={"bid":bid,"book_id":res['data']['book_id']},callback=self.parse3)
+	
+	def parse3(self, response):
+		res = response.text
+		res = self.json_encode(res)
+		if res['code'] == 200:
+			for volume in res['data']:
+				for chapter in volume['chapters']:
+					chapter['bid'] = response.meta['bid']
+					yield scrapy.Request('http://www.leyuee.com/services/zwfx.aspx?method=chapter&bid=%s&cid=%s&token=sefaf23h7face' % (response.meta['book_id'],chapter['chapter_id']),meta=chapter,callback=self.parse4)
+	
+	def parse4(self, response):
+		res = response.text
+		res = self.json_encode(res)
+		if res['code'] == 200:
+			mysql = msyqlHelper()
+			meta = response.meta
+			data = dict()
+			data['bid'] = meta['bid']
+			data['name'] = meta['chapter_name']
+			data['sequence'] = meta['chapter_order_number']+1
+			data['size'] = len(res['data']['chapter_content'])
+			data['is_vip'] = meta['chapter_need_pay']
+			data['prev_cid'] = 0
+			data['next_cid'] = 0
+			data['recent_update_at'] = time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(meta['chapter_last_update_time']))
+			data['content'] = res['data']['chapter_content']
+			data['ly_chapter_id'] = res['data']['chapter_id']
+			mysql.inseraAll(data)
+			mysql.close()
+	
+	def json_encode(self,jsonstr):
+		return json.loads(jsonstr)
+

+ 79 - 0
ydyspider/spiders/yunduyun.py

@@ -0,0 +1,79 @@
+# -*- coding: utf-8 -*-
+import scrapy
+from ydyspider.mysql import msyqlHelper
+import json
+import time
+
+class YunduyunSpider(scrapy.Spider):
+	name = 'yunduyun'
+	allowed_domains = ['leyuee.com']
+	start_urls = ['http://www.leyuee.com/services/zwfx.aspx?method=booklist&token=sefaf23h7face']
+
+	def parse(self, response):
+		res = response.text
+		res = self.json_encode(res)
+		self.logger.info(res)
+		i = 0
+		mysql = msyqlHelper()
+
+		for item in res['data']:
+			if item['book_id'] <= 1501:
+				continue
+			exist = mysql.selectbylyid(item['book_id'])
+			if exist is not None:
+				self.logger.info(exist)
+				continue
+			yield scrapy.Request("http://www.leyuee.com/services/zwfx.aspx?method=bookinfo&token=sefaf23h7face&bid=%s" % item['book_id'],callback=self.parse2,meta={"ly_bid":item['book_id'],"i":i})
+
+	def parse2(self,response):
+		mysql = msyqlHelper()
+		res = response.text
+		res = self.json_encode(res)
+		data = dict()
+		data['ly_bid'] = res['data']['book_id']
+		data['name'] = res['data']['book_name']
+		data['author'] = res['data']['book_author']
+		data['intro'] = res['data']['introduction']
+		data['cover'] = res['data']['cover_url']
+		data['category_name'] = res['data']['book_tags']
+		data['category_id'] = res['data']['book_category_id']
+		data['status'] = res['data']['book_state']
+		data['sequence'] = response.meta['i']
+		bid = mysql.insertbook(data)
+		mysql.close()
+		self.logger.info(data)
+		yield scrapy.Request("http://www.leyuee.com/services/zwfx.aspx?method=chapterlist&bid=%s&token=sefaf23h7face" % res['data']['book_id'],meta={"bid":bid,"book_id":res['data']['book_id']},callback=self.parse3)
+	
+	def parse3(self, response):
+		res = response.text
+		res = self.json_encode(res)
+		if res['code'] == 200:
+			for volume in res['data']:
+				for chapter in volume['chapters']:
+					chapter['bid'] = response.meta['bid']
+					yield scrapy.Request('http://www.leyuee.com/services/zwfx.aspx?method=chapter&bid=%s&cid=%s&token=sefaf23h7face' % (response.meta['book_id'],chapter['chapter_id']),meta=chapter,callback=self.parse4)
+	
+	def parse4(self, response):
+		res = response.text
+		res = self.json_encode(res)
+		if res['code'] == 200:
+			mysql = msyqlHelper()
+			meta = response.meta
+			data = dict()
+			data['bid'] = meta['bid']
+			data['name'] = meta['chapter_name']
+			data['sequence'] = meta['chapter_order_number']+1
+			data['size'] = len(res['data']['chapter_content'])
+			data['is_vip'] = meta['chapter_need_pay']
+			data['prev_cid'] = 0
+			data['next_cid'] = 0
+			data['recent_update_at'] = time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(meta['chapter_last_update_time']))
+			data['content'] = res['data']['chapter_content']
+			data['ly_chapter_id'] = res['data']['chapter_id']
+			mysql.inseraAll(data)
+			mysql.close()
+	
+	def json_encode(self,jsonstr):
+		return json.loads(jsonstr)

+ 72 - 0
ydyspider/spiders/zwcontentSpider.py

@@ -0,0 +1,72 @@
+# -*- coding: utf-8 -*-
+import scrapy
+from ydyspider.mysql import msyqlHelper
+import json
+
+
+class zwcontentSpider(scrapy.Spider):
+    name = 'zwcontent'
+    allowed_domains = ['cp.yqsd.cn']
+    base_url = 'http://cp.yqsd.cn/api/book'
+
+    def start_requests(self):
+        self.crawler.stats.set_value('bid_list', [])
+        param = self.bid
+        bid_list = param.split(',')
+        for bid in bid_list:
+            url = self.base_url + '/bookInfo/{}'.format(bid)
+            yield scrapy.Request(url, callback=self.parse2, meta={"zw_id": bid, "i": 0})
+
+    def parse2(self, response):
+        mysql = msyqlHelper()
+        res = response.text
+        res = self.json_encode(res)
+        data = dict()
+        data['zw_bid'] = res['data']['bid']
+        data['source_name'] = 'zw_content'
+        data['name'] = res['data']['name']
+        data['author'] = res['data']['author']
+        data['intro'] = res['data']['intro']
+        data['cover'] = res['data']['cover']
+        data['category_name'] = res['data']['category_name']
+        data['category_id'] = 0
+        data['status'] = res['data']['status']
+        data['sequence'] = response.meta['i']
+        bid = mysql.insertZwBook(data)
+        self.crawler.stats.get_value('bid_list').append(bid)
+        mysql.close()
+        url = self.base_url + '/chapterlist/{}'.format(res['data']['bid'])
+        yield scrapy.Request(url, meta={"bid": bid, "book_id": res['data']['bid']}, callback=self.parse3)
+
+    def parse3(self, response):
+        res = response.text
+        res = self.json_encode(res)
+        if res['code'] == 0:
+            for chapter in res['data']:
+                chapter['bid'] = response.meta['bid']
+                chapter['source_chapter_id'] = chapter['chapter_id']
+                url = self.base_url + '/chapterContent/{}/{}'.format(response.meta['book_id'], chapter['chapter_id'])
+                yield scrapy.Request(url, meta=chapter, callback=self.parse4)
+
+    def parse4(self, response):
+        res = response.text
+        res = self.json_encode(res)
+        if res['code'] == 0:
+            mysql = msyqlHelper()
+            meta = response.meta
+            data = dict()
+            data['bid'] = meta['bid']
+            data['name'] = res['data']['chapter_name']
+            data['sequence'] = meta['sequence']
+            data['size'] = meta['size']
+            data['is_vip'] = meta['is_vip']
+            data['prev_cid'] = 0
+            data['next_cid'] = 0
+            data['recent_update_at'] = meta['updated_at']
+            data['content'] = res['data']['content']
+            data['ly_chapter_id'] = meta['source_chapter_id']
+            mysql.inseraAll(data)
+            mysql.close()
+
+    def json_encode(self, jsonstr):
+        return json.loads(jsonstr)
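
zwcontent follows the same -a bid convention and appends each inserted book id to the 'bid_list' crawler stat, which YdyspiderPipeline.close_spider then walks to run the php artisan book:afs post-processing per book. An illustrative invocation:

    scrapy crawl zwcontent -a bid=5001,5002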

+ 37 - 0
ydyspider/spiders/zwcontentlianshangSpider.py

@@ -0,0 +1,37 @@
+# -*- coding: utf-8 -*-
+import time
+
+import scrapy
+from ydyspider.mysql import msyqlHelper
+import json
+from . import zwcontentSpider
+
+
+class zwcontentlianshangSpider(zwcontentSpider.zwcontentSpider):
+    name = 'zwcontentlianshang'
+    allowed_domains = ['cp.yqsd.cn']
+    base_url = 'http://cp.yqsd.cn/api/book'
+    custom_settings = {
+        'DOWNLOAD_DELAY': 0.01,
+        'LOG_FILE': 'ydyspider/log/' + name + time.strftime("%Y-%m-%d", time.localtime()) + '.log'
+    }
+
+    def start_requests(self):
+        self.crawler.stats.set_value('bid_list', [])
+        mysql = msyqlHelper()
+        last_book = mysql.getLianshang()
+        bid = last_book['zw_id']
+        start_url = self.base_url + '/booklist/lianshang/{}'.format(bid)
+        yield scrapy.Request(start_url, callback=self.parse_book_list)
+
+    def parse_book_list(self, response):
+        result = json.loads(response.text)
+        if result.get('data') is not None:
+            i = 0
+            for item in result['data']:
+                i = i+1
+                if i > 1000:
+                    break
+                bid = item['id']
+                url = self.base_url + '/bookInfo/{}'.format(bid)
+                yield scrapy.Request(url, callback=self.parse2, meta={"zw_id": bid, "i": 0})

+ 74 - 0
ydyspider/spiders/zycontentSpider.py

@@ -0,0 +1,74 @@
+# -*- coding: utf-8 -*-
+import scrapy
+from ydyspider.mysql import msyqlHelper
+import json
+
+
+class zycontentSpider(scrapy.Spider):
+    name = 'zycontent'
+    allowed_domains = ['121.37.183.29']
+    query = '?channel_name=zhuishuyun&channel_key=123456'
+    base_url = 'http://121.37.183.29:8093/api/output'
+
+    def start_requests(self):
+        self.crawler.stats.set_value('bid_list', [])
+        param = self.bid
+        bid_list = param.split(',')
+        for bid in bid_list:
+            url = self.base_url + '/bookdetail/{}'.format(bid) + self.query
+            yield scrapy.Request(url, callback=self.parse2, meta={"zhiyu_book_id": bid, "i": 0})
+
+    def parse2(self, response):
+        mysql = msyqlHelper()
+        res = response.text
+        res = self.json_encode(res)
+        if res['code'] == 10000:
+            data = dict()
+            data['zhiyu_book_id'] = res['data']['bid']
+            data['source_name'] = 'zy_content'
+            data['name'] = res['data']['book_name']
+            data['author'] = res['data']['author']
+            data['intro'] = res['data']['Introduction']
+            data['cover'] = res['data']['cover']
+            data['category_name'] = res['data']['category_name']
+            data['category_id'] = res['data']['category_id']
+            data['status'] = res['data']['status']
+            data['sequence'] = response.meta['i']
+            bid = mysql.insertZyBook(data)
+            self.crawler.stats.get_value('bid_list').append(bid)
+            mysql.close()
+            url = self.base_url + '/chapterlist/{}'.format(res['data']['bid']) + self.query
+            yield scrapy.Request(url, meta={"bid": bid, "book_id": res['data']['bid']}, callback=self.parse3)
+
+    def parse3(self, response):
+        res = response.text
+        res = self.json_encode(res)
+        if res['code'] == 10000:
+            for chapter in res['data']:
+                chapter['bid'] = response.meta['bid']
+                chapter['source_chapter_id'] = chapter['chapter_id']
+                url = self.base_url + '/chaptercontent/{}/chapterid/{}'.format(response.meta['book_id'], chapter['chapter_id']) + self.query
+                yield scrapy.Request(url, meta=chapter, callback=self.parse4)
+
+    def parse4(self, response):
+        res = response.text
+        res = self.json_encode(res)
+        if res['code'] == 10000:
+            mysql = msyqlHelper()
+            meta = response.meta
+            data = dict()
+            data['bid'] = meta['bid']
+            data['name'] = res['data']['chapter_name']
+            data['sequence'] = meta['sequence']
+            data['size'] = meta['size']
+            data['is_vip'] = meta['is_vip']
+            data['prev_cid'] = 0
+            data['next_cid'] = 0
+            data['recent_update_at'] = meta['updated_at']
+            data['content'] = res['data']['chapter_content']
+            data['ly_chapter_id'] = meta['source_chapter_id']
+            mysql.inseraAll(data)
+            mysql.close()
+
+    def json_encode(self, jsonstr):
+        return json.loads(jsonstr)