zz 3 years ago
parent
commit
53eb588352
4 changed files with 612 additions and 0 deletions
  1. +418 -0 ydyspider/baseSpider.py
  2. +35 -0 ydyspider/items.py
  3. +141 -0 ydyspider/mysqlHelper.py
  4. +18 -0 ydyspider/pipelines.py

+ 418 - 0
ydyspider/baseSpider.py

@@ -0,0 +1,418 @@
+# -*- coding: utf-8 -*-
+
+import time
+import scrapy
+from ydyspider.items import BookInfoItem, ChapterItem
+from ydyspider.mysqlHelper import MysqlHelper
+import hashlib
+import random
+
+from ydyspider.pipelines import formatcontent, removePunctuation
+
+
+def md5(token):
+    m = hashlib.md5()
+    m.update(token.encode('utf-8'))
+    return m.hexdigest()
+
+
+def sha1(token):
+    m = hashlib.sha1()
+    m.update(token.encode('utf-8'))
+    return m.hexdigest()
+
+
+def random_str(slen=10):
+    seed = "1234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
+    sa = []
+    for i in range(slen):
+        sa.append(random.choice(seed))
+    return ''.join(sa)
+
+
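+# Template-method base class. The crawl flow is:
+#   book list -> book info -> chapter list -> chapter content,
+# with the site-specific URLs and parsing supplied by subclasses
+# through the get_* / *_result hooks declared at the end of the class.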
+class baseSpider(scrapy.Spider):
+    name = ''
+    allowed_domains = []
+    base_url = ''
+    source = ''
+    source_name = ''
+
+    def __init__(self, host, user, password, db, stats):
+        scrapy.Spider.__init__(self)
+        source = self.source
+        self.mysqlHelper = MysqlHelper(host=host, user=user, password=password, db=db, source=source)
+        self.__stats = stats
+        self.__source = self.source
+        self.__stats.set_value('bid_list', [])
+        self.__stats.set_value('spider_type', 'add')
+
+    @classmethod
+    def from_crawler(cls, crawler, *args, **kwargs):
+        settings = crawler.settings
+        host = settings.get('MYSQL_HOST')
+        user = settings.get('MYSQL_USER')
+        password = settings.get('MYSQL_PWD')
+        db = settings.get('MYSQL_DB')
+
+        return cls(host=host, user=user, password=password, db=db, stats=crawler.stats)
+
+    def start_requests(self):
+        yield scrapy.Request(self.get_start_url(), callback=self.parse_book_list)
+
+    def parse_book_list(self, response):
+        book_list = self.bid_list_result(response)
+        for item in book_list:
+            bid = item['id']
+            # skip books this source has already stored
+            exists = self.mysqlHelper.get_book_info_by_source(bid)
+            if exists is not None:
+                continue
+            url = self.get_book_info_url(bid)
+            yield scrapy.Request(url, callback=self.parse_book_info)
+
+    def parse_book_info(self, response):
+        if response.text == '':
+            return None
+        now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+        result = self.book_info_result(response)
+        if result is None:
+            return None
+        source_bid = result.get('bid')
+        book_info_item = BookInfoItem()
+        book_info_item['source_bid'] = source_bid
+        book_info_item['name'] = result['name']
+        book_info_item['author'] = result['author']
+        book_info_item['intro'] = result['intro']
+        book_info_item['cover'] = result['cover']
+        book_info_item['keyword'] = result['keyword']
+        book_info_item['category_id'] = 0 if result.get('category_id') is None else result.get('category_id')
+        book_info_item['status'] = result['status']
+        book_info_item['chapter_count'] = 0
+        book_info_item['first_cid'] = 0
+        book_info_item['last_cid'] = 0
+        book_info_item['size'] = 0
+        book_info_item['last_chapter'] = ''
+        book_info_item['category_name'] = result['category']
+        book_info_item['source'] = self.source
+        book_info_item['updated_at'] = now
+        book_info_item['created_at'] = now
+        bid = self.mysqlHelper.insert_book(book_info_item)
+        self.__stats.get_value('bid_list').append(bid)
+        url = self.get_chapter_list_url(source_bid)
+        meta = {'bid': bid, 'source_bid': source_bid}
+        yield scrapy.Request(url, self.parse_chapter_list, meta=meta)
+
+    def parse_chapter_list(self, response):
+        if response.text == '':
+            return None
+        result = self.chapter_list_result(response)
+        if result is None:
+            return None
+        bid = response.meta['bid']
+        source_bid = response.meta['source_bid']
+        for chapter_item in result:
+            meta = chapter_item
+            cid = chapter_item['source_chapter_id']
+            meta['bid'] = bid
+            url = self.get_chapter_content_url(source_bid, cid)
+            yield scrapy.Request(url, self.parse_chapter_content, meta=meta)
+
+    def parse_chapter_content(self, response):
+        if response.text == '':
+            return None
+        now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+        result = self.chapter_content_result(response)
+        if result is None:
+            return None
+        meta = response.meta
+        chapter_item = ChapterItem()
+        chapter_item['bid'] = meta['bid']
+        chapter_item['name'] = meta['name']
+        chapter_item['sequence'] = meta['sequence']
+        chapter_item['is_vip'] = meta['is_vip']
+        chapter_item['prev_cid'] = 0
+        chapter_item['next_cid'] = 0
+        chapter_item['recent_update_at'] = meta['recent_update_at']
+        content = formatcontent(result['content'])
+        chapter_item['content'] = content
+        chapter_item['size'] = len(removePunctuation(content))
+        chapter_item['chapter_content_id'] = 0
+        chapter_item['source_chapter_id'] = meta['source_chapter_id']
+        chapter_item['created_at'] = now
+        chapter_item['updated_at'] = now
+        if result.get('size') is not None:
+            chapter_item['size'] = result.get('size')
+        if result.get('is_vip') is not None:
+            chapter_item['is_vip'] = result.get('is_vip')
+        if result.get('name') is not None:
+            chapter_item['name'] = result.get('name')
+        if result.get('recent_update_at') is not None:
+            chapter_item['recent_update_at'] = result.get('recent_update_at')
+        if result.get('source_chapter_id') is not None:
+            chapter_item['source_chapter_id'] = result.get('source_chapter_id')
+        yield chapter_item
+
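+    # The hooks below are the per-site extension points; every concrete
+    # subclass must implement all of them.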
+    def get_start_url(self):
+        raise NotImplementedError
+
+    def bid_list_result(self, response):
+        raise NotImplementedError
+
+    def get_book_info_url(self, bid):
+        raise NotImplementedError
+
+    def book_info_result(self, response):
+        raise NotImplementedError
+
+    def get_chapter_list_url(self, bid):
+        raise NotImplementedError
+
+    def chapter_list_result(self, response):
+        raise NotImplementedError
+
+    def get_chapter_content_url(self, bid, cid):
+        raise NotImplementedError
+
+    def chapter_content_result(self, response):
+        raise NotImplementedError
+
+
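+# Incremental variant of baseSpider: instead of discovering new books it
+# re-reads every stored book for this source and fetches only the chapters
+# that come after the last one already in the database.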
+class baseUpdateSpider(scrapy.Spider):
+    name = ''
+    allowed_domains = []
+    base_url = ''
+    source = ''
+    source_name = ''
+
+    def __init__(self, host, user, password, db, stats):
+        scrapy.Spider.__init__(self)
+        source = self.source
+        self.mysqlHelper = MysqlHelper(host=host, user=user, password=password, db=db, source=source)
+        self.__stats = stats
+        self.__stats.set_value('spider_type', 'update')
+        self.__stats.set_value('bid_list', [])
+        self.__is_first = True
+
+    @classmethod
+    def from_crawler(cls, crawler, *args, **kwargs):
+        settings = crawler.settings
+        host = settings.get('MYSQL_HOST')
+        user = settings.get('MYSQL_USER')
+        password = settings.get('MYSQL_PWD')
+        db = settings.get('MYSQL_DB')
+        return cls(host=host, user=user, password=password, db=db, stats=crawler.stats)
+
+    def start_requests(self):
+        book_list = self.mysqlHelper.get_need_update_book_list()
+        if book_list is not None:
+            for book in book_list:
+                url = self.get_chapter_list_url(book['source_bid'])
+                meta = {'bid': book['id'], 'source_bid': book['source_bid']}
+                yield scrapy.Request(url, callback=self.parse_chapter_list, meta=meta)
+
+    def parse_chapter_list(self, response):
+        if response.text == '':
+            return None
+        chapter_list = self.chapter_list_result(response)
+        meta = response.meta
+        if chapter_list is not None:
+            bid = response.meta.get('bid')
+            source_bid = response.meta.get('source_bid')
+            last_chapter = self.mysqlHelper.get_last_cid_by_bid(bid)
+            start = False
+            if last_chapter is None:
+                start = True
+                last_source_cid = 0
+                last_sequence = 0
+                last_chapter_id = 0
+            else:
+                last_source_cid = last_chapter['source_chapter_id']
+                last_sequence = last_chapter['sequence']
+                last_chapter_id = last_chapter['id']
+
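+            # Walk the source chapter list, skipping entries up to and
+            # including the last stored chapter; everything after that
+            # point is new and gets scheduled for download.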
+            has_new_chapter = False
+            for chapter_item in chapter_list:
+                if not start:
+                    if int(chapter_item['source_chapter_id']) == int(last_source_cid):
+                        start = True
+                    continue
+                if not has_new_chapter:
+                    self.__stats.get_value('bid_list').append(
+                        {"bid": meta['bid'], 'start': last_chapter_id})
+                    has_new_chapter = True
+                cid = chapter_item['source_chapter_id']
+                last_sequence = last_sequence + 1
+                if chapter_item['sequence'] == 0:
+                    chapter_item['sequence'] = last_sequence
+                meta = chapter_item
+                meta['bid'] = bid
+                url = self.get_chapter_content_url(source_bid, cid)
+                yield scrapy.Request(url, callback=self.parse_chapter_content, meta=meta)
+
+    def parse_chapter_content(self, response):
+        if response.text == '':
+            return None
+        now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+        result = self.chapter_content_result(response)
+        if result is None:
+            return None
+        meta = response.meta
+        chapter_item = ChapterItem()
+        chapter_item['bid'] = meta['bid']
+        chapter_item['name'] = meta['name']
+        chapter_item['sequence'] = meta['sequence']
+        chapter_item['is_vip'] = meta['is_vip']
+        chapter_item['prev_cid'] = 0
+        chapter_item['next_cid'] = 0
+        chapter_item['recent_update_at'] = meta['recent_update_at']
+        content = formatcontent(result['content'])
+        chapter_item['size'] = len(removePunctuation(content))
+        chapter_item['content'] = content
+        chapter_item['chapter_content_id'] = 0
+        chapter_item['source_chapter_id'] = meta['source_chapter_id']
+        chapter_item['created_at'] = now
+        chapter_item['updated_at'] = now
+        if result.get('is_vip') is not None:
+            chapter_item['is_vip'] = result.get('is_vip')
+        if result.get('name') is not None:
+            chapter_item['name'] = result.get('name')
+        if result.get('recent_update_at') is not None:
+            chapter_item['recent_update_at'] = result.get('recent_update_at')
+        if result.get('source_chapter_id') is not None:
+            chapter_item['source_chapter_id'] = result.get('source_chapter_id')
+        yield chapter_item
+
+    def get_chapter_list_url(self, bid):
+        raise NotImplementedError
+
+    def chapter_list_result(self, response):
+        raise NotImplementedError
+
+    def get_chapter_content_url(self, bid, cid):
+        raise NotImplementedError
+
+    def chapter_content_result(self, response):
+        raise NotImplementedError
+
+
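+# Repair spider: given a comma-separated id list (scrapy crawl ... -a bid=1,2),
+# it re-crawls those books and rewrites chapters in place, matching local
+# chapters by sequence; chapters missing locally are inserted as new items.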
+class fixChapterSpider(scrapy.Spider):
+    name = ''
+    source = ''
+
+    def __init__(self, host, user, password, db, bid_list, stats):
+        scrapy.Spider.__init__(self)
+        source = self.source
+        self.mysqlHelper = MysqlHelper(host=host, user=user, password=password, db=db, source=source)
+        self.__stats = stats
+        self.__is_first = True
+        self.bid_list = bid_list
+
+    @classmethod
+    def from_crawler(cls, crawler, *args, **kwargs):
+        settings = crawler.settings
+        host = settings.get('MYSQL_HOST')
+        user = settings.get('MYSQL_USER')
+        password = settings.get('MYSQL_PWD')
+        db = settings.get('MYSQL_DB')
+        bid = kwargs.get('bid')
+        if bid is not None:
+            bid_list = bid.split(',')
+        else:
+            bid_list = []
+        return cls(host=host, user=user, password=password, db=db, bid_list=bid_list, stats=crawler.stats)
+
+    def start_requests(self):
+        if not self.bid_list:
+            # no bids passed in, nothing to fix
+            return
+        for book in self.bid_list:
+            info = self.mysqlHelper.get_book_info_by_id(book)
+            if info is None:
+                continue
+            url = self.get_chapter_list_url(info['source_bid'])
+            meta = {'bid': book, 'source_bid': info['source_bid']}
+            yield scrapy.Request(url, callback=self.parse_chapter_list, meta=meta)
+
+    def parse_chapter_list(self, response):
+        if response.text == '':
+            return None
+        chapter_list = self.chapter_list_result(response)
+        if chapter_list is not None:
+            bid = response.meta.get('bid')
+            source_bid = response.meta.get('source_bid')
+            last_sequence = 0
+            for chapter_item in chapter_list:
+                last_sequence = last_sequence + 1
+
+                if chapter_item['sequence'] == 0:
+                    chapter_item['sequence'] = last_sequence
+                chapter_info = self.mysqlHelper.get_cid_by_bid_sequence(bid, last_sequence)
+                cid = chapter_item['source_chapter_id']
+
+                meta = chapter_item
+                if chapter_info is not None:
+                    meta['type'] = 'update'
+                    meta['chapter_content_id'] = chapter_info['chapter_content_id']
+                    meta['cid'] = chapter_info['id']
+                meta['bid'] = bid
+                url = self.get_chapter_content_url(source_bid, cid)
+                yield scrapy.Request(url, callback=self.parse_chapter_content, meta=meta)
+
+    def parse_chapter_content(self, response):
+        if response.text == '':
+            return None
+        now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+        result = self.chapter_content_result(response)
+        if result is None:
+            return None
+        meta = response.meta
+        content = formatcontent(result['content'])
+        meta['size'] = len(removePunctuation(content))
+        meta['content'] = content
+        if result.get('size') is not None:
+            meta['size'] = result.get('size')
+        if result.get('is_vip') is not None:
+            meta['is_vip'] = result.get('is_vip')
+        if result.get('name') is not None:
+            meta['name'] = result.get('name')
+        if result.get('recent_update_at') is not None:
+            meta['recent_update_at'] = result.get('recent_update_at')
+        if result.get('source_chapter_id') is not None:
+            meta['source_chapter_id'] = result.get('source_chapter_id')
+        if meta.get('type') is not None:
+            self.mysqlHelper.update_content(meta['chapter_content_id'], meta['name'], meta['content'])
+            self.mysqlHelper.update_chapter(meta)
+        else:
+            chapter_item = ChapterItem()
+            chapter_item['bid'] = meta['bid']
+            chapter_item['name'] = meta['name']
+            chapter_item['sequence'] = meta['sequence']
+            chapter_item['size'] = meta['size']
+            chapter_item['is_vip'] = meta['is_vip']
+            chapter_item['prev_cid'] = 0
+            chapter_item['next_cid'] = 0
+            chapter_item['recent_update_at'] = meta['recent_update_at']
+            chapter_item['content'] = content
+            if meta.get('chapter_content_id') is not None:
+                chapter_item['chapter_content_id'] = meta['chapter_content_id']
+            else:
+                chapter_item['chapter_content_id'] = 0
+            chapter_item['source_chapter_id'] = meta['source_chapter_id']
+            chapter_item['created_at'] = now
+            chapter_item['updated_at'] = now
+            if result.get('size') is not None:
+                chapter_item['size'] = result.get('size')
+            if result.get('is_vip') is not None:
+                chapter_item['is_vip'] = result.get('is_vip')
+            if result.get('name') is not None:
+                chapter_item['name'] = result.get('name')
+            if result.get('recent_update_at') is not None:
+                chapter_item['recent_update_at'] = result.get('recent_update_at')
+            if result.get('source_chapter_id') is not None:
+                chapter_item['source_chapter_id'] = result.get('source_chapter_id')
+            yield chapter_item
+
+    def get_chapter_list_url(self, bid):
+        raise NotImplementedError
+
+    def chapter_list_result(self, response):
+        raise NotImplementedError
+
+    def get_chapter_content_url(self, bid, cid):
+        raise NotImplementedError
+
+    def chapter_content_result(self, response):
+        raise NotImplementedError

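Below is a minimal sketch of what a concrete site spider built on baseSpider could look like. It is not part of this commit: the spider name, domain, URLs, and JSON shapes are all hypothetical, chosen only to show which hooks a subclass must fill in and which keys the parse_* callbacks expect.

# -*- coding: utf-8 -*-
# Hypothetical subclass of baseSpider; every URL and field below is illustrative.
import json

from ydyspider.baseSpider import baseSpider


class exampleSpider(baseSpider):
    name = 'example'
    allowed_domains = ['api.example.com']
    base_url = 'https://api.example.com'
    source = 'example'
    source_name = 'Example'

    def get_start_url(self):
        return self.base_url + '/books'

    def bid_list_result(self, response):
        # parse_book_list reads item['id'] from each entry
        return json.loads(response.text)

    def get_book_info_url(self, bid):
        return '{}/book/{}'.format(self.base_url, bid)

    def book_info_result(self, response):
        # parse_book_info expects: bid, name, author, intro, cover,
        # keyword, status, category (and optionally category_id)
        return json.loads(response.text)

    def get_chapter_list_url(self, bid):
        return '{}/book/{}/chapters'.format(self.base_url, bid)

    def chapter_list_result(self, response):
        # each entry needs: source_chapter_id, name, sequence,
        # is_vip, recent_update_at
        return json.loads(response.text)

    def get_chapter_content_url(self, bid, cid):
        return '{}/book/{}/chapter/{}'.format(self.base_url, bid, cid)

    def chapter_content_result(self, response):
        # parse_chapter_content reads result['content'] at minimum
        return json.loads(response.text)
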
+ 35 - 0
ydyspider/items.py

@@ -12,3 +12,38 @@ class YdyspiderItem(scrapy.Item):
     # define the fields for your item here like:
     # name = scrapy.Field()
     pass
+
+class BookInfoItem(scrapy.Item):
+    source_bid = scrapy.Field()
+    source = scrapy.Field()
+    name = scrapy.Field()
+    author = scrapy.Field()
+    intro = scrapy.Field()
+    cover = scrapy.Field()
+    keyword = scrapy.Field()
+    category_id = scrapy.Field()
+    status = scrapy.Field()
+    chapter_count = scrapy.Field()
+    first_cid = scrapy.Field()
+    last_cid = scrapy.Field()
+    size = scrapy.Field()
+    last_chapter = scrapy.Field()
+    category_name = scrapy.Field()
+    updated_at = scrapy.Field()
+    created_at = scrapy.Field()
+
+
+class ChapterItem(scrapy.Item):
+    bid = scrapy.Field()
+    name = scrapy.Field()
+    sequence = scrapy.Field()
+    size = scrapy.Field()
+    is_vip = scrapy.Field()
+    prev_cid = scrapy.Field()
+    next_cid = scrapy.Field()
+    recent_update_at = scrapy.Field()
+    content = scrapy.Field()
+    chapter_content_id = scrapy.Field()
+    source_chapter_id = scrapy.Field()
+    created_at = scrapy.Field()
+    updated_at = scrapy.Field()

+ 141 - 0
ydyspider/mysqlHelper.py

@@ -0,0 +1,141 @@
+# -*- coding: utf-8 -*-
+import time
+
+import pymysql.cursors
+
+
+class MysqlHelper(object):
+    def __init__(self, host, user, password, db, source):
+        self.__conn = pymysql.connect(host=host, user=user, password=password, db=db, charset='utf8mb4',
+                                      cursorclass=pymysql.cursors.DictCursor)
+        self.source = source
+
+    def get_connection(self):
+        return self.__conn
+
+    def get_book_info_by_source(self, source_bid):
+        sql = 'select id from books where source_bid = %s and source = %s'
+        with self.__conn.cursor() as cursor:
+            cursor.execute(sql, (source_bid, self.source))
+            result = cursor.fetchone()
+        self.__conn.commit()
+        return result
+
+    def get_need_update_book_list(self):
+        sql = 'select id,source_bid from books where source=%s and `status` = 0'
+        with self.__conn.cursor() as cursor:
+            cursor.execute(sql, (self.source, ))
+            result = cursor.fetchall()
+        self.__conn.commit()
+        return result
+
+    def get_last_cid_by_bid(self, bid):
+        sql = "select id,bid,`name`,sequence,source_chapter_id from chapters where bid = %s" \
+              " order by sequence desc limit 1"
+        with self.__conn.cursor() as cursor:
+            cursor.execute(sql, (int(bid), ))
+            result = cursor.fetchone()
+        self.__conn.commit()
+        return result
+
+    def insert_book(self, item):
+        sql = '''
+        insert into books(source_bid, `name`, author, intro, cover, keyword, category_id, status,
+        chapter_count, first_cid, last_cid, `size`, last_chapter, category_name, source, updated_at, created_at)
+        values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
+        '''
+        with self.__conn.cursor() as cursor:
+            cursor.execute(sql, (item.get('source_bid'),
+                                 item.get('name'),
+                                 item.get('author'),
+                                 item.get('intro'),
+                                 item.get('cover'),
+                                 item.get('keyword'),
+                                 item.get('category_id'),
+                                 item.get('status'),
+                                 item.get('chapter_count'),
+                                 item.get('first_cid'),
+                                 item.get('last_cid'),
+                                 item.get('size'),
+                                 item.get('last_chapter'),
+                                 item.get('category_name'),
+                                 item.get('source'),
+                                 item.get('updated_at'),
+                                 item.get('created_at')
+                                 ))
+            bid = int(cursor.lastrowid)
+        self.__conn.commit()
+        return bid
+
+    def insert_chapter(self, item):
+        chapter_content_id = self.insert_content(item)
+        sql = "INSERT INTO `chapters` (`bid`, `name`,`sequence`,`size`,`is_vip`,`prev_cid`,`next_cid`,`recent_update_at`,`created_at`,`updated_at`,`chapter_content_id`,source_chapter_id) " \
+              "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
+        with self.__conn.cursor() as cursor:
+            cursor.execute(sql, (
+                item['bid'], item['name'], item['sequence'], item['size'], item['is_vip'], item['prev_cid'],
+                item['next_cid'], item['recent_update_at'], item['created_at'], item['updated_at'], chapter_content_id,
+                item['source_chapter_id']))
+            cid = int(cursor.lastrowid)
+        self.__conn.commit()
+        return cid
+
+    def insert_content(self, item):
+        sql = "insert into chapter_contents (chapter_name,content,created_at,updated_at) values (%s,%s,%s,%s)"
+        with self.__conn.cursor() as cursor:
+            cursor.execute(sql, (
+                item['name'], item['content'], item['created_at'], item['updated_at']))
+            content_id = int(cursor.lastrowid)
+        self.__conn.commit()
+        return content_id
+
+    def get_book_list(self):
+        sql = "select id,source_bid from books where source=%s"
+        with self.__conn.cursor() as cursor:
+            cursor.execute(sql, (self.source,))
+            result = cursor.fetchall()
+        self.__conn.commit()
+        return result
+
+    def get_chapter_info_by_source_cid(self, bid, source_chapter_id):
+        sql = 'select id from chapters where bid = %s and source_chapter_id = %s'
+        with self.__conn.cursor() as cursor:
+            cursor.execute(sql, (bid, source_chapter_id))
+            result = cursor.fetchone()
+        self.__conn.commit()
+        return result
+
+    def get_book_info_by_id(self, bid):
+        sql = 'select source_bid from books where id = %s and source = %s'
+        with self.__conn.cursor() as cursor:
+            cursor.execute(sql, (int(bid), self.source))
+            result = cursor.fetchone()
+        self.__conn.commit()
+        return result
+
+    def get_cid_by_bid_sequence(self, bid, sequence):
+        sql = "select id,chapter_content_id from chapters where  bid = %s and sequence=%s"
+        with self.__conn.cursor() as cursor:
+            cursor.execute(sql, (int(bid), int(sequence)))
+            result = cursor.fetchone()
+        self.__conn.commit()
+        return result
+
+    def update_content(self, content_id, chapter_name, content):
+        now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+        sql = 'update chapter_contents set chapter_name=%s, content=%s, updated_at=%s where id=%s'
+        with self.__conn.cursor() as cursor:
+            cursor.execute(sql, (
+                chapter_name, content, now, int(content_id)))
+        self.__conn.commit()
+
+    def update_chapter(self, item):
+        now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+        sql = 'update chapters set `name`=%s,`sequence`=%s,`size`=%s,`is_vip`=%s,' \
+              'updated_at=%s,`source_chapter_id`=%s where id = %s'
+        with self.__conn.cursor() as cursor:
+            cid = int(item['cid'])
+            cursor.execute(sql, (
+                item['name'], item['sequence'], item['size'], item['is_vip'], now,
+                item['source_chapter_id'], cid))
+        self.__conn.commit()

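As a hedged usage sketch, MysqlHelper can also be exercised outside Scrapy, assuming a reachable MySQL server that already has the books/chapters/chapter_contents tables the queries above imply; the credentials here are placeholders.

from ydyspider.mysqlHelper import MysqlHelper

# placeholder credentials; the spiders normally read these from the
# MYSQL_HOST / MYSQL_USER / MYSQL_PWD / MYSQL_DB Scrapy settings
helper = MysqlHelper(host='127.0.0.1', user='root', password='secret',
                     db='novel', source='example')

# books with status 0 are the ones the update spider considers live
for book in helper.get_need_update_book_list():
    last = helper.get_last_cid_by_bid(book['id'])
    sequence = last['sequence'] if last else 0
    print(book['id'], book['source_bid'], sequence)
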
+ 18 - 0
ydyspider/pipelines.py

@@ -5,6 +5,24 @@
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
 import os
+import re
+
+
+def formatcontent(content):
+    content = content.replace(' ', '')
+    content = content.replace('<p>', '')
+    content = content.replace('</p>', "\r\n")
+    content = content.splitlines()
+    content = map(lambda s: s.strip(), content)
+    content = filter(lambda s: s != '', content)
+    content = '\r\n'.join(content)
+    return content.strip()
+
+
+def removePunctuation(text):
+    punctuation = '!,;:?"\'、,;!”“。?,'
+    text = re.sub(r'[{}]+'.format(punctuation), ' ', text)
+    return text.strip().replace('\r\n', '').replace('\n', '').replace('\r', '')
 
 
 class YdyspiderPipeline(object):
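
Finally, a quick self-contained illustration of the two helpers added above, using a made-up chapter fragment:

from ydyspider.pipelines import formatcontent, removePunctuation

raw = '<p>第一段</p><p></p><p>第二段。</p>'
clean = formatcontent(raw)
# clean == '第一段\r\n第二段。' -- <p> tags stripped, blank paragraph dropped
size = len(removePunctuation(clean))
# size == 6 -- punctuation and line breaks are excluded from the count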