root 6 years ago
commit
7354637387

+ 11 - 0
scrapy.cfg

@@ -0,0 +1,11 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.org/en/latest/deploy.html
+
+[settings]
+default = ydyspider.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = ydyspider
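
With scrapyd running and scrapyd-client installed, this [deploy] section is what the deploy tool reads; a sketch (uncomment the url line above first):

    scrapyd-deploy -p ydyspider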

+ 0 - 0
ydyspider/__init__.py


BIN
ydyspider/__init__.pyc


+ 14 - 0
ydyspider/items.py

@@ -0,0 +1,14 @@
+# -*- coding: utf-8 -*-
+
+# Define here the models for your scraped items
+#
+# See documentation in:
+# http://doc.scrapy.org/en/latest/topics/items.html
+
+import scrapy
+
+
+class YdyspiderItem(scrapy.Item):
+    # define the fields for your item here like:
+    # name = scrapy.Field()
+    pass
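
The generated item class is left empty; if the plain dicts the spiders build were promoted to typed items, a minimal sketch (BookItem is hypothetical; the field names mirror parse2() in the spiders below):

    class BookItem(scrapy.Item):  # hypothetical; not used by the current spiders
        ly_bid = scrapy.Field()
        name = scrapy.Field()
        author = scrapy.Field()
        intro = scrapy.Field()
        cover = scrapy.Field()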

+ 56 - 0
ydyspider/middlewares.py

@@ -0,0 +1,56 @@
+# -*- coding: utf-8 -*-
+
+# Define here the models for your spider middleware
+#
+# See documentation in:
+# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
+
+from scrapy import signals
+
+
+class YdyspiderSpiderMiddleware(object):
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the spider middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_spider_input(self, response, spider):
+        # Called for each response that goes through the spider
+        # middleware and into the spider.
+
+        # Should return None or raise an exception.
+        return None
+
+    def process_spider_output(self, response, result, spider):
+        # Called with the results returned from the Spider, after
+        # it has processed the response.
+
+        # Must return an iterable of Request, dict or Item objects.
+        for i in result:
+            yield i
+
+    def process_spider_exception(self, response, exception, spider):
+        # Called when a spider or process_spider_input() method
+        # (from other spider middleware) raises an exception.
+
+        # Should return either None or an iterable of Response, dict
+        # or Item objects.
+        pass
+
+    def process_start_requests(self, start_requests, spider):
+        # Called with the start requests of the spider, and works
+        # similarly to the process_spider_output() method, except
+        # that it doesn’t have a response associated.
+
+        # Must return only requests (not items).
+        for r in start_requests:
+            yield r
+
+    def spider_opened(self, spider):
+        spider.logger.info('Spider opened: %s' % spider.name)
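
This middleware is inert until registered; enabling it uses exactly the block stubbed out in settings.py below:

    SPIDER_MIDDLEWARES = {
        'ydyspider.middlewares.YdyspiderSpiderMiddleware': 543,
    }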

+ 67 - 0
ydyspider/mysql.py

@@ -0,0 +1,67 @@
+# -*- coding: utf-8 -*-
+import time
+import pymysql.cursors
+
+
+class msyqlHelper(object):
+	"""Small pymysql helper around the books/chapters tables."""
+
+	def __init__(self):
+		# Aliyun RDS connection; the commented-out connect() below targets an alternative instance.
+		self.conn = pymysql.connect(host='rm-bp1sc28q8w1slr0l4.mysql.rds.aliyuncs.com',user='zhuishuyun',password='Zhuishu!zwkj2066',db='yueduyun',charset='utf8mb4',cursorclass=pymysql.cursors.DictCursor)
+		#self.conn = pymysql.connect(host='rm-bp1z1dto3n2rdb02f.mysql.rds.aliyuncs.com',user='yueduyun',password='yueduyun2017#Ydy',db='yueduyun',charset='utf8mb4',cursorclass=pymysql.cursors.DictCursor)
+		self.encoding = 'utf-8'
+
+	def insertbook(self,data):
+		# Inserts a book stub (chapter/size counters zeroed) and returns the new row id.
+		now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+		sql = "insert into books (ly_bid,name,author,intro,cover,category_name,category_id,status,sequence,chapter_count,first_cid,last_cid,size,last_chapter,`created_at`,`updated_at`) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
+		id = 0
+		with self.conn.cursor() as cursor:
+			res = cursor.execute(sql,(data['ly_bid'],data['name'],data['author'],data['intro'],data['cover'],data['category_name'],data['category_id'],data['status'],data['sequence'],'0','0','0','0','0',now,now))
+			id = int(cursor.lastrowid)
+		self.conn.commit()
+		return id
+
+	def inseraAll(self,data):
+		# Inserts one chapter row and returns its id.
+		now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+		sql = "INSERT INTO `chapters` (`bid`, `name`,`sequence`,`size`,`is_vip`,`prev_cid`,`next_cid`,`recent_update_at`,`created_at`,`updated_at`,`content`,`ly_chapter_id`) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
+		with self.conn.cursor() as cursor:
+			cursor.execute(sql,(data['bid'],data['name'],data['sequence'],data['size'],data['is_vip'],data['prev_cid'],data['next_cid'],now,now,now,data['content'],data['ly_chapter_id']))
+			id = int(cursor.lastrowid)
+		self.conn.commit()
+		return id
+
+	def selectbylyid(self,id):
+		# Looks up a book by its upstream (ly) id; returns None when absent.
+		result = None
+		with self.conn.cursor() as cursor:
+			sql = "select ly_bid from books where ly_bid=%s"
+			cursor.execute(sql,(id,))
+			result = cursor.fetchone()
+		self.conn.commit()
+		return result
+
+	def getUncaompleteBook(self):
+		# Returns every book whose crawl has not finished (status=0).
+		sql = "select id,ly_bid from books where status=0"
+		result = None
+		with self.conn.cursor() as cursor:
+			cursor.execute(sql)
+			result = cursor.fetchall()
+		self.conn.commit()
+		return result
+
+	def getChapterByBidAndName(self,bid,name):
+		# Returns the id of the chapter matching (bid, name), or None.
+		sql = "select id from chapters where bid=%s and name=%s"
+		result = None
+		with self.conn.cursor() as cursor:
+			cursor.execute(sql,(bid,name))
+			result = cursor.fetchone()
+		self.conn.commit()
+		return result
+
+	def close(self):
+		self.conn.close()
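
A minimal usage sketch (the book id 10916 is just an example value; in practice the credentials above would be better read from the environment than hard-coded):

    from ydyspider.mysql import msyqlHelper

    helper = msyqlHelper()
    print(helper.selectbylyid(10916))  # None until that ly_bid has been imported
    helper.close()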

BIN
ydyspider/mysql.pyc


+ 11 - 0
ydyspider/pipelines.py

@@ -0,0 +1,11 @@
+# -*- coding: utf-8 -*-
+
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+
+
+class YdyspiderPipeline(object):
+    def process_item(self, item, spider):
+        return item
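
The generated pipeline is a pass-through; if persistence moved here out of the spiders, one hedged sketch (MysqlPipeline is hypothetical and assumes book items shaped like the dicts built in parse2() below):

    from ydyspider.mysql import msyqlHelper

    class MysqlPipeline(object):  # hypothetical alternative to the inline mysql calls in the spiders
        def open_spider(self, spider):
            self.db = msyqlHelper()

        def close_spider(self, spider):
            self.db.close()

        def process_item(self, item, spider):
            self.db.insertbook(item)  # expects the book fields insertbook() reads
            return item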

+ 90 - 0
ydyspider/settings.py

@@ -0,0 +1,90 @@
+# -*- coding: utf-8 -*-
+
+# Scrapy settings for ydyspider project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used. You can find more settings consulting the documentation:
+#
+#     http://doc.scrapy.org/en/latest/topics/settings.html
+#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
+#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
+
+BOT_NAME = 'ydyspider'
+
+SPIDER_MODULES = ['ydyspider.spiders']
+NEWSPIDER_MODULE = 'ydyspider.spiders'
+
+
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+#USER_AGENT = 'ydyspider (+http://www.yourdomain.com)'
+
+# Obey robots.txt rules
+ROBOTSTXT_OBEY = False
+
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+#CONCURRENT_REQUESTS = 32
+
+# Configure a delay for requests for the same website (default: 0)
+# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+#DOWNLOAD_DELAY = 3
+# The download delay setting will honor only one of:
+#CONCURRENT_REQUESTS_PER_DOMAIN = 16
+#CONCURRENT_REQUESTS_PER_IP = 16
+
+# Disable cookies (enabled by default)
+#COOKIES_ENABLED = False
+
+# Disable Telnet Console (enabled by default)
+#TELNETCONSOLE_ENABLED = False
+
+# Override the default request headers:
+DEFAULT_REQUEST_HEADERS = {
+   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+   'Accept-Language': 'en',
+}
+
+# Enable or disable spider middlewares
+# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
+#SPIDER_MIDDLEWARES = {
+#    'ydyspider.middlewares.YdyspiderSpiderMiddleware': 543,
+#}
+
+# Enable or disable downloader middlewares
+# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
+#DOWNLOADER_MIDDLEWARES = {
+#    'ydyspider.middlewares.MyCustomDownloaderMiddleware': 543,
+#}
+
+# Enable or disable extensions
+# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
+#EXTENSIONS = {
+#    'scrapy.extensions.telnet.TelnetConsole': None,
+#}
+
+# Configure item pipelines
+# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
+#ITEM_PIPELINES = {
+#    'ydyspider.pipelines.YdyspiderPipeline': 300,
+#}
+
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
+#AUTOTHROTTLE_ENABLED = True
+# The initial download delay
+#AUTOTHROTTLE_START_DELAY = 5
+# The maximum download delay to be set in case of high latencies
+#AUTOTHROTTLE_MAX_DELAY = 60
+# The average number of requests Scrapy should be sending in parallel to
+# each remote server
+#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# Enable showing throttling stats for every response received:
+#AUTOTHROTTLE_DEBUG = False
+
+# Enable and configure HTTP caching (disabled by default)
+# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+#HTTPCACHE_ENABLED = True
+#HTTPCACHE_EXPIRATION_SECS = 0
+#HTTPCACHE_DIR = 'httpcache'
+#HTTPCACHE_IGNORE_HTTP_CODES = []
+#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
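
ROBOTSTXT_OBEY is disabled and nothing throttles the crawl; a conservative combination of the commented options above would be:

    DOWNLOAD_DELAY = 1
    AUTOTHROTTLE_ENABLED = True
    AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0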

BIN
ydyspider/settings.pyc


+ 4 - 0
ydyspider/spiders/__init__.py

@@ -0,0 +1,4 @@
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.

BIN
ydyspider/spiders/__init__.pyc


+ 76 - 0
ydyspider/spiders/wyy.py

@@ -0,0 +1,76 @@
+# -*- coding: utf-8 -*-
+import scrapy
+import json
+
+class WyySpider(scrapy.Spider):
+    name = 'wyy'
+    allowed_domains = ['m.emeixs.com']
+    # start_urls is repurposed here as spider config: [book id, {"bookid": ..., "allpage": number of list pages}].
+    # start_requests() below builds the real requests from it.
+    start_urls = ['10916', {"bookid": "10916", "allpage": 1}]
+
+    cookies = {
+            "CRC":"41ac5d8f45826eacd7d609e65ef11ce9",
+            "OPENID":"o2rtIwVmtewF74MDgUS2bjtw5r8w",
+            "VALIDON":"1546056916",
+            "admin_id":"91",
+            "pen_name":'Don',
+            "portrait":"http%3A%2F%2Fthirdwx.qlogo.cn%2Fmmopen%2FuchmtWQh7iarv9fB1SPnCCaTibra2HjCEIXsrFEp8bnoeNhialwORg1EHhyOoNicYIzzhhib4YrBPYKZialOTAtWBeyw%2F132",
+            'prid':'0',
+            'shell':'766bc63e269cc8ae07b22ece476e1134',
+            'subscribe':'1',
+            'uid':'2',
+            'user_id':'252323197',
+            'user_name':'we20181227',
+            'PHPSESSID':'n6manrv5a8gq2ai1boremc2tr0',
+            'getuserinfo':'1'
+        }
+
+    headers = {
+                "User-Agent":"Mozilla/5.0 (Linux; Android 8.0; MI 6 Build/OPR1.170623.027; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/57.0.2987.132 MQQBrowser/6.2 TBS/044405 Mobile Safari/537.36 MMWEBID/223 MicroMessenger/6.7.3.1360(0x2607033D) NetType/WIFI Language/zh_CN Process/tools",
+                "Origin":"https://m.emeixs.com",
+                "Content-Type":"application/x-www-form-urlencoded; charset=UTF-8",
+                "X-Requested-With":"XMLHttpRequest",
+                "Referer":"https://m.emeixs.com/chapter/12514/0/fromaid/91.html"
+        }
+            
+    def start_requests(self):
+        url = 'https://m.emeixs.com/Moreinfo/nextchapter/fromaid/91.html'
+        meta = {"bid":self.start_urls[0]}
+        for i in range(self.start_urls[1]['allpage']):
+            page = i+1
+            # The body is form-encoded by hand to match the Content-Type header above.
+            body = 'bookid={bookid}&page={page}&paixu={paixu}'.format(bookid=self.start_urls[1]['bookid'], page=page, paixu='asc')
+            yield scrapy.Request(url,headers=self.headers,callback=self.parselist,meta=meta,cookies=self.cookies,method='POST',body=body)
+            
+    def parselist(self, response):
+        # Each <a> on the chapter-list page links to one chapter; the chapter id
+        # is the fourth path segment of its href.
+        result = response.xpath('//a')
+        bookid = self.start_urls[0]
+        i = 1
+        for a in result:
+            href = a.xpath('@href').extract_first()
+            if not href:
+                continue
+            a_list = href.split('/')
+            cid = a_list[3]
+            meta = {'sequence': cid}
+            i = i + 1
+            if i > 3:  # debugging limit: stop after the first two chapter links
+                break
+            cid = i + 1  # debugging override: fetch by position rather than the href's id
+            url = 'https://m.emeixs.com/ChapterContent/content/fromaid/91.html?bookid={bookid}&num={sequence}'.format(bookid=bookid, sequence=cid)
+            self.logger.info('url is :' + url)
+            yield scrapy.Request(url, headers=self.headers, meta=meta, callback=self.parsecontent, cookies=self.cookies)
+
+    def parsecontent(self, response):
+        # Chapter content arrives as JSON; for now it is only logged.
+        self.logger.info(response.text)
+
+    def parse2(self, response):
+        # Callback for the (disabled) collection endpoint; also just logs.
+        self.logger.info(response.text)
+
+    def json_decode(self, jsonstr):
+        return json.loads(jsonstr)
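
The book id and page count are hard-coded in start_urls above, so a plain run exercises the spider; parsecontent() only logs the raw chapter JSON for now:

    scrapy crawl wyy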

BIN
ydyspider/spiders/wyy.pyc


+ 70 - 0
ydyspider/spiders/ydy.py

@@ -0,0 +1,70 @@
+# -*- coding: utf-8 -*-
+import scrapy
+from ydyspider.mysql import msyqlHelper
+import json
+import time
+
+class YdySpider(scrapy.Spider):
+	name = 'ydy'
+	allowed_domains = ['leyuee.com']
+	start_urls = ['http://www.leyuee.com/services/zwfx.aspx?method=booklist&token=sefaf23h7face']
+
+	def start_requests(self):
+		# Book ids come in as a spider argument: scrapy crawl ydy -a bid=1,2,3
+		bid_list = self.bid.split(',')
+		for ids in bid_list:
+			yield scrapy.Request("http://www.leyuee.com/services/zwfx.aspx?method=bookinfo&token=sefaf23h7face&bid=%s" % ids,callback=self.parse2,meta={"ly_bid":ids,"i":0})
+
+	def parse2(self,response):
+		# Book-info endpoint: store the book row, then queue its chapter list.
+		mysql = msyqlHelper()
+		res = response.text
+		res = self.json_encode(res)
+		data = dict()
+		data['ly_bid'] = res['data']['book_id']
+		data['name'] = res['data']['book_name']
+		data['author'] = res['data']['book_author']
+		data['intro'] = res['data']['introduction']
+		data['cover'] = res['data']['cover_url']
+		data['category_name'] = res['data']['book_tags']
+		data['category_id'] = res['data']['book_category_id']
+		data['status'] = res['data']['book_state']
+		data['sequence'] = response.meta['i']
+		bid = mysql.insertbook(data)
+		mysql.close()
+		self.logger.info(data)
+		yield scrapy.Request("http://www.leyuee.com/services/zwfx.aspx?method=chapterlist&bid=%s&token=sefaf23h7face" % res['data']['book_id'],meta={"bid":bid,"book_id":res['data']['book_id']},callback=self.parse3)
+
+	def parse3(self,response):
+		# Chapter-list endpoint: walk volumes -> chapters, queue each chapter body.
+		res = self.json_encode(response.text)
+		if res['code'] == 200:
+			for volume in res['data']:
+				for chapter in volume['chapters']:
+					chapter['bid'] = response.meta['bid']
+					yield scrapy.Request('http://www.leyuee.com/services/zwfx.aspx?method=chapter&bid=%s&cid=%s&token=sefaf23h7face' % (response.meta['book_id'],chapter['chapter_id']),meta=chapter,callback=self.parse4)
+
+	def parse4(self,response):
+		# Chapter-body endpoint: persist one chapter row.
+		res = self.json_encode(response.text)
+		if res['code'] == 200:
+			mysql = msyqlHelper()
+			meta = response.meta
+			data = dict()
+			data['bid'] = meta['bid']
+			data['name'] = meta['chapter_name']
+			data['sequence'] = meta['chapter_order_number']+1
+			data['size'] = len(res['data']['chapter_content'])
+			data['is_vip'] = meta['chapter_need_pay']
+			data['prev_cid'] = 0
+			data['next_cid'] = 0
+			data['recent_update_at'] = time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(meta['chapter_last_update_time']))
+			data['content'] = res['data']['chapter_content']
+			data['ly_chapter_id'] = res['data']['chapter_id']
+			mysql.inseraAll(data)
+			mysql.close()
+
+	def json_encode(self,jsonstr):
+		# Despite the name, this decodes a JSON string.
+		return json.loads(jsonstr)
+
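
start_requests() expects the book ids as a comma-separated -a argument (the ids below are placeholders):

    scrapy crawl ydy -a bid=1001,1002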

BIN
ydyspider/spiders/ydy.pyc


+ 77 - 0
ydyspider/spiders/yunduyun.py

@@ -0,0 +1,77 @@
+# -*- coding: utf-8 -*-
+import scrapy
+from ydyspider.mysql import msyqlHelper
+import json
+import time
+
+class YunduyunSpider(scrapy.Spider):
+	name = 'yunduyun'
+	allowed_domains = ['leyuee.com']
+	start_urls = ['http://www.leyuee.com/services/zwfx.aspx?method=booklist&token=sefaf23h7face']
+	def parse(self, response):
+		# Book-list endpoint: queue a book-info request for every book not yet imported.
+		res = self.json_encode(response.text)
+		self.logger.info(res)
+		i = 0
+		mysql = msyqlHelper()
+		for item in res['data']:
+			# Resume hack: skip book ids <= 1501 (presumably imported in an earlier run).
+			if item['book_id'] <= 1501:
+				continue
+			exist = mysql.selectbylyid(item['book_id'])
+			if exist is not None:
+				self.logger.info(exist)
+				continue
+			yield scrapy.Request("http://www.leyuee.com/services/zwfx.aspx?method=bookinfo&token=sefaf23h7face&bid=%s" % item['book_id'],callback=self.parse2,meta={"ly_bid":item['book_id'],"i":i})
+
+	def parse2(self,response):
+		# Book-info endpoint: store the book row, then queue its chapter list.
+		mysql = msyqlHelper()
+		res = self.json_encode(response.text)
+		data = dict()
+		data['ly_bid'] = res['data']['book_id']
+		data['name'] = res['data']['book_name']
+		data['author'] = res['data']['book_author']
+		data['intro'] = res['data']['introduction']
+		data['cover'] = res['data']['cover_url']
+		data['category_name'] = res['data']['book_tags']
+		data['category_id'] = res['data']['book_category_id']
+		data['status'] = res['data']['book_state']
+		data['sequence'] = response.meta['i']
+		bid = mysql.insertbook(data)
+		mysql.close()
+		self.logger.info(data)
+		yield scrapy.Request("http://www.leyuee.com/services/zwfx.aspx?method=chapterlist&bid=%s&token=sefaf23h7face" % res['data']['book_id'],meta={"bid":bid,"book_id":res['data']['book_id']},callback=self.parse3)
+
+	def parse3(self,response):
+		# Chapter-list endpoint: walk volumes -> chapters, queue each chapter body.
+		res = self.json_encode(response.text)
+		if res['code'] == 200:
+			for volume in res['data']:
+				for chapter in volume['chapters']:
+					chapter['bid'] = response.meta['bid']
+					yield scrapy.Request('http://www.leyuee.com/services/zwfx.aspx?method=chapter&bid=%s&cid=%s&token=sefaf23h7face' % (response.meta['book_id'],chapter['chapter_id']),meta=chapter,callback=self.parse4)
+
+	def parse4(self,response):
+		# Chapter-body endpoint: persist one chapter row.
+		res = self.json_encode(response.text)
+		if res['code'] == 200:
+			mysql = msyqlHelper()
+			meta = response.meta
+			data = dict()
+			data['bid'] = meta['bid']
+			data['name'] = meta['chapter_name']
+			data['sequence'] = meta['chapter_order_number']+1
+			data['size'] = len(res['data']['chapter_content'])
+			data['is_vip'] = meta['chapter_need_pay']
+			data['prev_cid'] = 0
+			data['next_cid'] = 0
+			data['recent_update_at'] = time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(meta['chapter_last_update_time']))
+			data['content'] = res['data']['chapter_content']
+			data['ly_chapter_id'] = res['data']['chapter_id']
+			mysql.inseraAll(data)
+			mysql.close()
+
+	def json_encode(self,jsonstr):
+		# Despite the name, this decodes a JSON string.
+		return json.loads(jsonstr)
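
This spider needs no arguments: it starts from the booklist endpoint and skips anything selectbylyid() already finds in the books table:

    scrapy crawl yunduyun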

BIN
ydyspider/spiders/yunduyun.pyc