zz 3 years ago
parent
commit
daa900b4fc

+ 4 - 0
ydyspider/.gitignore

@@ -0,0 +1,4 @@
+.idea/
+*.pyc
+*/__pycache__
+*/*.pyc
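
Note: in gitignore syntax a pattern without a slash (like *.pyc) already matches at any depth, so */__pycache__ and */*.pyc only cover one directory level that broader forms would handle anyway. An equivalent depth-independent set, if that was the intent:

.idea/
*.pyc
__pycache__/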

+ 8 - 0
ydyspider/mysql.py

@@ -81,3 +81,11 @@ class msyqlHelper(object):
 			result = cursor.fetchone()
 		self.conn.commit()
 		return result
+
+	def getLianshang(self):
+		sql = "SELECT zw_id FROM books a JOIN book_configs b ON a.id = b.bid WHERE b.cp_source = 'lianshang' ORDER BY zw_id DESC LIMIT 1"
+		with self.conn.cursor() as cursor:
+			cursor.execute(sql)
+			result = cursor.fetchone()
+		self.conn.commit()
+		return result
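
If more cp_source values arrive later, a parameterized variant of this query avoids one method per source and keeps the value out of the SQL string. A sketch only, assuming the same pymysql-style connection used elsewhere in msyqlHelper; getLastSourceBook is a hypothetical name:

	def getLastSourceBook(self, cp_source):
		# Same query as getLianshang, but the source is a bind parameter.
		sql = ("SELECT zw_id FROM books a JOIN book_configs b ON a.id = b.bid "
			"WHERE b.cp_source = %s ORDER BY zw_id DESC LIMIT 1")
		with self.conn.cursor() as cursor:
			cursor.execute(sql, (cp_source,))
			result = cursor.fetchone()
		self.conn.commit()
		return result

getLastSourceBook('lianshang') would then return the same row as getLianshang().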

+ 20 - 0
ydyspider/pipelines.py

@@ -4,8 +4,28 @@
 #
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import os
 
 
 class YdyspiderPipeline(object):
+
+    def __init__(self, stats):
+        self.__stats = stats
+
     def process_item(self, item, spider):
         return item
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        stats = crawler.stats
+        return cls(
+            stats=stats
+        )
+
+    def close_spider(self, spider):
+        bid_list = self.__stats.get_value('bid_list')
+        if bid_list is not None:
+            for bid in bid_list:
+                command = '/usr/local/php/bin/php /home/www/zhuishuyun_wap/artisan book:afs %s ' % bid
+                os.system(command)
+
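
os.system works here because bid comes straight from the spider's own insert, but subprocess keeps the interpreter, script path, and argument as separate tokens and skips the shell entirely. A sketch of the same hook (inside YdyspiderPipeline), not code from this commit:

    def close_spider(self, spider):
        import subprocess  # or at module level, next to the existing os import
        # Run the artisan command once per collected bid, without a shell.
        for bid in self.__stats.get_value('bid_list') or []:
            subprocess.run(
                ['/usr/local/php/bin/php', '/home/www/zhuishuyun_wap/artisan',
                 'book:afs', str(bid)],
                check=False,  # mirror os.system: a non-zero exit is not raised
            )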

+ 3 - 3
ydyspider/settings.py

@@ -64,9 +64,9 @@ DEFAULT_REQUEST_HEADERS = {
 
 # Configure item pipelines
 # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'ydyspider.pipelines.YdyspiderPipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'ydyspider.pipelines.YdyspiderPipeline': 300,
+}
 
 # Enable and configure the AutoThrottle extension (disabled by default)
 # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
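
The integer is the pipeline's order: Scrapy runs enabled pipelines from the lowest value to the highest within 0-1000, so 300 leaves room on both sides. A purely hypothetical illustration of a second entry:

ITEM_PIPELINES = {
    'ydyspider.pipelines.SomeValidationPipeline': 100,  # hypothetical; would run first
    'ydyspider.pipelines.YdyspiderPipeline': 300,
}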

+ 3 - 0
ydyspider/spiders/ydy.py

@@ -32,6 +32,9 @@ class YunduyunSpider(scrapy.Spider):
 		data['status'] = res['data']['book_state']
 		data['sequence'] = response.meta['i']
 		bid = mysql.insertbook(data)
+		bid_list = self.crawler.stats.get_value('bid_list', [])
+		bid_list.append(bid)
+		self.crawler.stats.set_value('bid_list', bid_list)
 		mysql.close()
 		self.logger.info(data)
 		yield scrapy.Request("http://www.leyuee.com/services/zwfx.aspx?method=chapterlist&bid=%s&token=sefaf23h7face" % res['data']['book_id'],meta={"bid":bid,"book_id":res['data']['book_id']},callback=self.parse3)

+ 4 - 2
ydyspider/spiders/zwcontentSpider.py

@@ -6,10 +6,11 @@ import json
 
 class zwcontentSpider(scrapy.Spider):
     name = 'zwcontent'
-    allowed_domains = ['zwapi.ycsd.cn']
-    base_url = 'http://zwapi.ycsd.cn/api/book'
+    allowed_domains = ['cp.yqsd.cn']
+    base_url = 'http://cp.yqsd.cn/api/book'
 
     def start_requests(self):
+        self.crawler.stats.set_value('bid_list', [])
         param = self.bid
         bid_list = param.split(',')
         for bid in bid_list:
@@ -32,6 +33,7 @@ class zwcontentSpider(scrapy.Spider):
         data['status'] = res['data']['status']
         data['sequence'] = response.meta['i']
         bid = mysql.insertZwBook(data)
+        self.crawler.stats.get_value('bid_list').append(bid)
         mysql.close()
         url = self.base_url + '/chapterlist/{}'.format(res['data']['bid'])
         yield scrapy.Request(url, meta={"bid": bid, "book_id": res['data']['bid']}, callback=self.parse3)
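
Since param = self.bid reads a spider argument, a crawl is presumably launched like this (the ids are placeholders):

scrapy crawl zwcontent -a bid=101,102,103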

+ 33 - 0
ydyspider/spiders/zwcontentlianshangSpider.py

@@ -0,0 +1,33 @@
+# -*- coding: utf-8 -*-
+import time
+
+import scrapy
+from ydyspider.mysql import msyqlHelper
+import json
+from . import zwcontentSpider
+
+
+class zwcontentlianshangSpider(zwcontentSpider.zwcontentSpider):
+    name = 'zwcontentlianshang'
+    allowed_domains = ['cp.yqsd.cn']
+    base_url = 'http://cp.yqsd.cn/api/book'
+    custom_settings = {
+        'DOWNLOAD_DELAY': 0.01,
+        'LOG_FILE': 'content_spider/log/' + name + time.strftime("%Y-%m-%d", time.localtime()) + '.log'
+    }
+
+    def start_requests(self):
+        self.crawler.stats.set_value('bid_list', [])
+        mysql = msyqlHelper()
+        last_book = mysql.getLianshang()
+        bid = last_book['zw_id']
+        start_url = self.base_url + '/booklist/lianshang/{}'.format(bid)
+        yield scrapy.Request(start_url, callback=self.parse_book_list)
+
+    def parse_book_list(self, response):
+        result = json.loads(response.text)
+        if result.get('data') is not None:
+            for item in result['data'][:1000]:
+                bid = item['id']
+                url = self.base_url + '/bookInfo/{}'.format(bid)
+                yield scrapy.Request(url, callback=self.parse2, meta={"zw_id": bid, "i": 0})
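
One caveat: getLianshang() returns None while no 'lianshang' book exists yet, so last_book['zw_id'] raises a TypeError on a first run, and the mysql helper is never closed in start_requests. A guarded sketch, assuming 0 is a valid starting cursor for the booklist endpoint (an assumption, not verified against the API):

    def start_requests(self):
        self.crawler.stats.set_value('bid_list', [])
        mysql = msyqlHelper()
        last_book = mysql.getLianshang()
        mysql.close()
        # Fall back to 0 on an empty table (assumed acceptable upstream).
        bid = last_book['zw_id'] if last_book else 0
        start_url = self.base_url + '/booklist/lianshang/{}'.format(bid)
        yield scrapy.Request(start_url, callback=self.parse_book_list)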