pipelines.py

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
# from itemadapter import ItemAdapter
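# Pipelines are enabled in settings.py; a typical registration for this
# project might look like the following (the priority value is illustrative):
#
# ITEM_PIPELINES = {
#     'content_spider.pipelines.ChapterItemPipeline': 300,
# }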
import re

import redis

from content_spider.mysqlHelper import MysqlHelper
from content_spider.Util import my_log

# Redis hash keys used as run-status flags (one field per spider name)
redis_crawl_flag_key = 'book:crawl:stats'
redis_update_flag_key = 'book:update:stats'
redis_fix_flag_key = 'book:fix:stats'
def formatcontent(content):
    # Strip spaces and <p>/</p> markup, then normalize every line break
    # to \r\n and drop empty lines
    content = content.replace(' ', '')
    content = content.replace('<p>', '')
    content = content.replace('</p>', '\r\n')
    content = content.splitlines()
    content = map(lambda s: s.strip(), content)
    content = filter(lambda s: s != '', content)
    content = '\r\n'.join(content)
    return content.strip()
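
# A quick sanity check of the behavior above (illustrative input):
# >>> formatcontent('<p> foo </p><p>bar</p>')
# 'foo\r\nbar'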
def removePunctuation(text):
    # Replace common ASCII and full-width punctuation with spaces, then
    # drop all line breaks so only countable characters remain
    punctuation = '!,;:?"\'、,;!”“。?,'
    text = re.sub(r'[{}]+'.format(punctuation), ' ', text)
    return text.strip().replace('\r\n', '').replace('\n', '').replace('\r', '')
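
# Used below to compute a chapter's character count; for example:
# >>> removePunctuation('Hello, world!')
# 'Hello  world'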
class ContentSpiderPipeline:
    # Default project pipeline; passes items through unchanged
    def process_item(self, item, spider):
        return item
class ChapterItemPipeline:
    def __init__(self, host, user, password, db, source, stats, settings):
        self.mysqlHelper = MysqlHelper(host=host, user=user, password=password, db=db, source=source, source_id=0)
        self.__stats = stats
        # Connection for the run-status flags used in close_spider(); the
        # REDIS_HOST/REDIS_PORT setting names are assumed here — adjust
        # them to whatever this project's settings.py actually defines
        self.__redis_conn = redis.Redis(
            host=settings.get('REDIS_HOST', 'localhost'),
            port=settings.getint('REDIS_PORT', 6379),
        )

    @classmethod
    def from_crawler(cls, crawler):
        stats = crawler.stats
        return cls(
            host=crawler.settings.get('MYSQL_HOST'),
            user=crawler.settings.get('MYSQL_USER'),
            password=crawler.settings.get('MYSQL_PWD'),
            db=crawler.settings.get('MYSQL_DB'),
            source=crawler.settings.get('SOURCE'),
            stats=stats,
            settings=crawler.settings,
        )
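
    # The settings read above are expected in settings.py; the values
    # below are purely illustrative:
    #
    # MYSQL_HOST = '127.0.0.1'
    # MYSQL_USER = 'spider'
    # MYSQL_PWD = 'change-me'
    # MYSQL_DB = 'book'
    # SOURCE = 'example_source'
    # REDIS_HOST = '127.0.0.1'   # assumed name, see __init__ above
    # REDIS_PORT = 6379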
    def process_item(self, item, spider):
        if item.get('content'):
            content = formatcontent(item['content'])
            item['content'] = content
            # size counts characters excluding punctuation and line breaks
            item['size'] = len(removePunctuation(content))
            self.mysqlHelper.insert_chapter(item)
        return item
    def close_spider(self, spider):
        spider_type = self.__stats.get_value('spider_type')
        if spider_type == 'update':
            # reset the update flag so a new run can be scheduled
            self.__redis_conn.hset(redis_update_flag_key, spider.name, 0)
            my_log(spider.name, 'update end ....')
            book_list = self.__stats.get_value('bid_list')
            if book_list is not None:
                for book in book_list:
                    spider.mysqlHelper.after_spider(book['bid'], book['start_sequence'])
        if spider_type == 'add':
            self.__redis_conn.hset(redis_crawl_flag_key, spider.name, 0)
            my_log(spider.name, 'crawl end ....')
            bid_list = self.__stats.get_value('bid_list')
            if bid_list is not None:
                for bid in bid_list:
                    spider.mysqlHelper.after_spider(bid, 1)
        if spider_type == 'fix':
            # redis_fix_flag_key is defined above but was never reset;
            # clearing it here mirrors the other branches (assumed intent)
            self.__redis_conn.hset(redis_fix_flag_key, spider.name, 0)
            bid_list = self.__stats.get_value('bid_list')
            if bid_list is not None:
                for book in bid_list:
                    # drop chapters beyond the fixed range, then refresh book metadata
                    spider.mysqlHelper.after_fix_delete_unnecessary(book['bid'], book['end'])
                    spider.mysqlHelper.after_spider(book['bid'], 1)
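
# Downstream consumers can poll these flags to tell whether a run has
# finished, e.g. (illustrative, assuming the same Redis instance):
# >>> redis.Redis().hget(redis_crawl_flag_key, 'my_spider')
# b'0'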