pipelines.py

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
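# A typical ITEM_PIPELINES entry for this project might look like the
# following (module path inferred from this package's imports; the priority
# numbers are illustrative, not from the source):
#
# ITEM_PIPELINES = {
#     'content_spider.pipelines.ContentSpiderPipeline': 300,
#     'content_spider.pipelines.ChapterItemPipeline': 400,
# }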
# useful for handling different item types with a single interface
# from itemadapter import ItemAdapter
import re

from content_spider.mysqlHelper import MysqlHelper
from content_spider.Util import my_log
def formatcontent(content):
    """Strip <p> markup and blank lines from a chapter body, joining with CRLF."""
    content = content.replace(' ', '')         # strip space characters used as indentation in the source markup
    content = content.replace('<p>', '')       # drop opening paragraph tags
    content = content.replace('</p>', '\r\n')  # closing tags become line breaks
    lines = content.splitlines()
    lines = map(lambda s: s.strip(), lines)
    lines = filter(lambda s: s != '', lines)   # discard empty lines
    content = '\r\n'.join(lines)
    return content.strip()
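# Illustrative example of the normalisation above (not from the source):
#   formatcontent('<p> foo </p><p></p><p>bar</p>')  ->  'foo\r\nbar'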
def removePunctuation(text):
    """Collapse ASCII and full-width punctuation to spaces and drop line breaks.

    Used below to approximate a chapter's character count.
    """
    punctuation = '!,;:?"\'、,;!”“。?,'
    # None of these characters need escaping inside a regex character class,
    # though re.escape(punctuation) would be the more defensive choice.
    text = re.sub(r'[{}]+'.format(punctuation), ' ', text)
    return text.strip().replace('\r\n', '').replace('\n', '').replace('\r', '')
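# Illustrative example (not from the source): punctuation collapses to single
# spaces, then surrounding whitespace is trimmed:
#   removePunctuation('你好，世界！')  ->  '你好 世界'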
class ContentSpiderPipeline:
    """Default pass-through pipeline from the Scrapy project template."""

    def process_item(self, item, spider):
        return item
class ChapterItemPipeline:
    """Cleans chapter content, computes its size, and persists it via MysqlHelper."""

    def __init__(self, host, user, password, db, source, stats, settings):
        self.mysqlHelper = MysqlHelper(host=host, user=user, password=password,
                                       db=db, source=source, source_id=0)
        self.__stats = stats
        # `settings` is accepted for parity with from_crawler but is currently unused.

    @classmethod
    def from_crawler(cls, crawler):
        # Wire the pipeline up from the project settings; the stats collector
        # is kept so close_spider can read values recorded by the spider.
        return cls(
            host=crawler.settings.get('MYSQL_HOST'),
            user=crawler.settings.get('MYSQL_USER'),
            password=crawler.settings.get('MYSQL_PWD'),
            db=crawler.settings.get('MYSQL_DB'),
            source=crawler.settings.get('SOURCE'),
            stats=crawler.stats,
            settings=crawler.settings,
        )
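# The settings consumed above might be defined in settings.py roughly like
# this (placeholder values; only the setting names come from the code):
#
# MYSQL_HOST = '127.0.0.1'
# MYSQL_USER = 'spider'
# MYSQL_PWD = 'secret'
# MYSQL_DB = 'content'
# SOURCE = 'some_site'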
    def process_item(self, item, spider):
        # Skip items whose content is missing or empty rather than writing
        # blank chapters to the database.
        if item.get('content'):
            content = formatcontent(item['content'])
            item['content'] = content
            # Size is the character count with punctuation stripped out.
            item['size'] = len(removePunctuation(content))
            self.mysqlHelper.insert_chapter(item)
        # Scrapy expects process_item to return the item (or raise DropItem)
        # so that later pipelines still receive it.
        return item
    def close_spider(self, spider):
        # Post-process each book according to how the spider was launched
        # ('update', 'add', or 'fix'), as recorded in the stats collector.
        spider_type = self.__stats.get_value('spider_type')
        if spider_type == 'update':
            my_log(spider.name, 'update end ....')
            book_list = self.__stats.get_value('bid_list')
            if book_list is not None:
                for book in book_list:
                    spider.mysqlHelper.after_spider(book['bid'], book['start_sequence'])
        elif spider_type == 'add':
            # Note: the 'add' path stores bare book ids in bid_list, while
            # 'update' and 'fix' store dicts.
            my_log(spider.name, 'crawl end ....')
            bid_list = self.__stats.get_value('bid_list')
            if bid_list is not None:
                for bid in bid_list:
                    spider.mysqlHelper.after_spider(bid, 1)
        elif spider_type == 'fix':
            bid_list = self.__stats.get_value('bid_list')
            if bid_list is not None:
                for book in bid_list:
                    # Delete chapters past the expected end before re-running
                    # the usual post-spider bookkeeping.
                    spider.mysqlHelper.after_fix_delete_unnecessary(book['bid'], book['end'])
                    spider.mysqlHelper.after_spider(book['bid'], 1)