# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class StudyItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    content = scrapy.Field()
pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import json
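The listing stops at the imports, so the pipeline class that would actually use json is not shown. A minimal sketch of what it might look like, assuming the StudyItem fields defined above, an assumed class name StudyPipeline, and a hypothetical output file items.json:

class StudyPipeline:
    # open one output file for the whole crawl
    def open_spider(self, spider):
        self.file = open('items.json', 'w', encoding='utf-8')

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        # ItemAdapter gives a dict-like view over any item type
        line = json.dumps(ItemAdapter(item).asdict(), ensure_ascii=False) + '\n'
        self.file.write(line)
        return item

It still has to be enabled under ITEM_PIPELINES; see the settings sketch further down.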
from scrapy import signals

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
class NewdemoSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    # ... omitted ...

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
import scrapy
from scrapy import signals
import random
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
class StudySpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s
    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None
    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i
    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request or item objects.
        pass
    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r
class StudyDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    my_requests = 0
    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s
defprocess_request(self, request, spider): print("StudyDownloaderMiddleware:",request.headers) # Called for each request that goes through the downloader # middleware.
# Must either: # - return None: continue processing this request # - or return a Response object # - or return a Request object # - or raise IgnoreRequest: process_exception() methods of # installed downloader middleware will be called returnNone
    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response
    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass
class HeadersDownloaderMiddleware:
    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        return s
    def process_request(self, request, spider):
        uas = [
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
            'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
        ]
        print("HeadersDownloaderMiddleware:", request.headers)
        # pick a random User-Agent for each outgoing request
        request.headers['User-Agent'] = random.choice(uas)
        return None
class HeadersDownloaderMiddleware:
    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        return s
    def process_request(self, request, spider):
        # - or return a Response object: handled by process_response
        # - or return a Request object: re-enters the process_request chain
        # - or raise IgnoreRequest: the process_exception() methods of the
        #   installed downloader middlewares will be called
        print("Processing request: %s" % request.url)
        uas = [
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
            'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
        ]
        print("HeadersDownloaderMiddleware:", request.headers)
        request.headers['User-Agent'] = random.choice(uas)
        # return None
        # return scrapy.http.Response(url=request.url, body='modified body'.encode('utf-8'))
        # return scrapy.http.Request(url=request.url, dont_filter=True)
        raise scrapy.exceptions.IgnoreRequest("request dropped here")
class HeadersDownloaderMiddleware:
    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        return s
    def process_request(self, request, spider):
        # - or return a Response object: handled by process_response
        # - or return a Request object: re-enters the process_request chain
        # - or raise IgnoreRequest: the process_exception() methods of the
        #   installed downloader middlewares will be called
        print("Processing request: %s" % request.url)
        uas = [
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
            'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
        ]
        print("HeadersDownloaderMiddleware:", request.headers)
        request.headers['User-Agent'] = random.choice(uas)
        return None
        # return scrapy.http.Response(url=request.url, body='modified body'.encode('utf-8'))
        # return scrapy.http.Request(url=request.url, dont_filter=True)
        # raise scrapy.exceptions.IgnoreRequest("request dropped here")
    def process_response(self, request, response, spider):
        print("HeadersDownloaderMiddleware process_response:", response.body.decode('utf-8'))
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object: passed straight on to the spider
        # - return a Request object: handed back to process_request
        # - raise IgnoreRequest: no result is produced
        # return response
        raise scrapy.exceptions.IgnoreRequest("error raised in HeadersDownloaderMiddleware")
    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        spider.logger.info("spider starting requests")
        for r in start_requests:
            # TMP_FILE is a custom setting pointing at a file of already
            # crawled URLs (assumes the project settings module is imported
            # as `settings` and defines TMP_FILE)
            with open(settings.TMP_FILE, 'r') as f:
                cc = f.read()
            if r.url in cc:
                print("spider skipping", r.url)
                continue
            yield r
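Nothing in the excerpt actually writes to TMP_FILE, so the skip list would stay empty as shown. A minimal sketch of how the file might be populated, assuming one URL per line and the spider's settings object as the way to reach TMP_FILE (both assumptions, not from the original post):

class RecordUrlDownloaderMiddleware:
    # hypothetical helper middleware: append every successfully downloaded
    # URL to TMP_FILE so process_start_requests can skip it on the next run
    def process_response(self, request, response, spider):
        with open(spider.settings.get('TMP_FILE'), 'a', encoding='utf-8') as f:
            f.write(response.url + '\n')
        return response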
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
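The settings excerpt only shows the throttling defaults. For the middlewares and pipeline above to take effect they also need to be enabled in settings.py; a minimal sketch, assuming the project package is named study and using illustrative priority values and an illustrative TMP_FILE path (all assumptions):

SPIDER_MIDDLEWARES = {
    'study.middlewares.StudySpiderMiddleware': 543,
}

DOWNLOADER_MIDDLEWARES = {
    'study.middlewares.StudyDownloaderMiddleware': 543,
    'study.middlewares.HeadersDownloaderMiddleware': 544,
}

ITEM_PIPELINES = {
    'study.pipelines.StudyPipeline': 300,
}

# custom setting read by process_start_requests above
TMP_FILE = 'crawled_urls.txt'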
import scrapy
from bs4 import BeautifulSoup
from xjb import items
class TestSpider(scrapy.Spider):
    name = 'test'
    allowed_domains = ['lz13.cn']
    # pagination: the page number runs from 1 to 254 (range end is exclusive)
    start_urls = [f'https://www.lz13.cn/lizhi/mingrenmingyan-{i}.html' for i in range(1, 255)]
    def parse(self, response):
        soup = BeautifulSoup(response.text, 'lxml')
        # collect the detail-page URLs
        small_url = soup.select('.PostHead span h3 a')
        for url in small_url:
            # yield a Request and hand its response to parse_content;
            # without an explicit callback it would go to parse by default
            # (yielding scrapy.Request makes Scrapy issue the request)
            yield scrapy.Request(url=url['href'], callback=self.parse_content)
    def parse_content(self, response):
        item = items.XjbItem()
        # extract the data from the detail page
        soup = BeautifulSoup(response.text, 'lxml')
        title = soup.select_one('.PostContent p').text
        content = soup.select('.PostContent p')[1:]
        tmp = ''
        for c in content:
            tmp += c.text
        # fill the item fields defined in items.py and hand the item on
        item['title'] = title
        item['content'] = tmp
        yield item
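With the item yielded, the spider can be run from the project directory and its output written via Scrapy's built-in feed export (the output file name here is just an example):

scrapy crawl test -o mingyan.json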
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class XjbItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    content = scrapy.Field()
pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
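Only the generated header and the ItemAdapter import survive in this excerpt; the pipeline class body is cut off. A JSON-writing pipeline along the lines of the sketch shown earlier for the Study project would work the same way here, provided it is registered under ITEM_PIPELINES in the xjb settings.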
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from bs4 import BeautifulSoup
from xjb import items
soup = BeautifulSoup(response.text, 'lxml')
title = soup.select_one('.PostContent p').text
content = soup.select('.PostContent p')[1:]
tmp = ''
for c in content:
    tmp += c.text
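The fragment above clearly belongs inside a callback, and the imports at the top of this file pull in CrawlSpider, Rule, and LinkExtractor without using them in what survives of the listing. A minimal sketch of how they could drive the same extraction, reusing the .PostHead span h3 region from the earlier spider to restrict link extraction (the spider name and the single start URL are assumptions, not from the original post):

class QuotesCrawlSpider(CrawlSpider):
    name = 'test_crawl'  # assumed name
    allowed_domains = ['lz13.cn']
    start_urls = ['https://www.lz13.cn/lizhi/mingrenmingyan-1.html']

    # follow the same detail-page links that parse() selected by hand
    rules = (
        Rule(LinkExtractor(restrict_css='.PostHead span h3'),
             callback='parse_content', follow=False),
    )

    def parse_content(self, response):
        item = items.XjbItem()
        soup = BeautifulSoup(response.text, 'lxml')
        item['title'] = soup.select_one('.PostContent p').text
        item['content'] = ''.join(p.text for p in soup.select('.PostContent p')[1:])
        yield item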