創建項目
neo@MacBook-Pro ~/Documents % scrapy startproject photo
neo@MacBook-Pro ~/Documents % cd photo
安裝依賴庫
neo@MacBook-Pro ~/Documents/photo % pip3 install image
創建爬蟲
neo@MacBook-Pro ~/Documents/photo % scrapy genspider jiandan jandan.net
忽略 robots.txt 規則
# Obey robots.txt rules
# Disabled so the crawler is not blocked by jandan.net's robots policy.
ROBOTSTXT_OBEY = False
配置圖片保存路徑與縮圖
# Directory where downloaded images are stored on disk
IMAGES_STORE = '/tmp/photo'
# DOWNLOAD_DELAY = 0.25
# Thumbnail sizes; defining this setting makes the pipeline generate thumbnails
IMAGES_THUMBS = {
    'small': (50, 50),
    'big': (200, 200),
}
加入 get_media_requests() 與 item_completed() 方法
注意:PhotoPipeline(ImagesPipeline) 需要繼承 ImagesPipeline
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import scrapy
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem


class PhotoPipeline(ImagesPipeline):
    """Download every URL in item['image_urls'] and record the local paths.

    Note: must subclass ImagesPipeline so Scrapy's image download/storage
    machinery (IMAGES_STORE, IMAGES_THUMBS) is applied.
    """

    def get_media_requests(self, item, info):
        """Yield one download Request per image URL.

        jandan.net pages use protocol-relative URLs (``//host/path``), so the
        ``http:`` scheme is added only when it is missing; URLs that already
        carry a scheme are passed through unchanged.
        """
        for image_url in item['image_urls']:
            if image_url.startswith('//'):
                image_url = 'http:' + image_url
            yield scrapy.http.Request(image_url)

    def item_completed(self, results, item, info):
        """Store paths of successfully downloaded images on the item.

        Raises:
            DropItem: when no image in the item downloaded successfully.
        """
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item
定義 Item 欄位（items.py）
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class PhotoItem(scrapy.Item):
    """Item carrying image URLs plus the download results filled in by PhotoPipeline."""
    # Links to the images scraped from the page (input to ImagesPipeline)
    image_urls = scrapy.Field()
    # Download metadata populated automatically by ImagesPipeline
    images = scrapy.Field()
    # Local file paths of successfully stored images (set in item_completed)
    image_paths = scrapy.Field()
# -*- coding: utf-8 -*-
import scrapy
from scrapy.loader import ItemLoader
from photo.items import PhotoItem


class JiandanSpider(scrapy.Spider):
    """Crawl jandan.net/ooxx, collecting every <img> src and following pagination."""
    name = 'jiandan'
    # Left empty so requests to off-domain image hosts are not filtered;
    # the stricter original value is kept here for reference:
    # allowed_domains = ['jandan.net']
    allowed_domains = []
    start_urls = ['http://jandan.net/ooxx']

    def parse(self, response):
        """Emit one PhotoItem per page, then follow the pagination link."""
        loader = ItemLoader(item=PhotoItem(), response=response)
        loader.add_xpath('image_urls', '//img//@src')
        yield loader.load_item()
        # "previous-comment-page" is jandan's link to the next (older) page.
        next_page = response.xpath(
            '//a[@class="previous-comment-page"]//@href').extract_first()
        if next_page:
            yield response.follow(next_page, self.parse)

    def parse_page(self, response):
        """Scrape image URLs from a single page (alternative, currently unused callback)."""
        loader = ItemLoader(item=PhotoItem(), response=response)
        loader.add_xpath('image_urls', '//img//@src')
        return loader.load_item()