創建項目
neo@MacBook-Pro ~/Documents % scrapy startproject photo
neo@MacBook-Pro ~/Documents % cd photo
安裝依賴庫
neo@MacBook-Pro ~/Documents/photo % pip3 install Pillow
創建爬蟲
neo@MacBook-Pro ~/Documents/photo % scrapy genspider jiandan jandan.net
忽略 robots.txt 規則
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
配置圖片保存路徑與縮圖
# Directory where ImagesPipeline stores the downloaded images
IMAGES_STORE='/tmp/photo'
#DOWNLOAD_DELAY = 0.25
# Thumbnail sizes; defining IMAGES_THUMBS makes the pipeline also generate thumbnails
IMAGES_THUMBS = {
'small': (50, 50),
'big': (200, 200),
}
加入 process_item()與 item_completed() 方法
注意:PhotoPipeline(ImagesPipeline) 需要繼承 ImagesPipeline
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import scrapy
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem
class PhotoPipeline(ImagesPipeline):
    """Image pipeline: downloads every URL in item['image_urls'] and
    records the stored file paths back onto the item."""

    def get_media_requests(self, item, info):
        """Yield one download Request per image URL.

        The crawled pages use protocol-relative URLs ("//host/path"),
        so a scheme is prepended only when it is actually missing —
        unconditionally prefixing 'http:' would corrupt any URL that
        already carries a scheme.
        """
        for image_url in item['image_urls']:
            if image_url.startswith('//'):
                image_url = 'http:' + image_url
            yield scrapy.http.Request(image_url)

    def item_completed(self, results, item, info):
        """Collect the paths of the successfully downloaded images.

        Raises:
            DropItem: when no image in the item could be downloaded.
        """
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item
定義 Item 欄位（items.py）
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class PhotoItem(scrapy.Item):
    """Item carrying the image URLs scraped from a page and the
    download results produced by the images pipeline."""
    # URLs of the images to download (consumed by ImagesPipeline)
    image_urls = scrapy.Field()
    # Download result metadata populated by ImagesPipeline
    images = scrapy.Field()
    # Local paths of the stored images (set in PhotoPipeline.item_completed)
    image_paths = scrapy.Field()
# -*- coding: utf-8 -*-
import scrapy
from scrapy.loader import ItemLoader
from photo.items import PhotoItem
class JiandanSpider(scrapy.Spider):
    """Crawl jandan.net/ooxx, collecting every <img> src on each page
    and following the "previous comment page" link for pagination."""
    name = 'jiandan'
    # Left empty on purpose: the image hosts are protocol-relative CDN
    # domains, and restricting to jandan.net would let the offsite
    # middleware filter them out.
    # allowed_domains = ['jandan.net']
    allowed_domains = []
    start_urls = ['http://jandan.net/ooxx']

    def parse(self, response):
        """Extract all image URLs from the page, then paginate."""
        loader = ItemLoader(item=PhotoItem(), response=response)
        loader.add_xpath('image_urls', '//img//@src')
        yield loader.load_item()
        # Pagination: follow the "previous comment page" link if present.
        next_page = response.xpath(
            '//a[@class="previous-comment-page"]//@href').extract_first()
        if next_page:
            yield response.follow(next_page, self.parse)

    def parse_page(self, response):
        """Extract image URLs from a single page.

        NOTE(review): not referenced by parse(); kept because external
        callers may use it as a callback.
        """
        loader = ItemLoader(item=PhotoItem(), response=response)
        loader.add_xpath('image_urls', '//img//@src')
        return loader.load_item()