下载图片
准备工作
1.创建项目
2.创建爬虫
第一种.开始文件配置–pipeline
1.settings文件
1 2 3 4 5 6 7 8 9 10 11
# Scrapy settings for the hand-written image-download pipeline approach.

# Ignore robots.txt so the crawl is not blocked by the site's robot rules.
ROBOTSTXT_OBEY = False

# Pause one second between requests to stay polite to the target site.
DOWNLOAD_DELAY = 1

# Headers attached to every request; the User-Agent mimics a desktop
# Chrome browser so the server returns regular HTML pages.
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
}

# Register the project's pipeline class (placeholder dotted path — replace
# with the real project/module/class names); 300 is its priority.
ITEM_PIPELINES = {
    '项目名字.pipelines.管道文件类': 300,
}
|
2.items文件
1 2 3 4
class items文件类(scrapy.Item):
    """Item holding one image category and its list of image URLs.

    BUG FIX: the original template declared the same field name twice, so
    the first declaration was silently overwritten.  The two fields need
    distinct names; they are named to match what the pipeline reads
    (item['category'] and item['urls']).
    """
    # Category title text scraped from the page.
    category = scrapy.Field()
    # Absolute image URLs belonging to this category.
    urls = scrapy.Field()
|
3.爬虫文件
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
import scrapy
from 项目名.items import items文件类


class Bmw5Spider(scrapy.Spider):
    """Spider template: yields one item per image category found on the page.

    Placeholder strings ('爬虫名字', 'xpath代码', ...) must be replaced with
    real values for a concrete project.
    """
    name = '爬虫名字'
    allowed_domains = ['网站域']
    start_urls = ['爬取的首页']

    def parse(self, response):
        # Skip the first matched box — on the target page it is not an
        # image category (presumably a header box; confirm per site).
        ui_boxes = response.xpath('xpath代码')[1:]
        for ui_box in ui_boxes:
            category = ui_box.xpath('xpath代码').get()
            urls = ui_box.xpath('xpath代码').getall()
            # Turn relative image paths into absolute URLs.
            urls = [response.urljoin(url) for url in urls]
            # BUG FIX: the original passed the same keyword argument twice
            # (items文件类变量=... repeated), which is a SyntaxError; each
            # field gets its own keyword, matching the item definition.
            item = items文件类(category=category, urls=urls)
            yield item
|
4.pipeline文件
下载的图片文件生成在 images 目录中
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
| import os import pypinyin from urllib import request
class BmwPipeline:
    """Item pipeline that downloads each item's images into per-category
    directories under <project root>/images.

    Category names are transliterated from Chinese to pinyin so the
    directory names are plain ASCII.
    """

    def __init__(self):
        # <project root>/images — one level above the package directory.
        self.images_path = os.path.join(
            os.path.dirname(os.path.dirname(__file__)), 'images')
        self.create_dir(self.images_path)

    def create_dir(self, dir_path):
        """Create *dir_path* (and any missing parents) if it does not exist.

        BUG FIX: the original used exists() + os.mkdir(), which is racy and
        fails when intermediate directories are missing; makedirs with
        exist_ok=True handles both cases atomically enough for this use.
        """
        os.makedirs(dir_path, exist_ok=True)

    def process_item(self, item, spider):
        """Download every URL in item['urls'] into a directory named after
        the pinyin form of item['category']; return the item unchanged."""
        category = item['category']
        urls = item['urls']
        category_path = os.path.join(
            self.images_path, self._convert_to_pinyin(category))
        self.create_dir(category_path)
        for image_url in urls:
            # File name is everything after the last '_' in the URL —
            # assumes the site's URL naming scheme; TODO confirm it holds.
            image_name = image_url.split('_')[-1]
            request.urlretrieve(
                image_url, os.path.join(category_path, image_name))
        return item

    def _convert_to_pinyin(self, word):
        """Transliterate a Chinese string to a plain pinyin string,
        e.g. '车身' -> 'cheshen'.  (Typo fix: parameter was 'world'.)"""
        syllables = pypinyin.pinyin(word, style=pypinyin.NORMAL)
        return ''.join(''.join(part) for part in syllables)
|
pip安装pypinyin
修正:pip install pypinyin
出现错误:
错误:2021-04-11 22:33:59 [scrapy.middleware] WARNING: Disabled PianImgPipeline: ImagesPipeline requires installing Pillow 4.0.0 or later
修正:pip install pillow
第二种.开始文件配置–Files Pipeline
1.settings文件
1 2 3 4 5 6
import os

# Use Scrapy's built-in ImagesPipeline (priority 1) instead of a
# hand-written pipeline.
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,
}

# Directory where ImagesPipeline stores the downloaded files
# (<project root>/my_images).
IMAGES_STORE = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'my_images')
|
2.items文件
1 2 3 4 5
import scrapy


class BmwItem(scrapy.Item):
    """Item consumed by Scrapy's built-in ImagesPipeline.

    The names 'image_urls' and 'images' are the field names ImagesPipeline
    expects (see the settings registering scrapy.pipelines.images.ImagesPipeline).
    """
    # Category title the images belong to.
    category = scrapy.Field()
    # Absolute image URLs for ImagesPipeline to download.
    image_urls = scrapy.Field()
    # Filled in by ImagesPipeline with the download results.
    images = scrapy.Field()
|
3.爬虫文件
1 2 3 4 5 6 7 8 9 10 11 12 13 14
class BmwSpider(scrapy.Spider):
    """Spider for autohome.com.cn picture pages: yields one BmwItem per
    image category, each carrying absolute image URLs for ImagesPipeline."""
    name = 'bmw'
    allowed_domains = ['car.autohome.com.cn']
    start_urls = ['https://car.autohome.com.cn/pic/series/4472.html']

    def parse(self, response):
        # The first uibox on the page is not an image category — skip it.
        uiboxs = response.xpath('//div[@class="uibox"]')[1:]
        for uibox in uiboxs:
            category = uibox.xpath('.//div[@class="uibox-title"]/a/text()').get()
            url_list = uibox.xpath('.//ul/li/a/img/@src').getall()
            # Make the protocol-relative src values absolute.
            urls = [response.urljoin(url) for url in url_list]
            # BUG FIX: the original instantiated AutohomeItem, but the items
            # file defines BmwItem; 'image_urls' is the field name required
            # by Scrapy's ImagesPipeline.
            item = BmwItem(category=category, image_urls=urls)
            yield item
|
下载的图片生成在 my_images/full 目录中
参考：《Python爬虫之scrapy下载文件和图片》（琴酒网络的博客，CSDN博客，关键词：scrapy 下载图片）