1.2python爬虫scrapy_图片下载

下载图片

准备工作

1.创建项目

2.创建爬虫

第一种.开始文件配置–pipeline

1.seetings文件

1
2
3
4
5
6
7
8
9
10
11
# Ignore robots.txt so the spider can fetch every page of the target site.
ROBOTSTXT_OBEY = False
# Wait one second between requests to be polite to the server.
DOWNLOAD_DELAY = 1
# Default headers attached to every request; the User-Agent masks Scrapy.
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
}
# Enable the project's item pipeline (replace the placeholders with the
# real project / pipeline class names).
ITEM_PIPELINES = {
    '项目名字.pipelines.管道文件类': 300,
}

2.items文件

1
2
3
4
class items文件类(scrapy.Item):
    """Item holding one image-gallery record (placeholder class name).

    The original snippet assigned the same field name twice, so the second
    assignment silently replaced the first.  The field names here match what
    the pipeline reads via item['category'] and item['urls'].
    """

    # Gallery title text extracted by the spider.
    category = scrapy.Field()
    # List of absolute image URLs for this gallery.
    urls = scrapy.Field()

3.爬虫文件

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
import scrapy
from 项目名.items import items文件类


class Bmw5Spider(scrapy.Spider):
    """Spider that collects a category title plus absolute image URLs per box.

    Yields one item per gallery box; the item is consumed by the custom
    image-download pipeline.
    """

    name = '爬虫名字'
    allowed_domains = ['网站域']
    start_urls = ['爬取的首页']

    def parse(self, response):
        # Mirrors the original slice: the first matched node is skipped.
        ui_boxes = response.xpath('xpath代码')[1:]
        for ui_box in ui_boxes:
            category = ui_box.xpath('xpath代码').get()
            urls = ui_box.xpath('xpath代码').getall()
            # Make every (possibly relative) URL absolute against the response URL.
            urls = [response.urljoin(url) for url in urls]
            # BUG FIX: the original passed the same keyword argument twice,
            # which is a SyntaxError; use the two distinct item fields.
            item = items文件类(category=category, urls=urls)
            yield item

4.pipeline文件

生成文件在images

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
import os
import pypinyin
from urllib import request

class BmwPipeline:
    """Pipeline that downloads each item's image URLs to disk.

    Images are stored under an images/ directory next to the project
    package, in one sub-directory per category; directory names are the
    tone-less pinyin of the (Chinese) category title.
    """

    def __init__(self):
        # images/ directory one level above this file's directory.
        self.images_path = os.path.join(
            os.path.dirname(os.path.dirname(__file__)), 'images')
        self.create_dir(self.images_path)

    def create_dir(self, dir_path):
        """Create dir_path (and any missing parents) if it does not exist."""
        # makedirs with exist_ok is safer than the original
        # exists()+mkdir pair: it creates missing parents and avoids the
        # race between the existence check and the creation.
        os.makedirs(dir_path, exist_ok=True)

    def process_item(self, item, spider):
        """Download every URL in item['urls'] into a per-category folder.

        Returns the item unchanged so later pipelines can process it.
        """
        category = item['category']
        urls = item['urls']
        category_path = os.path.join(
            self.images_path, self._convert_to_pinyin(category))
        self.create_dir(category_path)
        for image_url in urls:
            # Local file name: the part after the last underscore
            # (presumably mirrors the site's URL naming — verify).
            image_name = image_url.split('_')[-1]
            request.urlretrieve(
                image_url, os.path.join(category_path, image_name))
        return item

    def _convert_to_pinyin(self, word):
        """Convert Chinese characters to tone-less pinyin (e.g. 北京 -> beijing)."""
        syllables = [''.join(s) for s in
                     pypinyin.pinyin(word, style=pypinyin.NORMAL)]
        return ''.join(syllables)

pip安装pypinyin

修正:pip install pypinyin

出现错误:

错误:2021-04-11 22:33:59 [scrapy.middleware] WARNING: Disabled PianImgPipeline: ImagesPipeline requires installing Pillow 4.0.0 or later

修正:pip install pillow

第二种.开始文件配置–Images Pipeline(使用scrapy内置的ImagesPipeline)

1.settings文件

1
2
3
4
5
6
import os

# Use Scrapy's built-in ImagesPipeline at the highest priority (1).
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,
}

# Directory where downloaded images are stored: a my_images folder
# one level above this settings file.
IMAGES_STORE = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'my_images')

2.items文件

1
2
3
4
5
import scrapy


class BmwItem(scrapy.Item):
    """Item consumed by Scrapy's built-in ImagesPipeline.

    The original snippet lost its indentation when pasted (invalid Python);
    restored here.  ImagesPipeline expects the URL field to be named
    image_urls and fills the images field itself.
    """

    # Gallery title text (not used by ImagesPipeline itself).
    category = scrapy.Field()
    # Absolute image URLs for ImagesPipeline to download.
    image_urls = scrapy.Field()
    # Populated by ImagesPipeline with per-file download results.
    images = scrapy.Field()

3.爬虫文件

1
2
3
4
5
6
7
8
9
10
11
12
13
14
class BmwSpider(scrapy.Spider):
    """Spider for Autohome BMW 5-series picture galleries.

    Yields items for the built-in ImagesPipeline, which requires the URL
    field to be named image_urls.
    """

    name = 'bmw'
    allowed_domains = ['car.autohome.com.cn']
    start_urls = ['https://car.autohome.com.cn/pic/series/4472.html']

    def parse(self, response):
        # Mirrors the original slice: the first uibox on the page is skipped.
        uiboxs = response.xpath('//div[@class="uibox"]')[1:]
        for uibox in uiboxs:
            category = uibox.xpath('.//div[@class="uibox-title"]/a/text()').get()
            url_list = uibox.xpath('.//ul/li/a/img/@src').getall()
            # urljoin makes the scraped src values absolute against the page URL.
            urls = [response.urljoin(url) for url in url_list]
            # BUG FIX: the snippet referenced an undefined AutohomeItem;
            # the item class defined for this tutorial is BmwItem.
            item = BmwItem(category=category, image_urls=urls)
            yield item

生成在my_images/full文件

参考:(Python爬虫之scrapy下载文件和图片_琴酒网络的博客-CSDN博客_scrapy 下载图片)