1.2python爬虫scrapy_图片下载

下载图片

准备工作

1.创建项目

2.创建爬虫

第一种.开始文件配置–pipeline

1.seetings文件

1
2
3
4
5
6
7
8
9
10
11
# Ignore robots.txt so the spider can fetch every page of the target site.
ROBOTSTXT_OBEY = False
# Wait one second between requests to be polite to the server.
DOWNLOAD_DELAY = 1
# Default headers attached to every request; the User-Agent masks Scrapy.
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
}
# Enable the project's item pipeline (replace the placeholders with the
# real project / pipeline class names).
ITEM_PIPELINES = {
    '项目名字.pipelines.管道文件类': 300,
}

2.items文件

1
2
3
4
class items文件类(scrapy.Item):
    """Item holding one image-gallery record (placeholder class name).

    The original snippet assigned the same field name twice, so the second
    assignment silently replaced the first.  The field names here match what
    the pipeline reads via item['category'] and item['urls'].
    """

    # Gallery title text extracted by the spider.
    category = scrapy.Field()
    # List of absolute image URLs for this gallery.
    urls = scrapy.Field()

3.爬虫文件

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
import scrapy
from 项目名.items import items文件类


class Bmw5Spider(scrapy.Spider):
    """Spider that collects a category title plus absolute image URLs per box.

    Yields one item per gallery box; the item is consumed by the custom
    image-download pipeline.
    """

    name = '爬虫名字'
    allowed_domains = ['网站域']
    start_urls = ['爬取的首页']

    def parse(self, response):
        # Mirrors the original slice: the first matched node is skipped.
        ui_boxes = response.xpath('xpath代码')[1:]
        for ui_box in ui_boxes:
            category = ui_box.xpath('xpath代码').get()
            urls = ui_box.xpath('xpath代码').getall()
            # Make every (possibly relative) URL absolute against the response URL.
            urls = [response.urljoin(url) for url in urls]
            # BUG FIX: the original passed the same keyword argument twice,
            # which is a SyntaxError; use the two distinct item fields.
            item = items文件类(category=category, urls=urls)
            yield item

4.pipeline文件

生成文件在images

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
import os
import pypinyin
from urllib import request

class BmwPipeline:
    """Pipeline that downloads each item's image URLs to disk.

    Images are stored under an images/ directory next to the project
    package, in one sub-directory per category; directory names are the
    tone-less pinyin of the (Chinese) category title.
    """

    def __init__(self):
        # images/ directory one level above this file's directory.
        self.images_path = os.path.join(
            os.path.dirname(os.path.dirname(__file__)), 'images')
        self.create_dir(self.images_path)

    def create_dir(self, dir_path):
        """Create dir_path (and any missing parents) if it does not exist."""
        # makedirs with exist_ok is safer than the original
        # exists()+mkdir pair: it creates missing parents and avoids the
        # race between the existence check and the creation.
        os.makedirs(dir_path, exist_ok=True)

    def process_item(self, item, spider):
        """Download every URL in item['urls'] into a per-category folder.

        Returns the item unchanged so later pipelines can process it.
        """
        category = item['category']
        urls = item['urls']
        category_path = os.path.join(
            self.images_path, self._convert_to_pinyin(category))
        self.create_dir(category_path)
        for image_url in urls:
            # Local file name: the part after the last underscore
            # (presumably mirrors the site's URL naming — verify).
            image_name = image_url.split('_')[-1]
            request.urlretrieve(
                image_url, os.path.join(category_path, image_name))
        return item

    def _convert_to_pinyin(self, word):
        """Convert Chinese characters to tone-less pinyin (e.g. 北京 -> beijing)."""
        syllables = [''.join(s) for s in
                     pypinyin.pinyin(word, style=pypinyin.NORMAL)]
        return ''.join(syllables)

pip安装pypinyin

修正:pip install pypinyin

出现错误:

错误:2021-04-11 22:33:59 [scrapy.middleware] WARNING: Disabled PianImgPipeline: ImagesPipeline requires installing Pillow 4.0.0 or later

修正:pip install pillow

第二种.开始文件配置–Images Pipeline(使用scrapy内置的ImagesPipeline)

1.settings文件

1
2
3
4
5
6
import os

# Use Scrapy's built-in ImagesPipeline at the highest priority (1).
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,
}

# Directory where downloaded images are stored: a my_images folder
# one level above this settings file.
IMAGES_STORE = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'my_images')

2.items文件

1
2
3
4
5
import scrapy


class BmwItem(scrapy.Item):
    """Item consumed by Scrapy's built-in ImagesPipeline.

    The original snippet lost its indentation when pasted (invalid Python);
    restored here.  ImagesPipeline expects the URL field to be named
    image_urls and fills the images field itself.
    """

    # Gallery title text (not used by ImagesPipeline itself).
    category = scrapy.Field()
    # Absolute image URLs for ImagesPipeline to download.
    image_urls = scrapy.Field()
    # Populated by ImagesPipeline with per-file download results.
    images = scrapy.Field()

3.爬虫文件

1
2
3
4
5
6
7
8
9
10
11
12
13
14
class BmwSpider(scrapy.Spider):
    """Spider for Autohome BMW 5-series picture galleries.

    Yields items for the built-in ImagesPipeline, which requires the URL
    field to be named image_urls.
    """

    name = 'bmw'
    allowed_domains = ['car.autohome.com.cn']
    start_urls = ['https://car.autohome.com.cn/pic/series/4472.html']

    def parse(self, response):
        # Mirrors the original slice: the first uibox on the page is skipped.
        uiboxs = response.xpath('//div[@class="uibox"]')[1:]
        for uibox in uiboxs:
            category = uibox.xpath('.//div[@class="uibox-title"]/a/text()').get()
            url_list = uibox.xpath('.//ul/li/a/img/@src').getall()
            # urljoin makes the scraped src values absolute against the page URL.
            urls = [response.urljoin(url) for url in url_list]
            # BUG FIX: the snippet referenced an undefined AutohomeItem;
            # the item class defined for this tutorial is BmwItem.
            item = BmwItem(category=category, image_urls=urls)
            yield item

生成在my_images/full文件

参考:(Python爬虫之scrapy下载文件和图片_琴酒网络的博客-CSDN博客_scrapy 下载图片)