Foreword
Having scraped the LOL skins already, this time it's 王者荣耀 (Honor of Kings). Writing plain Scrapy alone felt a bit pointless, so I implemented the scraper twice, once with requests and once with Scrapy, and recorded one problem that still needs solving.
Scrapy
Create the project
scrapy startproject wzryskin
cd wzryskin
scrapy genspider wzryskinspider "pvp.qq.com"
items.py
import scrapy

class WzryskinItem(scrapy.Item):
    hero_name = scrapy.Field()  # hero display name (cname), used as the folder name
    skin_name = scrapy.Field()  # skin name, used as the file name
    skin_url = scrapy.Field()   # list holding the full-size skin image URL
wzryskinspider.py
import scrapy
from bs4 import BeautifulSoup as bs

from wzryskin.items import WzryskinItem

class WzryskinspiderSpider(scrapy.Spider):
    name = 'wzryskinspider'
    allowed_domains = ['pvp.qq.com', 'game.gtimg.cn']
    # herolist.json lists every hero with a numeric id (ename) and a display name (cname)
    start_urls = ['https://pvp.qq.com/web201605/js/herolist.json']

    def parse(self, response):
        hero_lists = response.json()
        for hero in hero_lists:
            # build each hero's detail page from its numeric id
            url = 'https://pvp.qq.com/web201605/herodetail/{}.shtml'
            heroid = hero['ename']
            yield scrapy.Request(url=url.format(heroid), callback=self.get_skins, meta={'hero': hero})

    def get_skins(self, response):
        hero = response.meta['hero']
        url = 'https://game.gtimg.cn/images/yxzj/img201606/skin/hero-info/{}/{}-bigskin-{}.jpg'
        # the skin names live in the ul's data-imgname attribute
        bf = bs(response.text, 'lxml')
        skins = bf.find('div', class_='pic-pf').find('ul').get('data-imgname')
        skin_list = [x.split('&')[0] for x in skins.split('|')]
        for i, skin_name in enumerate(skin_list):
            item = WzryskinItem()
            item['hero_name'] = hero['cname']
            # strip '/' so a skin name cannot break the save path
            item['skin_name'] = skin_name.replace('/', '')
            # the image pipeline expects a list of URLs; skins are numbered from 1
            item['skin_url'] = [url.format(hero['ename'], hero['ename'], i + 1)]
            yield item
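To see what the string handling in get_skins is doing: data-imgname is a '|'-separated list of name&number pairs, and the list comprehension keeps just the names. A standalone check, using a made-up sample value in the same shape as the real attribute:

# hypothetical data-imgname value, shaped like the real attribute
skins = '孙悟空&0|西部大镖客&0|地狱火&0'
skin_list = [x.split('&')[0] for x in skins.split('|')]
print(skin_list)  # ['孙悟空', '西部大镖客', '地狱火']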
pipelines.py
from itemadapter import ItemAdapter
from scrapy.http import Request
from scrapy.pipelines.images import ImagesPipeline

class WzryskinPipeline:
    def process_item(self, item, spider):
        return item

class ImgPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # read the URL list from the field named by IMAGES_URLS_FIELD
        urls = ItemAdapter(item).get(self.images_urls_field, [])
        # carry the names along in meta so file_path can build the save path
        return [Request(u, meta={'hero_name': item['hero_name'], 'skin_name': item['skin_name']}) for u in urls]

    def file_path(self, request, response=None, info=None, *, item=None):
        # save as <hero>/<skin>.jpg under IMAGES_STORE
        hero_name = request.meta['hero_name']
        skin_name = request.meta['skin_name']
        return '{}/{}.jpg'.format(hero_name, skin_name)
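For reference, file_path combines with the IMAGES_STORE setting below, so each skin ends up at 王者荣耀/<hero_name>/<skin_name>.jpg relative to wherever the crawl is run.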
settings.py (modified settings)
BOT_NAME = 'wzryskin'
SPIDER_MODULES = ['wzryskin.spiders']
NEWSPIDER_MODULE = 'wzryskin.spiders'
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 0.25
COOKIES_ENABLED = False
DOWNLOADER_MIDDLEWARES = {
    'wzryskin.middlewares.WzryskinDownloaderMiddleware': 543,
}
ITEM_PIPELINES = {
    'wzryskin.pipelines.ImgPipeline': 300,
}
IMAGES_STORE = '王者荣耀'
IMAGES_URLS_FIELD = 'skin_url'
# note: after each download the pipeline writes its results back into this
# field, overwriting skin_name; harmless here since the file is already saved
IMAGES_RESULT_FIELD = 'skin_name'
middlewares.py
# only one generated method needs changing: attach a random User-Agent to every request
from fake_useragent import UserAgent

class WzryskinDownloaderMiddleware:
    # ...the rest of the generated class stays unchanged...
    def process_request(self, request, spider):
        request.headers['User-Agent'] = UserAgent().random
        return None
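With the middleware in place, start the crawl from the project root:
scrapy crawl wzryskinspider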
requests
# -*- coding: utf-8 -*-
# folders and sleeping
import os, time
# HTTP requests
import requests
# fake request headers
from fake_useragent import UserAgent
# HTML parsing
from bs4 import BeautifulSoup as bs
# progress bar
from tqdm import tqdm

# build a request header with a random User-Agent
headers = {'User-Agent': UserAgent().random}
# create the root folder
os.makedirs('王者荣耀', exist_ok=True)
# url of the hero list
url_list = 'https://pvp.qq.com/web201605/js/herolist.json'
# fetch the hero list
res = requests.get(url=url_list, headers=headers)
# parse the response as JSON (it is actually a list, not a dict)
hero_lists = res.json()
# iterate over the heroes
for hero in hero_lists:
    # build the hero detail page url
    url_detail = 'https://pvp.qq.com/web201605/herodetail/{}.shtml'.format(hero['ename'])
    # fetch the page
    res = requests.get(url=url_detail, headers=headers)
    # fix the page encoding
    res.encoding = res.apparent_encoding
    # parse the page
    bf = bs(res.text, 'lxml')
    # the skin names live in the ul's data-imgname attribute
    skins = bf.find('div', class_='pic-pf').find('ul').get('data-imgname')
    # build this hero's skin list; strip '/' so names stay path-safe
    skin_list = [x.split('&')[0].replace('/', '') for x in skins.split('|')]
    # create this hero's folder once
    os.makedirs('王者荣耀/{}'.format(hero['cname']), exist_ok=True)
    # download each skin with a progress bar
    for i in tqdm(range(len(skin_list)), desc='正在下载 {}'.format(hero['cname']), ncols=80):
        # build the skin image url (skins are numbered from 1)
        url_skin = 'https://game.gtimg.cn/images/yxzj/img201606/skin/hero-info/{0}/{0}-bigskin-{1}.jpg'.format(hero['ename'], i + 1)
        # fetch the image
        res = requests.get(url=url_skin, headers=headers)
        # save the image; the with-block closes the file handle properly
        with open('王者荣耀/{}/{}.jpg'.format(hero['cname'], skin_list[i]), 'wb') as f:
            f.write(res.content)
        # pause for 1 second to be polite to the server
        time.sleep(1)
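One hardening tweak worth considering (my addition, not part of the original script): give each requests.get a timeout and raise on bad status codes, so a single stalled or failed download cannot hang the run or write an error page to disk. A drop-in replacement for the image request above:

# abort a stalled request after 10 seconds and raise on HTTP error codes
res = requests.get(url=url_skin, headers=headers, timeout=10)
res.raise_for_status()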
Afterword
Both approaches end up with much the same result, with one difference: the images saved through Scrapy are compressed, and I have not yet worked out how to avoid that.
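For anyone hitting the same wall: a likely cause is that Scrapy's ImagesPipeline pushes every download through Pillow and re-encodes it as JPEG, which is lossy. A possible workaround, sketched here but untested against this site, is to build the pipeline on FilesPipeline instead, which writes the response bytes to disk unchanged (RawImgPipeline is my own name for it):

from itemadapter import ItemAdapter
from scrapy.http import Request
from scrapy.pipelines.files import FilesPipeline

# sketch (untested): FilesPipeline stores raw bytes, so no JPEG re-encoding
class RawImgPipeline(FilesPipeline):
    def get_media_requests(self, item, info):
        adapter = ItemAdapter(item)
        return [
            Request(u, meta={'hero_name': adapter['hero_name'],
                             'skin_name': adapter['skin_name']})
            for u in adapter.get('skin_url', [])
        ]

    def file_path(self, request, response=None, info=None, *, item=None):
        # same <hero>/<skin>.jpg layout as before, now under FILES_STORE
        return '{}/{}.jpg'.format(request.meta['hero_name'], request.meta['skin_name'])

Switching over would also mean pointing ITEM_PIPELINES at RawImgPipeline and setting FILES_STORE in place of IMAGES_STORE.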