两种方式抓取王者荣耀皮肤


写在前面

抓完了LOL,再来试试王者荣耀,单纯写scrapy的话没什么意思,就用requests和scrapy分别实现了抓取功能,并记录一个需要解决的问题。

Scrapy

创建项目

scrapy startproject wzryskin
cd wzryskin
scrapy genspider wzryskinspider "pvp.qq.com"

items.py

import scrapy

class WzryskinItem(scrapy.Item):
    """Container for one hero skin: hero name, skin title and image URL."""
    hero_name = scrapy.Field()  # hero's Chinese display name (cname)
    skin_name = scrapy.Field()  # skin title with '/' characters stripped
    skin_url = scrapy.Field()   # one-element list holding the big-skin image URL

wzryskinspider.py

import scrapy
from wzryskin.items import WzryskinItem
from bs4 import BeautifulSoup as bs

class WzryskinspiderSpider(scrapy.Spider):
    """Fetch the hero list, visit each hero's detail page, and yield one
    item per skin with the corresponding big-skin image URL."""

    name = 'wzryskinspider'
    allowed_domains = ['pvp.qq.com', 'game.gtimg.cn']
    start_urls = ['https://pvp.qq.com/web201605/js/herolist.json']

    def parse(self, response):
        """Schedule one detail-page request per hero in herolist.json."""
        detail_url = 'https://pvp.qq.com/web201605/herodetail/{}.shtml'
        # herolist.json is a JSON array of hero dicts (keys include
        # 'ename' — numeric id — and 'cname' — display name).
        for hero in response.json():
            yield scrapy.Request(
                url=detail_url.format(hero['ename']),
                callback=self.get_skins,
                meta={'hero': hero},
            )

    def get_skins(self, response):
        """Parse a hero detail page and yield a WzryskinItem per skin."""
        hero = response.meta['hero']
        img_url = 'https://game.gtimg.cn/images/yxzj/img201606/skin/hero-info/{}/{}-bigskin-{}.jpg'
        soup = bs(response.text, 'lxml')
        # The <ul> inside div.pic-pf carries every skin name in its
        # data-imgname attribute, encoded like "name1&0|name2&0|...".
        raw_names = soup.find('div', class_='pic-pf').find('ul').get('data-imgname')
        names = [part.split('&')[0] for part in raw_names.split('|')]
        # Skin image indices on the CDN are 1-based.
        for index, skin_name in enumerate(names, start=1):
            item = WzryskinItem()
            item['hero_name'] = hero['cname']
            item['skin_name'] = skin_name.replace('/', '')
            item['skin_url'] = [img_url.format(hero['ename'], hero['ename'], index)]
            yield item

pipelines.py

from itemadapter import ItemAdapter
from scrapy.pipelines.images import ImagesPipeline
from scrapy.http import Request

class WzryskinPipeline:
    """Default pass-through pipeline; items are forwarded unchanged."""

    def process_item(self, item, spider):
        # Nothing to do here — image downloading lives in ImgPipeline.
        return item

class ImgPipeline(ImagesPipeline):
    """ImagesPipeline subclass that stores each skin as <hero>/<skin>.jpg."""

    def get_media_requests(self, item, info):
        """Build one download Request per URL in the item's image-URL field.

        The hero and skin names ride along in request.meta so that
        file_path() can compute the target filename without the item.
        """
        urls = ItemAdapter(item).get(self.images_urls_field, [])
        return [
            Request(u, meta={"hero_name": item["hero_name"],
                             "skin_name": item['skin_name']})
            for u in urls
        ]

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the relative storage path '<hero_name>/<skin_name>.jpg'.

        NOTE: `item=None` is accepted (and unused) for compatibility with
        Scrapy >= 2.4, which passes the item as a keyword argument; the old
        three-argument signature is deprecated there and eventually fails.
        """
        hero_name = request.meta['hero_name']
        skin_name = request.meta['skin_name']
        return '{}/{}.jpg'.format(hero_name, skin_name)

settings.py(修改配置)

# Project identity.
BOT_NAME = 'wzryskin'

SPIDER_MODULES = ['wzryskin.spiders']
NEWSPIDER_MODULE = 'wzryskin.spiders'

# Do not consult robots.txt before crawling.
ROBOTSTXT_OBEY = False

# Small delay between requests to avoid hammering the server.
DOWNLOAD_DELAY = 0.25

# No session cookies are needed for these static pages.
COOKIES_ENABLED = False

# Enable the custom middleware that injects a random User-Agent header.
DOWNLOADER_MIDDLEWARES = {
    'wzryskin.middlewares.WzryskinDownloaderMiddleware': 543,
}

# Route every item through the image-download pipeline.
ITEM_PIPELINES = {
    'wzryskin.pipelines.ImgPipeline': 300,
}
# Root directory where downloaded images are stored.
IMAGES_STORE = '王者荣耀'

# Item field that holds the list of image URLs to download.
IMAGES_URLS_FIELD = 'skin_url'

# Item field where the pipeline writes its download results.
IMAGES_RESULT_FIELD = 'skin_name'

middlewares.py

#主要修改一个方法
from fake_useragent import UserAgent
def process_request(self, request, spider):
    """Attach a freshly randomized User-Agent header to each outgoing request."""
    request.headers['User-Agent'] = UserAgent().random
    # Returning None tells Scrapy to keep processing this request normally.
    return None

requests

# -*- coding: utf-8 -*-
# Folder handling and throttling
import os, time
# HTTP requests
import requests
# Randomized request headers
from fake_useragent import UserAgent
# HTML parsing
from bs4 import BeautifulSoup as bs
# Progress bar
from tqdm import tqdm

# Build a request header with a random User-Agent.
headers = {'User-Agent': UserAgent().random}
# Create the root output directory; exist_ok avoids the check-then-create race.
os.makedirs('王者荣耀', exist_ok=True)
# URL of the hero list (a JSON array of hero dicts).
url_list = 'https://pvp.qq.com/web201605/js/herolist.json'
res = requests.get(url=url_list, headers=headers)
hero_lists = res.json()
for hero in hero_lists:
    # Hero detail page, addressed by the numeric hero id ('ename').
    url_detail = 'https://pvp.qq.com/web201605/herodetail/{}.shtml'.format(hero['ename'])
    res = requests.get(url=url_detail, headers=headers)
    # The page's declared charset is unreliable; trust the detected one.
    res.encoding = res.apparent_encoding
    bf = bs(res.text, 'lxml')
    # All skin names live in the data-imgname attribute of the <ul> inside
    # div.pic-pf, encoded as "name1&0|name2&0|...".
    skins = bf.find('div', class_='pic-pf').find('ul').get('data-imgname')
    skin_list = [x.split('&')[0] for x in skins.split('|')]
    # One folder per hero — create it once, before the download loop,
    # instead of re-checking on every iteration.
    hero_dir = os.path.join('王者荣耀', hero['cname'])
    os.makedirs(hero_dir, exist_ok=True)
    # Download each skin with a progress bar.
    for i in tqdm(range(len(skin_list)), desc='正在下载 {}'.format(hero['cname']), ncols=80):
        # Skin image indices on the CDN are 1-based.
        url_skin = ('https://game.gtimg.cn/images/yxzj/img201606/skin/'
                    'hero-info/{0}/{0}-bigskin-{1}.jpg').format(hero['ename'], i + 1)
        res = requests.get(url=url_skin, headers=headers)
        # Context manager guarantees the file handle is closed (the original
        # open(...).write(...) leaked the descriptor).
        with open(os.path.join(hero_dir, '{}.jpg'.format(skin_list[i])), 'wb') as f:
            f.write(res.content)
    # Pause one second between heroes to be polite to the server.
    time.sleep(1)

写在后面

两种方法得到的结果差不多,但是scrapy抓取到的图片是压缩过的,暂时还不知道如何绕过。为什么scrapy没有注释呢?偷个懒吧。。。有空再写