用到的包
#请求网页
import requests
#创建和切换目录
import os
#解析网页
from bs4 import BeautifulSoup as bs
#selenium模拟浏览器行为
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
初始化
# --- Initialization -------------------------------------------------------
# Create (if needed) and enter the working directory for downloaded skins.
if not os.path.exists('英雄联盟'):
    os.mkdir('英雄联盟')
os.chdir('英雄联盟')

# Ask the user for a hero keyword (part of the hero's title or name).
keyword = input('keyword:')

# Configure Chrome to run headless (no visible window).
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')

# Base URL of the hero data pages; detail links are relative to it.
BASE_URL = 'http://lol.qq.com/data/'

# Request headers used for the plain HTTP image downloads.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
}

# Start the headless Chrome instance driven by selenium.
browser = webdriver.Chrome(executable_path='chromedriver.exe', options=chrome_options)
获取英雄列表
def get_heros():
    """Scrape the hero index page and resolve the user's keyword.

    Returns:
        tuple: ``(hero_list, namepath)`` where *hero_list* is a list of
        ``[title, detail_url]`` pairs for every hero and *namepath* is the
        title of the (last) hero whose title contains ``keyword``.

    Raises:
        ValueError: if no hero title contains ``keyword``.  (The original
        code left ``namepath`` unbound and crashed with a NameError.)
    """
    # Load the hero index with selenium — the list is JS-rendered.
    browser.get('https://lol.qq.com/data/info-heros.shtml')
    soup = bs(browser.page_source, 'lxml')
    anchors = soup.find('ul', class_="imgtextlist").find_all('a')
    # Collect [hero title, absolute detail-page URL] for every hero.
    hero_list = [[a.get('title'), BASE_URL + a.get('href')] for a in anchors]
    # Resolve the keyword; last match wins, matching the original behaviour.
    namepath = None
    for name, _url in hero_list:
        if keyword in name:
            namepath = name
    if namepath is None:
        raise ValueError('no hero title contains keyword: %r' % (keyword,))
    return hero_list, namepath
获取皮肤地址
def skin_list(hero_list):
    """Open the matched hero's detail page and collect its skins.

    Args:
        hero_list: list of ``[title, detail_url]`` pairs from get_heros().

    Returns:
        list: ``[skin_name, full_size_image_url]`` pairs.

    Raises:
        ValueError: if no hero title contains ``keyword``.  (The original
        code left ``html`` unbound and crashed with a NameError.)
    """
    # Pick the detail URL of the matching hero.  The original called
    # browser.get() for every match and only the last page survived, so we
    # keep last-match semantics but fetch the page exactly once.
    target_url = None
    for name, url in hero_list:
        if keyword in name:
            target_url = url
    if target_url is None:
        raise ValueError('no hero title contains keyword: %r' % (keyword,))
    browser.get(target_url)
    soup = bs(browser.page_source, 'lxml')
    # "defail-skin-nav" is the (misspelled) class name used on the site.
    imgs = soup.find('div', class_="defail-skin-nav").find('ul').find_all('img')
    hero_skin = []
    for img in imgs:
        # Strip '/' so the skin name is safe to use as a file name.
        skin_name = img.get('alt').replace('/', '')
        # The page only exposes thumbnail URLs; swapping 'small' for 'big'
        # yields the full-size artwork URL.
        skin_url = img.get('src').replace('small', 'big')
        hero_skin.append([skin_name, skin_url])
    return hero_skin
保存皮肤
def save_skin(hero_skin, namepath):
    """Download every skin image into a directory named after the hero.

    Args:
        hero_skin: list of ``[skin_name, image_url]`` pairs from skin_list().
        namepath: hero title, used as the output directory name.
    """
    # Create the per-hero output directory on first use.
    if not os.path.exists(namepath):
        os.mkdir(namepath)
    for skin_name, skin_url in hero_skin:
        # Plain HTTP is enough for the image files themselves.
        res = requests.get(url=skin_url, headers=headers)
        # NOTE(review): no status check — a 404/50x body would be saved as a
        # .jpg; consider res.raise_for_status() if that matters.
        with open(os.path.join(namepath, skin_name + '.jpg'), 'wb') as f:
            f.write(res.content)
        # (redundant f.close() removed — the with-block already closes it)
主函数和入口
# Main driver for the single-hero download flow.
def main():
    """Fetch the hero list, resolve the keyword, and download its skins."""
    heroes, hero_dir = get_heros()
    skins = skin_list(heroes)
    save_skin(skins, hero_dir)
# Script entry point.
if __name__ == '__main__':
    main()
写在后面
因为怕被封IP所以没有设置批量爬取。 配合多线程可以快速获取全英雄所有皮肤。
批量爬取
使用多线程批量爬取英雄皮肤,暂时丢一下完整代码,后续有时间会添加详细注释。
更新于2020年10月25号。
# -*- coding: utf-8 -*-
"""
Created on Wed Sep 16 21:54:21 2020
@author: Administrator
"""
import requests
#创建和切换目录
import os
#解析网页
from bs4 import BeautifulSoup as bs
#selenium模拟浏览器行为
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import threading
from threading import Lock
import random,time
# --- Batch-mode initialization --------------------------------------------
# Create (if needed) and enter the working directory for downloaded skins.
if not os.path.exists('英雄联盟'):
    os.mkdir('英雄联盟')
os.chdir('英雄联盟')

# Request headers used for the plain HTTP image downloads.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
}

# Rotating pool of HTTP proxies to spread requests over several IPs.
# BUG FIX: one entry used the key 'http:' (trailing colon), which requests
# silently ignores — that request went out without any proxy.
proxies = [{'http': '114.103.168.183:3000'}, {'http': '175.43.156.42:9999'},
           {'http': '218.64.148.47:9000'}, {'http': '221.224.136.211:35101'},
           {'http': '175.42.129.65:9999'}, {'http': '117.64.225.154:1133'}]

# Headless Chrome configuration shared by every browser instance.
chrome_options = Options()
chrome_options.add_argument('--headless')
# Raw string so the backslashes in the Windows path stay literal.
chrome_options.binary_location = r"D:\Google\Chrome\Application\chrome.exe"
chrome_options.add_argument('--disable-gpu')
class Get_LOL(object):
    """Scrapes hero and skin data from lol.qq.com with a headless Chrome.

    Args:
        keyword: hero title/name fragment to match, or the sentinel ``0``
            to request the full hero list only (used by get_all()).
    """

    def __init__(self, keyword):
        self.keyword = keyword
        self.BASE_URL = 'http://lol.qq.com/data/'
        # One dedicated headless browser per instance.
        self.browser = webdriver.Chrome(executable_path='chromedriver.exe', options=chrome_options)

    def get_heros(self):
        """Scrape the hero index page.

        Returns:
            ``(hero_list, namepath)`` when a keyword was given, where
            *namepath* is the title of the (last) matching hero; just
            *hero_list* when the sentinel ``0`` was passed (the browser is
            closed in that case).

        Raises:
            ValueError: if a keyword was given but no hero title matches.
            (The original code left ``namepath`` unbound — NameError.)
        """
        self.browser.get('https://lol.qq.com/data/info-heros.shtml')
        # Give the JS-rendered list a moment to appear.
        time.sleep(1)
        soup = bs(self.browser.page_source, 'lxml')
        anchors = soup.find('ul', class_="imgtextlist").find_all('a')
        hero_list = [[a.get('title'), self.BASE_URL + a.get('href')] for a in anchors]
        if self.keyword != 0:
            # Last match wins, matching the original behaviour.
            namepath = None
            for name, _url in hero_list:
                if self.keyword in name:
                    namepath = name
            if namepath is None:
                raise ValueError('no hero title contains keyword: %r' % (self.keyword,))
            return hero_list, namepath
        # Batch mode: only the list is needed; release the browser now.
        self.browser.quit()
        return hero_list

    def skin_list(self, hero_list):
        """Open the matched hero's detail page and collect its skins.

        Returns:
            list of ``[skin_name, full_size_image_url]`` pairs.

        Raises:
            ValueError: if no hero title matches the keyword.  (The
            original code left ``html`` unbound — NameError.)
        """
        # The original fetched every matching page and kept only the last;
        # keep last-match semantics but fetch that page exactly once.
        target_url = None
        for name, url in hero_list:
            if self.keyword in name:
                print(url)
                target_url = url
        if target_url is None:
            raise ValueError('no hero title contains keyword: %r' % (self.keyword,))
        self.browser.get(target_url)
        time.sleep(1)
        page = self.browser.page_source
        # This is the last page we need; close the browser right away.
        self.browser.quit()
        soup = bs(page, 'lxml')
        # "defail-skin-nav" is the (misspelled) class name used on the site.
        imgs = soup.find('div', class_="defail-skin-nav").find('ul').find_all('img')
        hero_skin = []
        for img in imgs:
            # Strip '/' so the skin name is safe as a file name.
            skin_name = img.get('alt').replace('/', '')
            # Thumbnail URL -> full-size URL by swapping 'small' for 'big'.
            skin_url = img.get('src').replace('small', 'big')
            hero_skin.append([skin_name, skin_url])
        return hero_skin

    def save_skin(self, hero_skin, namepath):
        """Download every skin image into a directory named after the hero."""
        if not os.path.exists(namepath):
            os.mkdir(namepath)
        for skin_name, skin_url in hero_skin:
            # Rotate through the proxy pool to spread the request load.
            res = requests.get(url=skin_url, headers=headers, proxies=random.choice(proxies))
            with open(namepath + '/' + skin_name + '.jpg', 'wb') as f:
                f.write(res.content)
            # (redundant f.close() removed — the with-block closes it)
# Retry hook: if a hero's skin list came back empty, fetch it again.
def restart(i, hero):
    """Re-fetch one hero's skin list with browser slot *i* after a failure."""
    global Browser
    Browser[i].get(hero[1])
    soup = bs(Browser[i].page_source, 'lxml')
    # "defail-skin-nav" is the (misspelled) class name used on the site.
    imgs = soup.find('div', class_="defail-skin-nav").find('ul').find_all('img')
    skins = []
    for img in imgs:
        # Strip '/' so the skin name is safe as a file name.
        name = img.get('alt').replace('/', '')
        # Thumbnail URL -> full-size URL by swapping 'small' for 'big'.
        url = img.get('src').replace('small', 'big')
        skins.append([name, url])
    return skins
def run(i):
    """Worker loop for thread *i*: pop heroes off the shared list and
    download their skins.

    Shares ``hero_split`` (work queue), ``error_list`` (failed heroes) and
    ``Browser`` (per-thread selenium instances) via module-level globals.
    """
    global hero_split
    global error_list
    global Browser
    while True:
        # Stop once the shared work list is exhausted.
        if len(hero_split) == 0:
            break
        try:
            # list.pop(0) is atomic in CPython, so each hero is claimed by
            # exactly one worker thread.
            hero = hero_split.pop(0)
            # Fetch the hero's detail page with this thread's browser.
            Browser[i].get(hero[1])
            html = bs(Browser[i].page_source, 'lxml')
            detail_info = html.find('div', class_="defail-skin-nav").find('ul').find_all('img')
            hero_skin = []
            for item in detail_info:
                # Strip '/' so the skin name is safe as a file name.
                skin_name = item.get('alt').replace('/', '')
                # Thumbnail URL -> full-size URL ('small' -> 'big').
                skin_url = item.get('src').replace('small', 'big')
                hero_skin.append([skin_name, skin_url])
            # Some pages come back empty (not fully rendered) — retry.
            # BUG FIX: the retry counter now resets for every hero; it was
            # previously initialized once per thread, so one bad hero
            # exhausted the retry budget of all later heroes.
            flag = 0
            while len(hero_skin) == 0:
                hero_skin = restart(i, hero)
                flag += 1
                # Give up after 10 attempts and record the failure.
                if flag == 10 and hero not in error_list:
                    error_list.append(hero)
                    break
            if not os.path.exists(hero[0]):
                os.mkdir(hero[0])
            for item in hero_skin:
                # Download through a random proxy from the pool.
                res = requests.get(url=item[1], headers=headers, proxies=random.choice(proxies))
                with open(hero[0] + '/' + item[0] + '.jpg', 'wb') as f:
                    f.write(res.content)
            print("线程-->%d-->%s-->%d个-->下载完成!\n" % (i, hero[0], len(hero_skin)))
        except Exception as e:
            # Log and keep going with the next hero.
            print(e)
# Download a single hero's skins (the non-threaded path).
def get_one(hero):
    """Download all skins for the one hero matching keyword *hero*."""
    scraper = Get_LOL(hero)
    heroes, hero_dir = scraper.get_heros()
    skins = scraper.skin_list(heroes)
    scraper.save_skin(skins, hero_dir)
# Download every hero's skins with three worker threads.
def get_all():
    """Fetch the full hero list, then download all skins in parallel.

    Uses three selenium browsers and three threads; heroes that still have
    no skins after the workers finish are retried one-by-one via get_one().
    """
    global error_list
    global hero_split
    global Browser
    # Keyword sentinel 0: get_heros() returns just the full hero list.
    collector = Get_LOL(0)
    hero_split = collector.get_heros()
    threads = []
    # One dedicated browser per worker thread.
    for _ in range(3):
        Browser.append(webdriver.Chrome(executable_path='chromedriver.exe', options=chrome_options))
    # Spawn the three workers.
    for idx in range(3):
        worker = threading.Thread(target=run, args=(idx,))
        threads.append(worker)
        worker.start()
    # Wait for every worker to drain the shared list.
    for worker in threads:
        worker.join()
    # Release all the browsers.
    for b in Browser:
        b.quit()
    # Retry failed heroes sequentially.
    if len(error_list) != 0:
        print(error_list)
        print('正在使用单一方式下载...')
        for error_hero in error_list:
            # BUG FIX: error_list holds [title, url] pairs; get_one()
            # expects a keyword string.  Passing the pair made
            # ``keyword in title`` raise TypeError — pass the title.
            get_one(error_hero[0])
    print('程序执行完毕!')
# Entry point: choose between single-hero and full batch download.
if __name__ == '__main__':
    mode = input('请选择模式:\n1:下载特定英雄\n2:下载所有英雄\n')
    if mode == '1':
        get_one(input('keyword:'))
    elif mode == '2':
        # Shared mutable state used by the worker threads.
        error_list = []
        Browser = []
        lock = Lock()
        get_all()
    else:
        print('error')