Packages used
# requests fetches the web pages
import requests
# re for regex matching, os for directory handling, json for loading strings as dicts
import re, os, json
# BeautifulSoup parses the pages
from bs4 import BeautifulSoup as bs
# moviepy merges the stream segments
from moviepy.editor import *
# random picks a random anonymous proxy
import random
Initialization
# Prompt for the movie keyword
keyword = input('Enter a keyword: ')
# Create a directory for this movie
if not os.path.exists(keyword):
    os.mkdir(keyword)
# Switch into that directory
os.chdir('./' + keyword)
# Base URL of the dytt (Movie Heaven) site
BASE_URL = 'http://www.dytt.com'
# Request headers that mimic a browser
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36'
}
# Pool of anonymous proxy addresses
proxies = [{'http': '171.12.115.205:9999'}, {'http': '123.149.136.182:9999'},
           {'http': '121.232.148.67:9000'}, {'http': '123.163.27.182:9999'},
           {'http': '171.13.136.235:9999'}, {'http': '123.149.137.132:9999'}]
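Free proxies like these go stale quickly, so it can pay to weed out dead ones up front. Below is a minimal liveness check; the check_proxy helper and the httpbin.org test URL are illustrative choices, not part of the original script.

# Hypothetical helper: return True if the proxy can still fetch a test page.
def check_proxy(proxy, timeout=5):
    try:
        r = requests.get('http://httpbin.org/ip', proxies=proxy, timeout=timeout)
        return r.status_code == 200
    except requests.RequestException:
        return False

# Filter the pool once at startup; fall back to the full list if none pass.
live_proxies = [p for p in proxies if check_proxy(p)] or proxies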
Search results
def Get_Search():
    # Build the search-page URL
    Search_URL = 'http://www.dytt.com/vodsearch/-------------.html?wd={}'.format(keyword)
    # Two empty lists to hold the parsed results
    movie_url = []
    movie_title = []
    # Request the search page
    res = requests.get(url=Search_URL, headers=headers, proxies=random.choice(proxies))
    # Set the page encoding
    res.encoding = 'utf-8'
    # Parse the page with bs4 and walk the matched cells
    movie_info = bs(res.text, 'lxml').find_all('td', class_="textLeft")
    for i in range(len(movie_info)):
        # Every other <td> holds the link we want
        if i % 2 == 0:
            # Collect the target URL
            movie_url.append(BASE_URL + movie_info[i].find('a').get('href'))
            # Collect the target title
            movie_title.append(movie_info[i].find('a').text)
    # Return both result lists
    return movie_url, movie_title
Online address
def Get_Download(url):
    # Request the page of the selected entry
    res = requests.get(url=url, headers=headers, proxies=random.choice(proxies))
    # Set the encoding
    res.encoding = 'utf-8'
    # Parse the page
    bf = bs(res.text, 'lxml')
    # Online playback address
    online_url = BASE_URL + bf.find('ul', class_="dy_bofangul").find('a').get('href')
    # Thunder (Xunlei) download links
    thunder_url = bf.find('div', class_='downList').find_all('a', class_='donw_xl thunder-link')
    # Return both kinds of address
    return online_url, thunder_url
Stream URLs
# From the online playback address, fetch the page that holds the master stream URL
def Get_mp4Url(url):
    # Request the playback page
    res = requests.get(url=url, headers=headers, proxies=random.choice(proxies))
    res.encoding = 'utf-8'
    # Parse the page
    bf = bs(res.text, 'lxml')
    # The data we want sits in a <script> inside this div
    string = bf.find('div', style='height: 400px;').find('script')
    # Regex-match the content wrapped in {}
    pattern = re.compile('({.*?})')
    dict_temp = re.findall(pattern, str(string))
    # Load it as a dict and pull out the url field
    mp4_flow = json.loads(dict_temp[0])['url']
    # Return the master stream URL
    return mp4_flow
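To make that extraction step concrete, here is the same regex-plus-json logic run against a made-up script snippet; the player config shown is hypothetical and only illustrates the shape of the data.

import re, json

# Hypothetical player config as it might appear inside the <script> tag.
sample = 'var player = {"url":"https://example.com/stream/index.m3u8","type":"hls"};'
dict_temp = re.findall(re.compile('({.*?})'), sample)
print(json.loads(dict_temp[0])['url'])  # https://example.com/stream/index.m3u8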
# From the master URL, build the list of every stream-segment URL
def Get_Flowtxt(url):
    # Empty string for the rebuilt base path
    last_url = ''
    # Split the master URL and rejoin everything except the file name
    temp = url.split('/')
    for item in temp[:-1]:
        last_url = last_url + item + '/'
    # Request the master playlist
    res = requests.get(url=url, headers=headers, proxies=random.choice(proxies))
    res.encoding = 'utf-8'
    # The rest of the second-level URL sits on the last line of the text
    final_url = last_url + res.text.split('\n')[-1]
    # Rebuild the base path from the second-level URL
    last_url = ''
    temp = final_url.split('/')
    for item in temp[:-1]:
        last_url = last_url + item + '/'
    # Request the full second-level playlist
    r = requests.get(url=final_url, headers=headers, proxies=random.choice(proxies))
    r.encoding = 'utf-8'
    # Regex-match every string that starts with "be" and ends with ".ts"
    pattern = re.compile(r'(be.*?\.ts)\n#', re.S)
    info = re.findall(pattern, r.text)
    # Join every element of info onto the base path
    for i in range(len(info)):
        info[i] = last_url + info[i]
    # Return the list of segment URLs
    return info
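Both files parsed here are ordinary HLS playlists: the master playlist names a second-level playlist, which in turn lists the .ts segments. The sketch below runs the same segment regex on a made-up media playlist to show what the pattern picks out; the playlist content is hypothetical.

import re

# Hypothetical second-level (media) playlist.
m3u8_text = (
    '#EXTM3U\n'
    '#EXT-X-TARGETDURATION:10\n'
    '#EXTINF:10,\n'
    'be0000.ts\n'
    '#EXTINF:10,\n'
    'be0001.ts\n'
    '#EXT-X-ENDLIST'
)
print(re.findall(re.compile(r'(be.*?\.ts)\n#', re.S), m3u8_text))
# ['be0000.ts', 'be0001.ts']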
Saving as one file
def Save_movie(urls, filename):
    # Open (and create) a single file to hold all the data; the with block closes it
    with open(filename + '.mp4', 'ab') as f:
        # Walk the segment list
        for i in range(len(urls)):
            try:
                # Fetch each segment in turn
                res = requests.get(url=urls[i], headers=headers, proxies=random.choice(proxies))
                # Write the data to the file
                f.write(res.content)
                # Flush so the file on disk stays current
                f.flush()
                # Progress feedback
                print('Segment %s downloaded!' % str(i + 1))
            except:
                # Report the failed segment
                print("error:{}".format(i))
                # Stop the loop
                break
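As written, one dropped segment aborts the whole download. A small retry wrapper is one way to harden it; fetch_with_retry, the attempt count, and the timeout below are illustrative additions, not part of the original script.

# Hypothetical helper: try a segment a few times before giving up.
def fetch_with_retry(url, attempts=3):
    for _ in range(attempts):
        try:
            res = requests.get(url=url, headers=headers,
                               proxies=random.choice(proxies), timeout=10)
            if res.status_code == 200:
                return res.content
        except requests.RequestException:
            continue
    return None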
Merging the stream
def Save_Flow(urls):
    # Empty list to record segments that fail to download
    error = []
    # Download the segments one by one
    for i in range(len(urls)):
        # Build a file name from the segment URL
        filename = urls[i].split('/')[-1].split('.')[0]
        # Each segment gets its own file; the with block closes it
        with open(filename + '.mp4', 'wb') as f:
            try:
                res = requests.get(url=urls[i], headers=headers, proxies=random.choice(proxies))
                f.write(res.content)
            except:
                # Record the failed segment
                error.append(i)
                # Ignore the error and move on to the next segment
                pass
# Use moviepy to stitch the segments together
def Merge_Flow(filename):
    # Empty list to hold every clip awaiting merging
    L = []
    # Sort the listing so the segments merge in order
    for name in sorted(os.listdir()):
        # Only pick up files with an mp4 extension
        if name.endswith('.mp4'):
            # Preload the clip with VideoFileClip
            video = VideoFileClip(name)
            # Optionally print some clip info
            #print("video time: %s, width: %s, height: %s, fps: %s" % (video.duration, video.w, video.h, video.fps))
            L.append(video)
    # Concatenate the clips
    final_clip = concatenate_videoclips(L, method='compose')
    # Write out the target video file
    final_clip.write_videofile(filename + '.mp4', fps=24, remove_temp=True)
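Note that sorted() compares names as strings, which only keeps segments in order when their numbers are zero-padded (be0001, be0002, ...). If the site serves unpadded names such as be2.ts and be10.ts, a natural-sort key is safer; the sketch below is a generic recipe, not something the original relied on.

import os
import re

def natural_key(name):
    # Compare embedded digit runs numerically, so be2 sorts before be10.
    return [int(s) if s.isdigit() else s for s in re.split(r'(\d+)', name)]

for name in sorted(os.listdir(), key=natural_key):
    ...  # load each clip here, as above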
Main function
def main():
    # Get the search results
    movie_url, movie_title = Get_Search()
    # Print every result
    for i, item in enumerate(movie_title):
        print(str(i + 1) + ":" + item)
    # Pick the entry to download
    t = int(input('Enter the index to fetch: ')) - 1
    # Use its title as the file name
    filename = movie_title[t]
    # Get the online address and the Thunder links (save whichever you need)
    online_url, thunder_url = Get_Download(movie_url[t])
    # From the online address, get the master stream URL
    mp4_url = Get_mp4Url(online_url)
    # From the master stream URL, get every segment URL
    urls = Get_Flowtxt(mp4_url)
    # Choose how to save the video
    select = input('Choose a download mode:\n1. Save as one file\n2. Download segments and merge\n')
    if select == '1':
        Save_movie(urls, filename)
    elif select == '2':
        Save_Flow(urls)
        Merge_Flow(filename)
    else:
        print('error!')
Complete code
# -*- coding: utf-8 -*-
"""
Created on Tue Sep 8 08:06:55 2020
@author: ljc545w
"""
import requests
import re, os, json
from bs4 import BeautifulSoup as bs
from moviepy.editor import *
import random

keyword = input('Enter a keyword: ')
if not os.path.exists(keyword):
    os.mkdir(keyword)
os.chdir('./' + keyword)
BASE_URL = 'http://www.dytt.com'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36'
}
proxies = [{'http': '171.12.115.205:9999'}, {'http': '123.149.136.182:9999'},
           {'http': '121.232.148.67:9000'}, {'http': '123.163.27.182:9999'},
           {'http': '171.13.136.235:9999'}, {'http': '123.149.137.132:9999'}]

def Get_Search():
    Search_URL = 'http://www.dytt.com/vodsearch/-------------.html?wd={}'.format(keyword)
    movie_url = []
    movie_title = []
    res = requests.get(url=Search_URL, headers=headers, proxies=random.choice(proxies))
    res.encoding = 'utf-8'
    movie_info = bs(res.text, 'lxml').find_all('td', class_="textLeft")
    for i in range(len(movie_info)):
        if i % 2 == 0:
            movie_url.append(BASE_URL + movie_info[i].find('a').get('href'))
            movie_title.append(movie_info[i].find('a').text)
    return movie_url, movie_title

def Get_Download(url):
    res = requests.get(url=url, headers=headers, proxies=random.choice(proxies))
    res.encoding = 'utf-8'
    bf = bs(res.text, 'lxml')
    online_url = BASE_URL + bf.find('ul', class_="dy_bofangul").find('a').get('href')
    thunder_url = bf.find('div', class_='downList').find_all('a', class_='donw_xl thunder-link')
    return online_url, thunder_url

def Get_mp4Url(url):
    res = requests.get(url=url, headers=headers, proxies=random.choice(proxies))
    res.encoding = 'utf-8'
    bf = bs(res.text, 'lxml')
    string = bf.find('div', style='height: 400px;').find('script')
    pattern = re.compile('({.*?})')
    dict_temp = re.findall(pattern, str(string))
    mp4_flow = json.loads(dict_temp[0])['url']
    return mp4_flow

def Get_Flowtxt(url):
    last_url = ''
    temp = url.split('/')
    for item in temp[:-1]:
        last_url = last_url + item + '/'
    res = requests.get(url=url, headers=headers, proxies=random.choice(proxies))
    res.encoding = 'utf-8'
    final_url = last_url + res.text.split('\n')[-1]
    # Rebuild the base path from the second-level URL
    last_url = ''
    temp = final_url.split('/')
    for item in temp[:-1]:
        last_url = last_url + item + '/'
    r = requests.get(url=final_url, headers=headers, proxies=random.choice(proxies))
    r.encoding = 'utf-8'
    pattern = re.compile(r'(be.*?\.ts)\n#', re.S)
    info = re.findall(pattern, r.text)
    for i in range(len(info)):
        info[i] = last_url + info[i]
    return info

def Save_movie(urls, filename):
    with open(filename + '.mp4', 'ab') as f:
        for i in range(len(urls)):
            try:
                res = requests.get(url=urls[i], headers=headers, proxies=random.choice(proxies))
                f.write(res.content)
                f.flush()
                print('Segment %s downloaded!' % str(i + 1))
                """
                if i == 10:
                    break
                """
            except:
                print("error:{}".format(i))
                break

def Save_Flow(urls):
    error = []
    for i in range(len(urls)):
        filename = urls[i].split('/')[-1].split('.')[0]
        with open(filename + '.mp4', 'wb') as f:
            try:
                res = requests.get(url=urls[i], headers=headers, proxies=random.choice(proxies))
                f.write(res.content)
            except:
                error.append(i)
                pass

def Merge_Flow(filename):
    L = []
    for name in sorted(os.listdir()):
        #print(name)
        if name.endswith('.mp4'):
            video = VideoFileClip(name)
            #print("video time: %s, width: %s, height: %s, fps: %s" % (video.duration, video.w, video.h, video.fps))
            L.append(video)
    # Concatenate the clips
    final_clip = concatenate_videoclips(L, method='compose')
    # Write out the target video file
    final_clip.write_videofile(filename + '.mp4', fps=24, remove_temp=True)

def main():
    movie_url, movie_title = Get_Search()
    for i, item in enumerate(movie_title):
        print(str(i + 1) + ":" + item)
    t = int(input('Enter the index to fetch: ')) - 1
    filename = movie_title[t]
    online_url, thunder_url = Get_Download(movie_url[t])
    mp4_url = Get_mp4Url(online_url)
    urls = Get_Flowtxt(mp4_url)
    select = input('Choose a download mode:\n1. Save as one file\n2. Download segments and merge\n')
    if select == '1':
        Save_movie(urls, filename)
    elif select == '2':
        Save_Flow(urls)
        Merge_Flow(filename)
    else:
        print('error!')

if __name__ == '__main__':
    print('Crawler running...')
    main()
    print('Crawler finished.')
Afterword
Fetching a movie this way is quite cumbersome, and it cannot saturate your bandwidth; it is simpler to grab the Thunder download links straight from dytt and hand them to a download manager. Treat this article as practice in crawling and page parsing only. If you hope to batch-download movies with this crawler, honestly, forget it: you may well get your IP banned, which is not worth the trouble. One last observation: a video saved in one piece may stutter when you seek through it, while one merged by moviepy will not, but the merging step is very, very slow.
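For what it's worth, the Thunder links are already collected by Get_Download above, so printing them out takes only a couple of lines. This sketch reuses movie_url and t as they appear in main().

online_url, thunder_url = Get_Download(movie_url[t])
# Each entry is a bs4 <a> tag; its href attribute holds the Thunder address.
for a in thunder_url:
    print(a.text, a.get('href'))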