Scraping streaming video from 电影天堂 and merging the segments


Packages used

#For requesting web pages
import requests
#For regex matching, filesystem handling, and loading strings as dicts
import re,os,json
#For parsing web pages
from bs4 import BeautifulSoup as bs
#For merging the stream segments
from moviepy.editor import *
#For picking a random anonymous proxy
import random

Initialization

#Get the movie keyword
keyword = input('Enter a keyword: ')
#Create a directory for the movie
if not os.path.exists(keyword):
    os.mkdir(keyword)
#Switch into the dedicated directory
os.chdir('./' + keyword)
#Base URL of 电影天堂
BASE_URL = 'http://www.dytt.com'
#Request headers that mimic a browser
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36'
    }
#A pool of anonymous proxy IPs
proxies=[{'http':'171.12.115.205:9999'},{'http':'123.149.136.182:9999'},
         {'http':'121.232.148.67:9000'},{'http':'123.163.27.182:9999'},
         {'http':'171.13.136.235:9999'},{'http':'123.149.137.132:9999'}]
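The proxies above are free anonymous IPs, which tend to die quickly. A minimal sketch of a helper that keeps only the proxies that still respond (the test URL and timeout here are assumptions, not part of the original script):

def check_proxies(proxy_list, test_url='http://www.dytt.com', timeout=5):
    #Keep only the proxies that can actually reach the test URL
    alive = []
    for proxy in proxy_list:
        try:
            requests.get(test_url, headers=headers, proxies=proxy, timeout=timeout)
            alive.append(proxy)
        except requests.RequestException:
            pass
    return alive

#Optionally replace the hard-coded list with the working subset:
#proxies = check_proxies(proxies) or proxies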

Search results

def Get_Search():
    #Build the search page URL
    Search_URL = 'http://www.dytt.com/vodsearch/-------------.html?wd={}'.format(keyword)
    #Two empty lists to hold the parsed results
    movie_url = []
    movie_title = []
    #Request the search page
    res = requests.get(url = Search_URL,headers = headers,proxies = random.choice(proxies))
    #Set the response encoding
    res.encoding = 'utf-8'
    #Parse the page with bs4 and walk through the result cells one by one
    movie_info = bs(res.text,'lxml').find_all('td',class_ = "textLeft")
    for i in range(len(movie_info)):
        if i % 2 == 0:
            #Grab the detail-page URL and append it to the list
            movie_url.append(BASE_URL + movie_info[i].find('a').get('href'))
            #Grab the title and append it to the list
            movie_title.append(movie_info[i].find('a').text)
    #Return the two result lists
    return movie_url,movie_title

Online address

def Get_Download(url):
    #Request the detail page of the selected item
    res = requests.get(url = url,headers = headers,proxies = random.choice(proxies))
    #Set the response encoding
    res.encoding = 'utf-8'
    #Parse the page
    bf = bs(res.text,'lxml')
    #Get the online playback address
    online_url = BASE_URL + bf.find('ul',class_ = "dy_bofangul").find('a').get('href')
    #Get the Thunder (迅雷) download links
    thunder_url = bf.find('div',class_ = 'downList').find_all('a',class_ = 'donw_xl thunder-link')
    #Return both kinds of address
    return online_url,thunder_url
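Note that Get_Download returns the Thunder links as bs4 Tag objects rather than plain URLs. If you want to keep them for later, a small sketch like the one below would pull out the href attributes (the output filename is made up for illustration):

def save_thunder_links(thunder_url, filename='thunder_links.txt'):
    #Each element of thunder_url is an <a> tag; the actual link sits in its href attribute
    with open(filename, 'w', encoding='utf-8') as f:
        for a in thunder_url:
            f.write(a.get('href') + '\n')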

Stream URLs

#Fetch the page that holds all stream URLs, based on the online playback address
def Get_mp4Url(url):
    #Request the online playback page
    res = requests.get(url = url,headers = headers,proxies = random.choice(proxies))
    res.encoding = 'utf-8'
    #Parse the page
    bf = bs(res.text,'lxml')
    #The data we want lives inside this div's <script> tag
    string = bf.find('div',style = 'height: 400px;').find('script')
    #Regex-match the content inside {}
    pattern = re.compile('({.*?})')
    dict_temp = re.findall(pattern, str(string))
    #Load it as a dict and pull out the url
    mp4_flow = json.loads(dict_temp[0])['url']
    #Return the master stream URL
    return mp4_flow
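To make the extraction step concrete, here is the same regex and json.loads applied to a made-up <script> payload (the variable name and URL below are purely illustrative and may not match the real page):

sample_script = '<script>var player_data={"url":"https://example.com/movie/index.m3u8","from":"dyplayer"}</script>'
pattern = re.compile('({.*?})')
#The non-greedy match grabs the first {...} block, which json.loads turns into a dict
print(json.loads(re.findall(pattern, sample_script)[0])['url'])
#-> https://example.com/movie/index.m3u8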
#Fetch the list of all segment URLs from the master stream URL
def Get_Flowtxt(url):
    #Empty string for the base URL
    last_url = ''
    #Split the master URL and rebuild it without its final component
    temp = url.split('/')
    for item in temp[:-1]:
        last_url = last_url + item + '/'
    #Request the master index
    res = requests.get(url = url,headers = headers,proxies = random.choice(proxies))
    res.encoding = 'utf-8'
    #The relative path of the secondary index is on the last line of the returned text
    final_url = last_url + res.text.split('\n')[-1]
    #Rebuild the base URL from the secondary index
    last_url = ''
    temp = final_url.split('/')
    for item in temp[:-1]:
        last_url = last_url + item + '/'
    #Request the full secondary index
    r = requests.get(url = final_url, headers = headers,proxies = random.choice(proxies))
    r.encoding = 'utf-8'
    #Regex-match every string that starts with "be" and ends with ".ts"
    pattern = re.compile(r'(be.*?\.ts)\n#',re.S)
    info = re.findall(pattern,r.text)
    #Join every element of info onto the base URL
    for i in range(len(info)):
        info[i] = last_url + info[i]
    #Return the list of segment URLs
    return info
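The two requests above are essentially walking an HLS-style playlist: the master index points to a secondary index, and the secondary index lists the .ts segments. A small illustration of what the segment-matching step does (the playlist content below is invented; real indexes will differ):

sample_index = '#EXTM3U\n#EXTINF:10.0,\nbe0000000.ts\n#EXTINF:10.0,\nbe0000001.ts\n#EXT-X-ENDLIST'
pattern = re.compile(r'(be.*?\.ts)\n#', re.S)
print(re.findall(pattern, sample_index))
#-> ['be0000000.ts', 'be0000001.ts']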

Saving as a single file

def Save_movie(urls,filename):
    #Open (and create) the output file in binary append mode
    with open(filename + '.mp4','ab') as f:
        #Walk through the list of segment URLs
        for i in range(len(urls)):
            try:
                #Request each segment in turn
                res = requests.get(url = urls[i],headers = headers,proxies = random.choice(proxies))
                #Append the data to the file
                f.write(res.content)
                #Flush the buffer so the data actually lands on disk
                f.flush()
                #Report progress
                print('Segment %s downloaded!' % str(i + 1))
            except:
                #Report which segment failed
                print("error:{}".format(i))
                #Stop the loop
                break
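As written, a single failed request aborts the whole download. If that turns out to be a problem, a small retry wrapper along these lines could make the loop more forgiving (this is a sketch, not part of the original script; the retry count and timeout are arbitrary):

def get_with_retry(url, retries=3):
    #Try the same segment a few times, each time with a different random proxy
    for _ in range(retries):
        try:
            return requests.get(url=url, headers=headers,
                                proxies=random.choice(proxies), timeout=10)
        except requests.RequestException:
            continue
    raise RuntimeError('segment download failed: ' + url)

#Inside Save_movie, `res = requests.get(...)` could then become `res = get_with_retry(urls[i])`.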

Merging the stream segments

def Save_Flow(urls):
    #Empty list to record segments that failed to download
    error = []
    #Download the segments one by one
    for i in range(len(urls)):
        #Build the filename from the segment name
        filename = urls[i].split('/')[-1].split('.')[0]
        #Save each segment to its own file
        with open(filename + '.mp4','wb') as f:
            try:
                res = requests.get(url = urls[i],headers = headers,proxies = random.choice(proxies))
                f.write(res.content)
            except:
                #Record the failed segment
                error.append(i)
                #Ignore the error and move on to the next segment
                pass
#Use the moviepy module to merge the clips
def Merge_Flow(filename):
    #Empty list to hold all clips to be merged
    L = []
    #Sort the directory listing so the segments are concatenated in order
    for name in sorted(os.listdir()):
        #If the extension is mp4, add the file to the list
        if name.endswith('.mp4'):
            #Pre-load the clip with VideoFileClip
            video = VideoFileClip(name)
            #Print some clip information if needed
            #print("video time: %s, width: %s, height: %s, fps: %s" % (video.duration, video.w, video.h, video.fps))
            L.append(video)
    #Concatenate the clips
    final_clip = concatenate_videoclips(L,method='compose')
    #Write the target video file
    final_clip.write_videofile(filename + '.mp4',fps = 24,remove_temp = True)

Main function

def main():
    #Get the search results
    movie_url, movie_title = Get_Search()
    #Print every search result
    for i,item in enumerate(movie_title):
        print(str(i + 1) + ":" + item)
    #Let the user pick which movie to download
    t = int(input('Enter the index to download: ')) - 1
    #Use the title as the output filename
    filename = movie_title[t]
    #Get the online address and the Thunder links (save whichever you need)
    online_url,thunder_url = Get_Download(movie_url[t])
    #Get the master stream URL from the online address
    mp4_url = Get_mp4Url(online_url)
    #Get every segment URL from the master stream URL
    urls = Get_Flowtxt(mp4_url)
    #Choose how to save the video
    select = input('Choose a download mode:\n1. Save as a single file\n2. Download segments and merge\n')
    if select == '1':
        Save_movie(urls,filename)
    elif select == '2':
        Save_Flow(urls)
        Merge_Flow(filename)
    else:
        print('error!')

Complete code

# -*- coding: utf-8 -*-
"""
Created on Tue Sep  8 08:06:55 2020

@author: ljc545w
"""

import requests
import re,os,json
from bs4 import BeautifulSoup as bs
from moviepy.editor import *
import random
keyword = input('Enter a keyword: ')
if not os.path.exists(keyword):
    os.mkdir(keyword)
os.chdir('./' + keyword)

BASE_URL = 'http://www.dytt.com'

headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36'
    }
proxies=[{'http':'171.12.115.205:9999'},{'http':'123.149.136.182:9999'},
         {'http':'121.232.148.67:9000'},{'http':'123.163.27.182:9999'},
         {'http':'171.13.136.235:9999'},{'http':'123.149.137.132:9999'}]

def Get_Search():
    Search_URL = 'http://www.dytt.com/vodsearch/-------------.html?wd={}'.format(keyword)
    movie_url = []
    movie_title = []
    res = requests.get(url = Search_URL,headers = headers,proxies = random.choice(proxies))
    res.encoding = 'utf-8'
    movie_info = bs(res.text,'lxml').find_all('td',class_ = "textLeft")
    for i in range(len(movie_info)):
        if i % 2 == 0:
            movie_url.append(BASE_URL + movie_info[i].find('a').get('href'))
            movie_title.append(movie_info[i].find('a').text)
    return movie_url,movie_title

def Get_Download(url):
    res = requests.get(url = url,headers = headers,proxies = random.choice(proxies))
    res.encoding = 'utf-8'
    bf = bs(res.text,'lxml')
    online_url = BASE_URL + bf.find('ul',class_ = "dy_bofangul").find('a').get('href')
    thunder_url = bf.find('div',class_ = 'downList').find_all('a',class_ = 'donw_xl thunder-link')
    return online_url,thunder_url

def Get_mp4Url(url):
    res = requests.get(url = url,headers = headers,proxies = random.choice(proxies))
    res.encoding = 'utf-8'
    bf = bs(res.text,'lxml')
    string = bf.find('div',style = 'height: 400px;').find('script')
    pattern = re.compile('({.*?})')
    dict_temp = re.findall(pattern, str(string))
    mp4_flow = json.loads(dict_temp[0])['url']
    return mp4_flow

def Get_Flowtxt(url):
    last_url = ''
    temp = url.split('/')
    for item in temp[:-1]:
        last_url = last_url + item + '/'
    res = requests.get(url = url,headers = headers,proxies = random.choice(proxies))
    res.encoding = 'utf-8'
    final_url = last_url + res.text.split('\n')[-1]
    last_url = ''
    temp = final_url.split('/')
    for item in temp[:-1]:
        last_url = last_url + item + '/'
    r = requests.get(url = final_url, headers = headers,proxies = random.choice(proxies))
    r.encoding = 'utf-8'
    pattern = re.compile(r'(be.*?\.ts)\n#',re.S)
    info = re.findall(pattern,r.text)
    for i in range(len(info)):
        info[i] = last_url + info[i]
    return info

def Save_movie(urls,filename):
    with open(filename + '.mp4','ab') as f:
        for i in range(len(urls)):
            try:
                res = requests.get(url = urls[i],headers = headers,proxies = random.choice(proxies))
                f.write(res.content)
                f.flush()
                print('Segment %s downloaded!' % str(i + 1))
            except:
                print("error:{}".format(i))
                break

def Save_Flow(urls):
    error = []
    for i in range(len(urls)):
        filename = urls[i].split('/')[-1].split('.')[0]
        with open(filename + '.mp4','wb') as f:
            try:
                res = requests.get(url = urls[i],headers = headers,proxies = random.choice(proxies))
                f.write(res.content)
            except:
                error.append(i)
                pass

def Merge_Flow(filename):
    L = []
    #Sort the directory listing so the segments are concatenated in order
    for name in sorted(os.listdir()):
        if name.endswith('.mp4'):
            video = VideoFileClip(name)
            #print("video time: %s, width: %s, height: %s, fps: %s" % (video.duration, video.w, video.h, video.fps))
            L.append(video)
    #Concatenate the clips
    final_clip = concatenate_videoclips(L,method='compose')
    #Write the target video file
    final_clip.write_videofile(filename + '.mp4',fps = 24,remove_temp = True)

def main():
    movie_url, movie_title = Get_Search()
    for i,item in enumerate(movie_title):
        print(str(i + 1) + ":" + item)
    t = int(input('Enter the index to download: ')) - 1
    filename = movie_title[t]
    online_url,thunder_url = Get_Download(movie_url[t])
    mp4_url = Get_mp4Url(online_url)
    urls = Get_Flowtxt(mp4_url)
    select = input('Choose a download mode:\n1. Save as a single file\n2. Download segments and merge\n')
    if select == '1':
        Save_movie(urls,filename)
    elif select == '2':
        Save_Flow(urls)
        Merge_Flow(filename)
    else:
        print('error!')

if __name__ == '__main__':
    print('Crawler running...')
    main()
    print('Crawler finished.')

Closing notes

Collecting the video this way is quite tedious and cannot saturate your bandwidth; it is usually easier to grab a Thunder (迅雷) download link from 电影天堂 and download that directly. Treat this article as practice in writing a crawler and parsing web pages. Trying to batch-download movies with it is honestly not worth it, and you may even get your IP banned. Also note that a video saved with the single-file mode may stutter when you seek, whereas one merged with moviepy will not, although the merge step is very slow.