Packages used
import requests
from bs4 import BeautifulSoup as bs
import time
import random
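If the environment doesn't have these yet, they install with pip (lxml is the parser handed to BeautifulSoup below):

pip install requests beautifulsoup4 lxml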
requests configuration
URL = 'https://www.haoshuya.com'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36'
}
proxies = [{'http': '114.103.168.183:3000'}, {'http': '175.43.156.42:9999'},
           {'http': '218.64.148.47:9000'}, {'http': '221.224.136.211:35101'},
           {'http': '175.42.129.65:9999'}, {'http': '117.64.225.154:1133'}]
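Free proxies like these go stale quickly, which is why the download loop below has to record failures. A rough liveness filter can be run first; this is only a sketch, and the helper name proxy_alive, the httpbin test endpoint, and the 5-second timeout are all this sketch's own choices:

def proxy_alive(proxy, test_url='http://httpbin.org/ip', timeout=5):
    # Returns True if the proxy can fetch the test page within the timeout.
    try:
        r = requests.get(test_url, headers=headers, proxies=proxy, timeout=timeout)
        return r.status_code == 200
    except requests.RequestException:
        return False

proxies = [p for p in proxies if proxy_alive(p)]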
Table of contents and chapter URLs
url = URL + '/1/1627'
response = requests.get(url, headers=headers)
response.encoding = response.apparent_encoding  # let requests guess the page encoding
html = response.text
bf = bs(html, 'lxml')
toc = bf.find('div', class_="cbn1zbis1t")  # container holding the chapter list
list_first = toc.find_all('a', class_="cbn1afm1t1")  # collected but not used below
list_second = toc.find_all('a', target="_blank")  # one <a> per chapter link
titles = []
urls = []
## The two loops below could be merged into one with an if branch (see the sketch after them)
for i in range(3, 1103):  # judging by the concatenation, these hrefs are site-relative
    titles.append(list_second[i]['title'][6:])  # strip the first six characters of the title attribute
    url = URL + list_second[i]['href']
    urls.append(url)
for i in range(1103, len(list_second)):  # these hrefs are already absolute
    titles.append(list_second[i]['title'][6:])
    url = list_second[i]['href']
    urls.append(url)
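As a sketch of that if-based rewrite: since the hrefs differ only in being relative or absolute, urljoin from the standard library can handle both in a single pass, keeping the original skip of the first three anchors and making the 1103 boundary unnecessary (assuming the two ranges differ only in href form):

from urllib.parse import urljoin

titles = []
urls = []
for a in list_second[3:]:  # the first three anchors are not chapters
    titles.append(a['title'][6:])
    # urljoin resolves relative hrefs against URL and leaves absolute ones untouched
    urls.append(urljoin(URL, a['href']))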
Start scraping
# Keep track of chapters that fail to download
error = []
for i in range(1120, len(urls)):  # starting at 1120 presumably resumes an interrupted run
    # Pick a random proxy from the pool
    proxy = random.choice(proxies)
    try:
        txt = requests.get(urls[i], headers=headers, proxies=proxy)
    except requests.RequestException:
        error.append(i)
        print('Chapter %s failed to download!' % (i + 1))
        print('The proxy was {}'.format(proxy))
        continue
    txt.encoding = txt.apparent_encoding
    html = txt.text
    bf = bs(html, 'lxml')
    #print(bf)
    # The site has a small anti-scraping trick: the content div's class name
    # alternates between two look-alike spellings (digit 1 vs letter l)
    node = bf.find('div', class_="cbd1con1so1")
    if node is None or not node.text:
        node = bf.find('div', class_="cbdlconlsol")
    context = node.text
    with open('雪中悍刀行.txt', 'a', encoding='utf-8') as f:
        f.write(titles[i] + '\n')
        f.write(context + '\n')
    print('Chapter %s downloaded!' % (i + 1))
    time.sleep(1)  # pause between requests to stay polite
print(error)
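Since error holds the indices of everything that failed, a second pass can retry them once the main loop finishes. A minimal sketch only: retried chapters end up appended out of order at the end of the file, and the 10-second timeout is this sketch's own choice.

for i in error[:]:  # iterate over a copy so entries can be removed while looping
    proxy = random.choice(proxies)
    try:
        txt = requests.get(urls[i], headers=headers, proxies=proxy, timeout=10)
    except requests.RequestException:
        continue  # still failing; leave it in the error list
    txt.encoding = txt.apparent_encoding
    bf = bs(txt.text, 'lxml')
    node = bf.find('div', class_="cbd1con1so1")
    if node is None or not node.text:
        node = bf.find('div', class_="cbdlconlsol")
    if node is None:
        continue
    with open('雪中悍刀行.txt', 'a', encoding='utf-8') as f:
        f.write(titles[i] + '\n' + node.text + '\n')
    error.remove(i)
    time.sleep(1)
print('Still failing:', error)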