Web Scraping Refresher (Using an IP Proxy Pool)


Packages used

import requests
from bs4 import BeautifulSoup as bs
import time
import random

requests configuration

URL = 'https://www.haoshuya.com'
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36'
    }
proxies = [{'http': '114.103.168.183:3000'}, {'http': '175.43.156.42:9999'},
           {'http': '218.64.148.47:9000'}, {'http': '221.224.136.211:35101'},
           {'http': '175.42.129.65:9999'}, {'http': '117.64.225.154:1133'}]
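Free proxies go stale quickly, so it can help to filter the pool before scraping. Below is a minimal sketch of such a pre-flight check; the check_proxies helper, the http://httpbin.org/ip test URL, and the 5-second timeout are my own choices, not part of the original script.

def check_proxies(proxies, headers, test_url='http://httpbin.org/ip', timeout=5):
    """Return only the proxy dicts that can complete a simple GET."""
    alive = []
    for proxy in proxies:
        try:
            r = requests.get(test_url, headers=headers, proxies=proxy, timeout=timeout)
            if r.status_code == 200:
                alive.append(proxy)
        except requests.RequestException:
            # Dead or blocked proxy; skip it
            pass
    return alive

# Example: shrink the pool before scraping
# proxies = check_proxies(proxies, headers)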

Table of contents and chapter URLs

url = URL + '/1/1627'
response = requests.get(url=url, headers=headers)
response.encoding = response.apparent_encoding
html = response.text
bf = bs(html, 'lxml')
html = bf.find('div', class_="cbn1zbis1t")
list_first = html.find_all('a', class_="cbn1afm1t1")
list_second = html.find_all('a', target="_blank")
titles = []
urls = []
## The two loops below could be merged into one using an if condition (see the sketch after this block)
for i in range(3,1103):
    titles.append(list_second[i]['title'][6:])
    url = URL + list_second[i]['href']
    urls.append(url)
for i in range(1103,len(list_second)):
    titles.append(list_second[i]['title'][6:])
    url = list_second[i]['href']
    urls.append(url)
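As the comment above notes, the two range loops can be folded into a single loop with a condition on the index. A minimal equivalent sketch, assuming the same cutoff at index 1103 between relative and absolute hrefs:

titles = []
urls = []
for i in range(3, len(list_second)):
    titles.append(list_second[i]['title'][6:])
    href = list_second[i]['href']
    # Entries before index 1103 carry relative links and need the site root prepended
    urls.append(URL + href if i < 1103 else href)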

Start scraping

# Track pages that failed to download
error = []
for i in range(1120,len(urls)):
    # Pick a random proxy from the pool
    proxy = random.choice(proxies)
    try:
        # The timeout keeps a dead proxy from hanging the whole loop
        txt = requests.get(url=urls[i], headers=headers, proxies=proxy, timeout=10)
    except requests.RequestException:
        error.append(i)
        print('Chapter %s failed to download!' % str(i + 1))
        print('Proxy used: {}'.format(proxy))
        continue
    txt.encoding = txt.apparent_encoding
    html = txt.text
    bf = bs(html, 'lxml')
    #print(bf)
    # Mild anti-scraping: the content div's class name comes in two look-alike spellings (digit 1 vs letter l)
    node = bf.find('div', class_="cbd1con1so1")
    if node is None or len(node.text) == 0:
        node = bf.find('div', class_="cbdlconlsol")
    context = node.text
    with open('雪中悍刀行.txt', 'a', encoding='utf-8') as f:
        f.write(titles[i] + '\n')
        f.write(context + '\n')
    print('Chapter %s downloaded!' % str(i + 1))
    time.sleep(1)
print(error)
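The indices collected in error can be fed back through essentially the same loop for a second pass. A minimal retry sketch; the retry_errors helper is my own addition, not part of the original script.

def retry_errors(error, urls, titles, proxies, headers):
    """Retry each failed chapter index once; return the indices that still fail."""
    still_failed = []
    for i in error:
        proxy = random.choice(proxies)
        try:
            txt = requests.get(url=urls[i], headers=headers, proxies=proxy, timeout=10)
            txt.encoding = txt.apparent_encoding
            soup = bs(txt.text, 'lxml')
            node = soup.find('div', class_="cbd1con1so1")
            if node is None or len(node.text) == 0:
                node = soup.find('div', class_="cbdlconlsol")
            with open('雪中悍刀行.txt', 'a', encoding='utf-8') as f:
                f.write(titles[i] + '\n')
                f.write(node.text + '\n')
        except (requests.RequestException, AttributeError):
            still_failed.append(i)
        time.sleep(1)
    return still_failed

# Example: error = retry_errors(error, urls, titles, proxies, headers)

Note that retried chapters end up appended out of order at the end of the file; the returned list only narrows down which chapters still need attention.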