#!/usr/bin/python
# -*- coding:utf-8 -*-
import requests                # fetch page HTML
import random                  # random timeouts/delays
from bs4 import BeautifulSoup  # pull content out of HTML tags instead of using regexes
import time                    # sleep between retries
import js2py                   # run the obfuscated JavaScript embedded in each page
import os
from tqdm import tqdm          # progress bar for image downloads
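
# Overall flow: read the chapter catalogue from a locally saved file
# (kkk.txt), then for each chapter fetch its page, evaluate the packed
# JavaScript embedded in it to recover the image paths, and save every
# page image under <path>/<nnn>_<chapter name>/.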
class downloader(object):

    def __init__(self):
        self.server = 'https://www.iimanhua.cc/'
        self.target = 'https://www.iimanhua.cc/comic/2189/'
        self.names = []  # chapter names
        self.urls = []   # chapter links
        self.nums = 0    # number of chapters
"""
获取html文档内容
"""
def get_content(self, url):
# 设置headers是为了模拟浏览器访问 否则的话可能会被拒绝 可通过浏览器获取,这里不用修改
header = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Connection': 'keep-alive',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN, zh',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
}
# 设置一个超时时间 取随机数 是为了防止网站被认定为爬虫,不用修改
timeout = random.choice(range(80, 180))
while True:
try:
req = requests.get(url=url, headers=header)
req.encoding = 'GBK' #这里是网页的编码转换,根据网页的实际需要进行修改,经测试这个编码没有问题
break
except Exception as e:
print('3', e)
time.sleep(random.choice(range(5, 10)))
return req.text
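
    # Usage sketch: html = self.get_content(self.target) returns the catalogue
    # page as a GBK-decoded string; connection errors are retried indefinitely
    # with a random 5-10 s pause between attempts.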
"""
获取下载的章节目录
"""
def get_download_catalogue(self, url):
# html = self.get_content(url)
# bf = BeautifulSoup(html, 'html.parser')
# print(bf)
# texts = bf.find_all('div', {'class': 'listmain'})
finename = "./kkk.txt"
f = open(finename,'r', encoding='utf-8') # 返回一个文件对象
line = f.readline()
while line:
# print(line.strip('\n'))
name, url = self.get_url(line)
self.names.append(name)
self.urls.append(self.server + url)
line = f.readline()
# print(self.urls)
self.nums = len(self.urls)
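
    # kkk.txt is assumed to hold one catalogue entry per line, i.e. the
    # <a href="/comic/..." title="...">...</a> anchors copied out of the
    # site's chapter list.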
"""
获取下载的具体章节
"""
def get_url(self, url_str):
st = url_str.find("/comic")
ed = url_str.find("\" title")
st2 = url_str.find("")
ed2 = url_str.find("\">")
url = url_str[st:ed]
name = url_str[st2+1:ed2]
return name, url
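
    # Example (line format assumed from the slicing above):
    #   <a href="/comic/2189/123456.html" title="Chapter 1">Chapter 1</a>
    # yields url = '/comic/2189/123456.html' and name = 'Chapter 1'.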

    def get_download_content(self, chap, path, name, url):
        """
        Download every page image of one chapter into its own directory.
        """
        # '_' as the number/name separator is an assumption
        chappath = os.path.join(path, str(chap).zfill(3) + '_' + name)
        os.makedirs(chappath, exist_ok=True)
        html = self.get_content(url)
        bf = BeautifulSoup(html, 'html.parser')
        # The page hides its image list inside an obfuscated <script> block.
        jscmd = bf.find('script', {'language': 'javascript', 'type': 'text/javascript'}).text
        # Append a stub that base64-decodes the packed payload and evaluates
        # it, so js2py returns the decoded list of image paths.
        jscmd += '''
var b = base64decode(packed).slice(4);
var a = eval(b);
'''
        jsres = js2py.eval_js(jscmd)  # run the JavaScript from Python
        imgurls = self.get_img_url(jsres)
        page_no = 1
        for imgurl in tqdm(imgurls):
            r = requests.get(imgurl)
            with open(os.path.join(chappath, str(page_no) + '.jpg'), 'wb') as f:
                f.write(r.content)  # write the raw image bytes
            page_no += 1
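
    # Shape of the packed block (assumed from the decode stub above):
    #   var packed = "...base64...";
    #   function base64decode(str) { ... }
    # base64decode(packed).slice(4) yields an eval()-able string whose result
    # looks like: arr[0]="comic/2189/1.jpg";arr[1]="comic/2189/2.jpg";...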
"""
解析url
"""
def get_img_url(self, jsres):
imgserver = 'https://res.img.96youhuiquan.com/'
imgstrs = jsres.split(";")
imgurls = []
for imgstr in imgstrs:
if len(imgstr)>1:
st = imgstr.find("]=")
imgurl = imgstr[st+3:-1]
imgurls.append(imgserver+imgurl)
return imgurls
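
    # Example (entry format assumed from the slicing above):
    #   imgstr = 'arr[0]="comic/2189/1.jpg"'
    #   -> imgurl = 'comic/2189/1.jpg'
    #   -> https://res.img.96youhuiquan.com/comic/2189/1.jpg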

    def writer(self, path, name, text):
        with open(path, 'a', encoding='utf-8') as f:
            f.writelines(name)
            f.write('\n')
            f.writelines(text)
            f.write('\n\n')

if __name__ == '__main__':
    path = './duannao/'
    dl = downloader()
    dl.get_download_catalogue(dl.target)
    # start index 76 resumes from chapter 77; use 0 to download from the start
    for chap_no in range(77 - 1, dl.nums):
        print('Downloading chapter ' + str(chap_no + 1))
        dl.get_download_content(chap_no + 1, path, dl.names[chap_no], dl.urls[chap_no])