#!/usr/bin/python
# -*- coding:utf-8 -*-
import requests                 # fetch page HTML
import random                   # random delays / timeouts
from bs4 import BeautifulSoup   # pull content out of tags instead of using regexes
import time                     # sleep between retries
import js2py                    # run the obfuscated JavaScript embedded in the page
import os
from tqdm import tqdm           # progress bar for page downloads


class downloader(object):

    def __init__(self):
        self.server = 'https://www.iimanhua.cc/'
        self.target = 'https://www.iimanhua.cc/comic/2189/'
        self.names = []   # chapter names
        self.urls = []    # chapter links
        self.nums = 0     # number of chapters
"""
获取html文档内容
"""
def get_content(self, url):
# 设置headers是为了模拟浏览器访问 否则的话可能会被拒绝 可通过浏览器获取,这里不用修改
header = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Connection': 'keep-alive',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN, zh',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
}
# 设置一个超时时间 取随机数 是为了防止网站被认定为爬虫,不用修改
timeout = random.choice(range(80, 180))
while True:
try:
req = requests.get(url=url, headers=header)
req.encoding = 'GBK' #这里是网页的编码转换,根据网页的实际需要进行修改,经测试这个编码没有问题
break
except Exception as e:
print('3', e)
time.sleep(random.choice(range(5, 10)))
return req.text
"""
获取下载的章节目录
"""
def get_download_catalogue(self, url):
# html = self.get_content(url)
# bf = BeautifulSoup(html, 'html.parser')
# print(bf)
# texts = bf.find_all('div', {'class': 'listmain'})
finename = "./kkk.txt"
f = open(finename,'r', encoding='utf-8') # 返回一个文件对象
line = f.readline()
while line:
# print(line.strip('\n'))
name, url = self.get_url(line)
self.names.append(name)
self.urls.append(self.server + url)
line = f.readline()
# print(self.urls)
self.nums = len(self.urls)
"""
获取下载的具体章节
"""
def get_url(self, url_str):
st = url_str.find("/comic")
ed = url_str.find("\" title")
st2 = url_str.find("")
ed2 = url_str.find("\">")
url = url_str[st:ed]
name = url_str[st2+1:ed2]
return name, url
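
    # A quick sanity check of the slicing above, on an assumed catalogue line
    # (the real contents of kkk.txt are not shown in this file):
    #
    #   line = '<a href="/comic/2189/123456.html" title="Chapter 1">Chapter 1</a>'
    #   downloader().get_url(line)
    #   # -> ('Chapter 1', '/comic/2189/123456.html')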
    def get_download_content(self, chap, path, name, url):
        """Download every page image of one chapter into its own folder."""
        # The separator between chapter number and name was lost in the
        # original source; '_' is used here as a stand-in.
        chappath = os.path.join(path, str(chap).zfill(3) + '_' + name)
        os.makedirs(chappath, exist_ok=True)
        html = self.get_content(url)
        bf = BeautifulSoup(html, 'html.parser')
        # The page hides its image list in an obfuscated <script> block: a
        # base64-encoded, packed JavaScript string named `packed`.
        jscmd = bf.find('script', {'language': 'javascript', 'type': 'text/javascript'}).text
        # Append a decode-and-eval step so the packed payload unpacks itself.
        jscmd += '''
        \nvar b=base64decode(packed).slice(4);
        var a = eval(b);
        '''
        jsres = js2py.eval_js(jscmd)   # run the JavaScript with js2py
        imgurls = self.get_img_url(jsres)
        page_no = 1
        for imgurl in tqdm(imgurls):
            r = requests.get(imgurl)
            with open(chappath + '/' + str(page_no) + '.jpg', 'wb') as f:
                f.write(r.content)   # write the raw image bytes
            page_no += 1
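
    # With the defaults in __main__, page 1 of chapter 77 would land at e.g.
    #   ./duannao/077_<chapter name>/1.jpg
    # (one folder per chapter, pages numbered from 1).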
"""
解析url
"""
def get_img_url(self, jsres):
imgserver = 'https://res.img.96youhuiquan.com/'
imgstrs = jsres.split(";")
imgurls = []
for imgstr in imgstrs:
if len(imgstr)>1:
st = imgstr.find("]=")
imgurl = imgstr[st+3:-1]
imgurls.append(imgserver+imgurl)
return imgurls
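
    # A minimal sketch of the assumed jsres format (the actual array name in
    # the site's unpacked script may differ):
    #
    #   jsres = 'photosr[1]="comic/0001.jpg";photosr[2]="comic/0002.jpg";'
    #   # get_img_url(jsres) ->
    #   # ['https://res.img.96youhuiquan.com/comic/0001.jpg',
    #   #  'https://res.img.96youhuiquan.com/comic/0002.jpg']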
    def writer(self, path, name, text):
        """Append a named block of text to a file (unused by __main__)."""
        with open(path, 'a', encoding='utf-8') as f:
            f.writelines(name)
            f.write('\n')
            f.writelines(text)
            f.write('\n\n')


if __name__ == '__main__':
    path = './duannao/'
    dl = downloader()
    dl.get_download_catalogue(dl.target)
    # Resume from chapter 77; set the start back to 0 to download everything.
    for chap_no in range(77 - 1, dl.nums):
        # The original progress message was lost; a plain line stands in.
        print("Downloading chapter " + str(chap_no + 1))
        dl.get_download_content(chap_no + 1, path, dl.names[chap_no], dl.urls[chap_no])
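
# To grab a different series, point self.target at its /comic/<id>/ page and
# regenerate kkk.txt from that page's chapter list; get_url assumes the same
# <a href="/comic/..." title="..."> markup as this site.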