#!/usr/bin/python
# -*- coding:utf-8 -*-

import requests                # fetches the HTML source of a page
import random                  # random numbers for timeouts and sleep intervals
from bs4 import BeautifulSoup  # pulls content out of tags instead of hand-written regexes
import time                    # sleeping between retries
import js2py                   # executes the packed JavaScript that carries the image URLs
import os
from tqdm import tqdm          # progress bar while downloading pages

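# Scrapes a comic from iimanhua.cc: the chapter catalogue is read from a
# saved file, then each chapter page is fetched, its packed JavaScript is
# run through js2py to recover the image URLs, and every page image is
# downloaded with a tqdm progress bar.
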
class downloader(object):
    def __init__(self):
        self.server = 'https://www.iimanhua.cc/'             # site root, prepended to relative chapter links
        self.target = 'https://www.iimanhua.cc/comic/2189/'  # index page of the comic being downloaded
        self.names = []  # chapter names
        self.urls = []   # chapter links
        self.nums = 0    # number of chapters

"""
|
|||
|
获取html文档内容
|
|||
|
"""
|
|||
|
|
|||
|
def get_content(self, url):
|
|||
|
# 设置headers是为了模拟浏览器访问 否则的话可能会被拒绝 可通过浏览器获取,这里不用修改
|
|||
|
header = {
|
|||
|
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
|||
|
'Connection': 'keep-alive',
|
|||
|
'Accept-Encoding': 'gzip, deflate, br',
|
|||
|
'Accept-Language': 'zh-CN, zh',
|
|||
|
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
|
|||
|
}
|
|||
|
|
|||
|
# 设置一个超时时间 取随机数 是为了防止网站被认定为爬虫,不用修改
|
|||
|
timeout = random.choice(range(80, 180))
|
|||
|
|
|||
|
while True:
|
|||
|
try:
|
|||
|
req = requests.get(url=url, headers=header)
|
|||
|
req.encoding = 'GBK' #这里是网页的编码转换,根据网页的实际需要进行修改,经测试这个编码没有问题
|
|||
|
break
|
|||
|
except Exception as e:
|
|||
|
print('3', e)
|
|||
|
time.sleep(random.choice(range(5, 10)))
|
|||
|
return req.text
|
|||
|
|
|||
|
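    # Usage sketch: get_content blocks (retrying forever) until it returns the
    # decoded HTML, e.g.
    #   html = downloader().get_content('https://www.iimanhua.cc/comic/2189/')
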
"""
|
|||
|
获取下载的章节目录
|
|||
|
"""
|
|||
|
|
|||
|
def get_download_catalogue(self, url):
|
|||
|
# html = self.get_content(url)
|
|||
|
# bf = BeautifulSoup(html, 'html.parser')
|
|||
|
# print(bf)
|
|||
|
# texts = bf.find_all('div', {'class': 'listmain'})
|
|||
|
|
|||
|
finename = "./kkk.txt"
|
|||
|
|
|||
|
f = open(finename,'r', encoding='utf-8') # 返回一个文件对象
|
|||
|
line = f.readline()
|
|||
|
while line:
|
|||
|
# print(line.strip('\n'))
|
|||
|
name, url = self.get_url(line)
|
|||
|
self.names.append(name)
|
|||
|
self.urls.append(self.server + url)
|
|||
|
line = f.readline()
|
|||
|
# print(self.urls)
|
|||
|
self.nums = len(self.urls)
|
|||
|
|
|||
|
|
|||
|
|
|||
|
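    # The layout of kkk.txt is an assumption implied by get_url below: each
    # line holds one chapter's anchor tag, copied from the chapter list of
    # the comic's index page.
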
"""
|
|||
|
获取下载的具体章节
|
|||
|
"""
|
|||
|
|
|||
|
def get_url(self, url_str):
|
|||
|
st = url_str.find("/comic")
|
|||
|
ed = url_str.find("\" title")
|
|||
|
st2 = url_str.find(")")
|
|||
|
ed2 = url_str.find("\">")
|
|||
|
url = url_str[st:ed]
|
|||
|
name = url_str[st2+1:ed2]
|
|||
|
return name, url
|
|||
|
|
|||
|
|
|||
|
|
|||
|
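    # Hypothetical example of a catalogue line (the real markup may differ;
    # this one merely matches the slicing in get_url):
    #   >>> downloader().get_url('<a href="/comic/2189/1.html" title="(1)第一话">第一话</a>')
    #   ('第一话', '/comic/2189/1.html')
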
    def get_download_content(self, chap, path, name, url):
        """Download every page image of one chapter into its own folder."""
        # One folder per chapter, e.g. "077话 <name>" ('话' = chapter).
        chappath = os.path.join(path, str(chap).zfill(3) + '话 ' + name)
        os.makedirs(chappath, exist_ok=True)

        html = self.get_content(url)
        bf = BeautifulSoup(html, 'html.parser')
        # The image list is not plain HTML: the page carries it as a
        # base64-packed blob inside a <script> tag.
        jscmd = bf.find('script', {'language': 'javascript', 'type': 'text/javascript'}).text
        # Append a snippet that decodes and evals the packed payload
        # (`base64decode` and `packed` come from the page's own script),
        # so js2py returns the unpacked image list.
        jscmd += '''
\nvar b=base64decode(packed).slice(4);
var a = eval(b);
'''
        jsres = js2py.eval_js(jscmd)  # execute the JS code
        imgurls = self.get_img_url(jsres)

        page_no = 1
        for imgurl in tqdm(imgurls):
            r = requests.get(imgurl)
            with open(chappath + '/' + str(page_no) + '.jpg', 'wb') as f:
                f.write(r.content)  # write the binary image data
            page_no += 1

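    # Design note: the page's own unpacking routine is reused via js2py, so
    # the site's obfuscation never has to be re-implemented in Python; if the
    # packing scheme changes, only the appended decode snippet needs updating.
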
"""
|
|||
|
解析url
|
|||
|
"""
|
|||
|
def get_img_url(self, jsres):
|
|||
|
imgserver = 'https://res.img.96youhuiquan.com/'
|
|||
|
imgstrs = jsres.split(";")
|
|||
|
imgurls = []
|
|||
|
for imgstr in imgstrs:
|
|||
|
if len(imgstr)>1:
|
|||
|
st = imgstr.find("]=")
|
|||
|
imgurl = imgstr[st+3:-1]
|
|||
|
imgurls.append(imgserver+imgurl)
|
|||
|
return imgurls
|
|||
|
|
|||
|
|
|||
|
|
|||
|
|
|||
|
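    # Worked example (the array name and paths are assumptions):
    #   jsres = 'photosr[1]="img/001.jpg";photosr[2]="img/002.jpg";'
    #   -> ['https://res.img.96youhuiquan.com/img/001.jpg',
    #       'https://res.img.96youhuiquan.com/img/002.jpg']
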
    def writer(self, path, name, text):
        """Append a named block of text to a file (not called in the image flow)."""
        with open(path, 'a', encoding='utf-8') as f:
            f.writelines(name)
            f.write('\n')
            f.writelines(text)
            f.write('\n\n')

if __name__ == '__main__':
    path = './duannao/'
    dl = downloader()
    dl.get_download_catalogue(dl.target)
    # Starts at chapter 77; use range(dl.nums) to download from the beginning.
    for chap_no in range(77 - 1, dl.nums):
        print("第" + str(chap_no + 1) + "话")  # "Chapter N"
        dl.get_download_content(chap_no + 1, path, dl.names[chap_no], dl.urls[chap_no])