#!/usr/bin/python
# -*- coding:utf-8 -*-
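
# Downloads a comic from iimanhua.cc: the chapter catalogue is read from a
# saved dump (./kkk.txt), each chapter page's packed image-list JavaScript is
# unpacked with js2py, and every page image is saved into a per-chapter folder.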
import requests                  # fetch the HTML source of a page
import random                    # random numbers for delays / timeouts
from bs4 import BeautifulSoup    # pull content out of specific tags instead of using regexes
import sys
import time                      # sleep between retries
import js2py                     # run the JavaScript that hides the packed image list
import os
from tqdm import tqdm            # progress bar for the per-page download loop


class downloader(object):
    def __init__(self):
        self.server = 'https://www.iimanhua.cc/'
        self.target = 'https://www.iimanhua.cc/comic/2189/'
        self.names = []   # chapter names
        self.urls = []    # chapter links
        self.nums = 0     # number of chapters

    def get_content(self, url):
        """Fetch a page and return its HTML as text."""
        # Request headers that mimic a normal browser visit; without them the
        # site may refuse the request.  Copied from a browser, no need to change.
        header = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Connection': 'keep-alive',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN, zh',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
        }

        # Randomised request timeout; also helps the traffic look less like a crawler.
        timeout = random.choice(range(80, 180))

        while True:
            try:
                req = requests.get(url=url, headers=header, timeout=timeout)
                req.encoding = 'GBK'   # the site serves GBK-encoded pages; change if the target page differs (tested OK here)
                break
            except Exception as e:
                print('request failed, retrying:', e)
                time.sleep(random.choice(range(5, 10)))
        return req.text

    def get_download_catalogue(self, url):
        """Build the lists of chapter names and chapter URLs to download."""
        # The original network path is kept for reference:
        # html = self.get_content(url)
        # bf = BeautifulSoup(html, 'html.parser')
        # print(bf)
        # texts = bf.find_all('div', {'class': 'listmain'})

        # Instead, the chapter list is read from a previously saved dump.
        filename = "./kkk.txt"

        with open(filename, 'r', encoding='utf-8') as f:
            for line in f:
                name, url = self.get_url(line)
                self.names.append(name)
                self.urls.append(self.server + url)
        self.nums = len(self.urls)
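
    # A hedged sketch (an assumption, not the original workflow): fetch the
    # catalogue page over the network instead of reading the saved ./kkk.txt dump.
    # It assumes the chapter links sit in the <div class="listmain"> block hinted
    # at by the commented-out lines above and use relative hrefs starting with /comic.
    def get_download_catalogue_online(self, url):
        html = self.get_content(url)
        bf = BeautifulSoup(html, 'html.parser')
        listmain = bf.find('div', {'class': 'listmain'})
        scope = listmain if listmain else bf
        for a in scope.find_all('a', href=True):
            if a['href'].startswith('/comic'):
                self.names.append(a.get_text(strip=True))
                self.urls.append(self.server + a['href'])
        self.nums = len(self.urls)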

    def get_url(self, url_str):
        """Parse one catalogue line and return (chapter name, relative chapter URL)."""
        st = url_str.find("/comic")
        ed = url_str.find("\" title")
        st2 = url_str.find(")")
        ed2 = url_str.find("\">")
        url = url_str[st:ed]           # relative link, e.g. /comic/2189/...
        name = url_str[st2 + 1:ed2]    # text between the closing ')' and '">'
        return name, url

    def get_download_content(self, chap, path, name, url):
        """Download every page image of one chapter into its own folder."""
        # Create the folder for this chapter, e.g. ./duannao/001话 <name>/
        chappath = os.path.join(path, str(chap).zfill(3) + '话 ' + name)
        os.makedirs(chappath, exist_ok=True)

        html = self.get_content(url)
        bf = BeautifulSoup(html, 'html.parser')
        # The chapter page hides its image list in a packed <script> block.
        jscmd = bf.find('script', {'language': 'javascript', 'type': 'text/javascript'}).text
        # Append a small unpacking step: the page's script is expected to define
        # `packed` (a base64 blob) and a `base64decode` helper, so decoding and
        # eval-ing it yields the statements that assign the per-page image paths.
        jscmd += '''
        \nvar b=base64decode(packed).slice(4);
        var a = eval(b);
        '''
        jsres = js2py.eval_js(jscmd)   # run the JavaScript and capture its result
        imgurls = self.get_img_url(jsres)

        page_no = 1
        for imgurl in tqdm(imgurls):
            r = requests.get(imgurl)
            with open(chappath + '/' + str(page_no) + '.jpg', 'wb') as f:
                f.write(r.content)     # write the raw image bytes
            page_no += 1

    def get_img_url(self, jsres):
        """Parse the unpacked JavaScript result into a list of absolute image URLs."""
        imgserver = 'https://res.img.96youhuiquan.com/'
        imgstrs = jsres.split(";")
        imgurls = []
        for imgstr in imgstrs:
            if len(imgstr) > 1:
                # Each fragment looks roughly like  xxx[N]="relative/path.jpg" ;
                # keep what sits between ']="' and the trailing quote, then
                # prefix it with the image server.
                st = imgstr.find("]=")
                imgurl = imgstr[st + 3:-1]
                imgurls.append(imgserver + imgurl)
        return imgurls

    # Note: this helper is never called in the comic flow below; it appends a
    # name and a block of text to a single file.
    def writer(self, path, name, text):
        with open(path, 'a', encoding='utf-8') as f:
            f.writelines(name)
            f.write('\n')
            f.writelines(text)
            f.write('\n\n')


if __name__ == '__main__':
    path = './duannao/'
    dl = downloader()
    dl.get_download_catalogue(dl.target)
    # range(77 - 1, dl.nums) resumes from chapter 77 (1-based); lower the start
    # value to download earlier chapters as well.
    for chap_no in range(77 - 1, dl.nums):
        print("第" + str(chap_no + 1) + "话")   # "Chapter N"
        dl.get_download_content(chap_no + 1, path, dl.names[chap_no], dl.urls[chap_no])
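
# Assumed workflow (an inference, not spelled out in the script): save the
# catalogue page's chapter-list HTML into ./kkk.txt first, then run this file
# directly; each chapter ends up as numbered .jpg files under
# ./duannao/<NNN>话 <chapter name>/.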