#!/usr/bin/python
# -*- coding:utf-8 -*-
import requests          # fetch the raw HTML of a page
import random            # random numbers for timeouts and retry back-off
from bs4 import BeautifulSoup  # pull content out of HTML tags instead of using regexes
import time              # sleeping between retries
import js2py             # run the page's packed JavaScript to recover image URLs
import os
from tqdm import tqdm    # progress bar for per-chapter image downloads


class downloader(object):
    def __init__(self):
        self.server = 'https://www.iimanhua.cc/'
        self.target = 'https://www.iimanhua.cc/comic/2189/'
        self.names = []  # chapter names
        self.urls = []   # chapter links
        self.nums = 0    # number of chapters

    def get_content(self, url):
        """Fetch the HTML content of a page."""
        # Browser-like headers: without them the site may reject the request.
        # Copied from a real browser; no need to change.
        header = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Connection': 'keep-alive',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN, zh',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
        }
        # Random timeout so request behavior varies and looks less bot-like;
        # no need to change. (Originally computed but never passed to requests.)
        timeout = random.choice(range(80, 180))
        while True:
            try:
                req = requests.get(url=url, headers=header, timeout=timeout)
                # The site serves GBK; adjust to match the target page's actual
                # encoding (tested and working for this site).
                req.encoding = 'GBK'
                break
            except Exception as e:
                print('request failed, retrying:', e)
                time.sleep(random.choice(range(5, 10)))
        return req.text

    def get_download_catalogue(self, url):
        """Build the list of chapters to download."""
        # Scraping the catalogue page directly was abandoned in favour of a
        # locally saved copy of the chapter list (./kkk.txt):
        # html = self.get_content(url)
        # bf = BeautifulSoup(html, 'html.parser')
        # texts = bf.find_all('div', {'class': 'listmain'})
        filename = "./kkk.txt"
        with open(filename, 'r', encoding='utf-8') as f:
            for line in f:
                name, url = self.get_url(line)
                self.names.append(name)
                self.urls.append(self.server + url)
        self.nums = len(self.urls)

    def get_url(self, url_str):
        """Extract one chapter's name and relative URL from a catalogue line."""
        st = url_str.find("/comic")    # relative URL starts at "/comic" ...
        ed = url_str.find("\" title")  # ... and ends before the title attribute
        st2 = url_str.find(")")        # chapter name sits between ")" ...
        ed2 = url_str.find("\">")      # ... and the closing '">'
        url = url_str[st:ed]
        name = url_str[st2 + 1:ed2]
        return name, url

    def get_download_content(self, chap, path, name, url):
        """Download every image of one chapter into its own folder."""
        chappath = os.path.join(path, str(chap).zfill(3) + '话 ' + name)
        os.makedirs(chappath, exist_ok=True)
        html = self.get_content(url)
        bf = BeautifulSoup(html, 'html.parser')
        # The image list is hidden in a packed JavaScript blob on the page;
        # append a snippet that base64-decodes and evaluates it.
        jscmd = bf.find('script', {'language': 'javascript', 'type': 'text/javascript'}).text
        jscmd += '''
var b = base64decode(packed).slice(4);
var a = eval(b);
'''
        jsres = js2py.eval_js(jscmd)  # run the JS to unpack the image list
        imgurls = self.get_img_url(jsres)
        page_no = 1
        for imgurl in tqdm(imgurls):
            r = requests.get(imgurl)
            with open(os.path.join(chappath, str(page_no) + '.jpg'), 'wb') as f:
                f.write(r.content)  # write the binary image data
            page_no += 1

    def get_img_url(self, jsres):
        """Parse the unpacked JS result into absolute image URLs."""
        imgserver = 'https://res.img.96youhuiquan.com/'
        imgstrs = jsres.split(";")
        imgurls = []
        for imgstr in imgstrs:
            if len(imgstr) > 1:
                # Each statement looks like 'arr[i]="<path>"'; keep just the path.
                st = imgstr.find("]=")
                imgurl = imgstr[st + 3:-1]
                imgurls.append(imgserver + imgurl)
        return imgurls

    def writer(self, path, name, text):
        """Append a chapter title and its text to a file."""
        with open(path, 'a', encoding='utf-8') as f:
            f.writelines(name)
            f.write('\n')
            f.writelines(text)
            f.write('\n\n')


if __name__ == '__main__':
    path = './duannao/'
    dl = downloader()
    dl.get_download_catalogue(dl.target)
    for chap_no in range(77 - 1, dl.nums):  # resume from chapter 77
        print("Chapter " + str(chap_no + 1))
        dl.get_download_content(chap_no + 1, path, dl.names[chap_no], dl.urls[chap_no])
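

# A minimal sketch of scraping the catalogue straight from the chapter-list
# page, which the commented-out lines in get_download_catalogue suggest was
# the original plan before the hand-saved ./kkk.txt workaround. The
# 'listmain' container comes from those commented-out lines; the assumption
# that chapter links are plain <a> tags inside it, with the chapter name as
# the link text, is a guess and should be checked against the page's actual
# HTML before relying on this.
def get_catalogue_from_page(dl):
    html = dl.get_content(dl.target)
    bf = BeautifulSoup(html, 'html.parser')
    listmain = bf.find('div', {'class': 'listmain'})  # selector from the commented-out code
    names, urls = [], []
    if listmain is not None:
        for a in listmain.find_all('a'):
            names.append(a.get_text(strip=True))
            # hrefs are assumed site-relative ("/comic/..."); prefix with dl.server
            urls.append(dl.server + a.get('href', '').lstrip('/'))
    return names, urls

# Hypothetical usage: names, urls = get_catalogue_from_page(downloader())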