#!/usr/bin/python
# -*- coding:utf-8 -*-
import requests                  # used to fetch a page's raw HTML source
from bs4 import BeautifulSoup    # used instead of regexes to pull content out of the relevant tags
import time                      # time-related operations (request throttling)
import os
from rich.progress import track as tqdm
from utils import *              # provides check_chars, text2htmls, ...
import zipfile
import shutil
import re
import pickle
from PIL import Image
import threading
from concurrent.futures import ThreadPoolExecutor, wait
from selenium import webdriver
from selenium.webdriver.edge.options import Options

lock = threading.RLock()


class Editer(object):
    def __init__(self, root_path, head='https://www.linovelib.com', book_no='0000', volume_no=1):
        self.header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36 Edg/87.0.664.47',
            'referer': head,
            'cookie': 'night=1',
        }
        self.url_head = head
        options = Options()
        options.add_argument('--start-minimized')
        self.driver = webdriver.Edge(options=options)
        self.main_page = f'{self.url_head}/novel/{book_no}.html'
        self.cata_page = f'{self.url_head}/novel/{book_no}/catalog'
        self.read_tool_page = f'{self.url_head}/themes/zhmb/js/readtool.js'
        self.color_chap_name = '插图'   # chapter title the site uses for the illustrations chapter
        self.color_page_name = '彩页'   # title given to the generated color-page section
        self.html_buffer = dict()
        main_html = self.get_html(self.main_page)
        bf = BeautifulSoup(main_html, 'html.parser')
        self.title = bf.find('meta', {"property": "og:novel:book_name"})['content']
        self.author = bf.find('meta', {"property": "og:novel:author"})['content']
        try:
            self.cover_url = re.search(r'src="(.*?)"', str(bf.find('div', {"class": "book-img fl"}))).group(1)
        except Exception:
            self.cover_url = 'cid'   # fallback placeholder when no cover image is found
        self.img_url_map = dict()
        self.volume_no = volume_no
        self.epub_path = root_path
        self.temp_path = os.path.join(self.epub_path, 'temp_' + check_chars(self.title) + '_' + str(self.volume_no))
        self.missing_last_chap_list = []
        self.is_color_page = True
        self.page_url_map = dict()
        self.ignore_urls = []
        self.url_buffer = []
        self.max_thread_num = 8
        self.pool = ThreadPoolExecutor(self.max_thread_num)

    # Fetch the HTML content of a page through the Edge driver. The tail of this
    # method is a minimal sketch: the anti-bot check string is an assumption, and
    # is_gbk is kept only for call-site compatibility.
    def get_html(self, url, is_gbk=False):
        while True:
            time.sleep(0.5)
            self.driver.get(url)
            req = self.driver.page_source
            # assumption: poll until the site's anti-bot interstitial page is gone
            while 'Just a moment' in req:
                time.sleep(0.5)
                req = self.driver.page_source
            if req:
                return req

    # Extract the readable text of one page, swapping every <img> tag for a
    # [img:NN] placeholder keyed by self.img_url_map so the images can be
    # re-inserted when the epub is assembled. The container id and the <img>
    # pattern at the top are assumptions; the placeholder bookkeeping is original.
    def get_page_text(self, content_html):
        bf = BeautifulSoup(content_html, 'html.parser')
        text_html = str(bf.find('div', {'id': 'acontent'}))   # assumption: article container
        for img_urlre in re.findall(r'<img[^>]*>', text_html):
            img_url = re.search(r'src="(.*?)"', img_urlre).group(1)
            if img_url not in self.img_url_map:
                self.img_url_map[img_url] = str(len(self.img_url_map)).zfill(2)
            img_symbol = f'\n[img:{self.img_url_map[img_url]}]\n'
            if '00' in img_symbol:
                # the first image (index 00) is taken to be the cover; don't write it into the color pages
                text_html = text_html.replace(img_urlre, '')
            else:
                text_html = text_html.replace(img_urlre, img_symbol)
                # make sure the placeholder starts on a line of its own
                symbol_index = text_html.index(img_symbol)
                if text_html[symbol_index - 1] != '\n':
                    text_html = text_html[:symbol_index] + '\n' + text_html[symbol_index:]
        text = BeautifulSoup(text_html, 'html.parser').get_text()
        return text

    def get_chap_text(self, url, chap_name, return_next_chapter=False):
        text_chap = ''
        page_no = 1
        url_ori = url
        next_chap_url = None
        while True:
            if page_no == 1:
                str_out = chap_name
            else:
                str_out = f'  Downloading page {page_no}......'
            print(str_out)
            content_html = self.get_html(url, is_gbk=False)
            text = self.get_page_text(content_html)
            text_chap += text
            # chapter pages are paginated as xxx.html, xxx_2.html, xxx_3.html, ...
            url_new = url_ori.replace('.html', '_{}.html'.format(page_no + 1))[len(self.url_head):]
            if url_new in content_html:
                page_no += 1
                url = self.url_head + url_new
            else:
                if return_next_chapter:
                    # assumption on the tag layout: the next-chapter link sits between
                    # the "书签" (bookmark) and "下一章" (next chapter) labels
                    next_chap_url = self.url_head + re.search(r'书签</a><a href="(.*?)">下一章', content_html).group(1)
                break
        return text_chap, next_chap_url

    def get_text(self):
        self.make_folder()
        img_strs = []   # record every image placeholder that appears in the text below
        text_no = 0     # numbers the body chapters (illustrations excluded); chap_no numbers all chapters
        for chap_no, (chap_name, chap_url) in enumerate(zip(self.volume['chap_names'], self.volume['chap_urls'])):
            is_fix_next_chap_url = (chap_name in self.missing_last_chap_list)
            text, next_chap_url = self.get_chap_text(chap_url, chap_name, return_next_chapter=is_fix_next_chap_url)
            if is_fix_next_chap_url:
                self.volume['chap_urls'][chap_no + 1] = next_chap_url   # forward fix of the next chapter's URL
            if chap_name == self.color_chap_name:
                text_html_color = text2htmls(self.color_page_name, text)
            else:
                text_html = text2htmls(chap_name, text)
                textfile = self.text_path + f'/{str(text_no).zfill(2)}.xhtml'
                with open(textfile, 'w+', encoding='utf-8') as f:
                    f.writelines(text_html)
                for text_line in text_html:
                    # assumption: text2htmls renders each placeholder as an <img src="..."> tag
                    img_str = re.search(r'<img src="(.*?)"', text_line)
                    if img_str is not None:
                        img_strs.append(img_str.group(0))
                text_no += 1
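
# Minimal usage sketch (hedged: '1234' is a placeholder book id, and catalog
# parsing, make_folder() and the epub packing steps live elsewhere in this class):
#
#   editer = Editer(root_path='.', book_no='1234', volume_no=1)
#   print(editer.title, '-', editer.author)   # metadata scraped during __init__
#   editer.get_text()                         # download the volume chapter by chapter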