"""Download one article page, convert its body to Markdown, and save it.

The page is fetched over HTTP, the element matching the ``.article`` CSS
selector is converted with markdownify, image links are made absolute, and
the result is written to ``<page title>.md`` in the current directory.

Usage: run the file directly, optionally passing the article URL as the
first command-line argument.
"""
import re
import sys
import urllib.error
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup
from markdownify import markdownify as md

# Site root that non-absolute image links are resolved against.
BASE_URL = "https://www-new.gwng.edu.cn/"

# Article converted when no URL is given on the command line. Change this (or
# pass a URL as the first argument) to convert a different article.
DEFAULT_URL = "https://www-new.gwng.edu.cn/xkxy/2020/0724/c1287a48158/page.psp"

# Markdown image syntax: ![alt](target)
_IMAGE_PATTERN = re.compile(r"!\[(.*?)\]\((.*?)\)")

# Characters replaced in file names: the original reserved set plus control
# characters (a page title may contain newlines or tabs).
_ILLEGAL_FILENAME_CHARS = re.compile(r'[\\/*?:"<>|\x00-\x1f]')


def _fetch(url, timeout=30):
    """Return the decoded body of *url*, raising on any failure.

    The body is decoded as UTF-8; if that fails and the response declares a
    charset in its Content-Type header, that charset is tried instead.
    """
    request = urllib.request.Request(url)
    # The context manager closes the connection even if reading fails.
    with urllib.request.urlopen(request, timeout=timeout) as response:
        raw = response.read()
        declared = response.headers.get_content_charset()
    try:
        return raw.decode("utf-8")
    except UnicodeDecodeError:
        if not declared:
            raise
        return raw.decode(declared)


def get(url, timeout=30):
    """Fetch *url* and return its body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait on the network before giving up.

    Returns:
        The decoded page text. On failure no exception is raised; instead the
        return value is ``"HTTP Error <code>"`` for an HTTP error status, or
        the exception message for any other error. Callers that need to tell
        a failure from a real page should not rely on this function alone.
    """
    try:
        return _fetch(url, timeout)
    except urllib.error.HTTPError as e:
        return f"HTTP Error {e.code}"
    except Exception as e:
        return str(e)


def htmlHandler(html):
    """Extract the page title and the article markup from *html*.

    Returns:
        A ``(title, content)`` tuple. ``title`` is the stripped text of the
        ``<title>`` element, or ``"No Title"`` when the element is missing or
        empty. ``content`` is the markup of the first element matching the
        ``.article`` selector, or the whole input when nothing matches.
    """
    soup = BeautifulSoup(html, 'html.parser')

    # Extract the title. ``.string`` can be None (e.g. an empty <title>), so
    # use the element's text and fall back to the placeholder.
    title = "No Title"
    if soup.title is not None:
        title = soup.title.get_text(strip=True) or title

    # Extract the .article part; fall back to the full document.
    article_element = soup.select_one('.article')
    content = str(article_element) if article_element is not None else html

    return title, content


def html2markdown(html):
    """Convert *html* to Markdown with markdownify.

    Uses ATX (``#``) headings and ``-`` bullets, and passes ``script`` and
    ``style`` in markdownify's ``strip`` option.
    """
    return md(html, heading_style="ATX", bullets="-", strip=['script', 'style'])


def postMarkdownHandler(markdown, base_url=BASE_URL):
    """Make every Markdown image link in *markdown* absolute.

    Args:
        markdown: Markdown text to process.
        base_url: URL that non-absolute image targets are resolved against.

    Returns:
        The text with each ``![alt](target)`` rewritten. Targets that already
        start with ``http://`` or ``https://`` are left untouched; other
        targets are resolved with ``urllib.parse.urljoin``, so root-relative
        and relative paths get the site prefix while protocol-relative
        (``//host/...``) and ``data:`` targets stay valid.
    """
    def add_base_url(match):
        alt_text = match.group(1)
        # The target may carry an optional title: (src "title"). Resolve only
        # the URL part and put the remainder back unchanged.
        img_url, separator, remainder = match.group(2).partition(" ")

        # Already a full link: keep the original text exactly.
        if img_url.startswith(("http://", "https://")):
            return match.group(0)

        absolute = urllib.parse.urljoin(base_url, img_url)
        return f'![{alt_text}]({absolute}{separator}{remainder})'

    return _IMAGE_PATTERN.sub(add_base_url, markdown)


def save(filename, content):
    """Write *content* to ``<filename>.md`` in the current directory (UTF-8).

    Characters that are illegal in file names are replaced with ``_`` and
    surrounding whitespace is removed. An empty or missing name becomes
    ``untitled`` so a file is always produced.
    """
    safe_name = _ILLEGAL_FILENAME_CHARS.sub('_', str(filename or "")).strip()
    if not safe_name:
        safe_name = "untitled"

    with open(f"{safe_name}.md", "w", encoding="utf-8") as f:
        f.write(content)


def main(argv=None):
    """Convert one article to a Markdown file.

    Args:
        argv: Command-line arguments without the program name; defaults to
            ``sys.argv[1:]``. The first argument, if present, is the article
            URL; otherwise ``DEFAULT_URL`` is used.

    Returns:
        0 on success, 1 if the page could not be downloaded.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    url = args[0] if args else DEFAULT_URL

    # Use the raising fetch here: ``get`` reports failures as ordinary text,
    # which would otherwise be converted and saved as if it were the article.
    try:
        html = _fetch(url)
    except Exception as e:
        print(f"Failed to fetch {url}: {e}", file=sys.stderr)
        return 1

    title, content = htmlHandler(html)
    markdown = html2markdown(content)
    markdown = postMarkdownHandler(markdown)
    save(title, markdown)
    return 0


if __name__ == "__main__":
    sys.exit(main())