|
|
@@ -0,0 +1,78 @@
|
|
|
+import re
|
|
|
+import urllib.request
|
|
|
+from markdownify import markdownify as md
|
|
|
+from bs4 import BeautifulSoup
|
|
|
+
|
|
|
+
|
|
|
def get(url):
    """Fetch *url* and return the response body as text.

    The body is decoded with the charset announced in the response's
    ``Content-Type`` header. When no charset is announced, or Python does
    not know the announced name, UTF-8 is used.

    Args:
        url: Address to fetch; anything ``urllib.request.Request`` accepts.

    Returns:
        The decoded body on success. On failure no exception propagates:
        an HTTP error status yields ``"HTTP Error <code>"`` and any other
        exception yields ``str(exception)``. Callers therefore always get a
        ``str`` and cannot tell an error message from page content by type.
    """
    try:
        request = urllib.request.Request(url)
        # The context manager closes the connection even if reading fails.
        with urllib.request.urlopen(request) as response:
            raw = response.read()
            charset = response.headers.get_content_charset() or "utf-8"
        try:
            return raw.decode(charset)
        except LookupError:
            # The server announced a codec name Python does not recognise.
            return raw.decode("utf-8")
    except urllib.error.HTTPError as e:
        return f"HTTP Error {e.code}"
    except Exception as e:
        return str(e)
|
|
|
+
|
|
|
+
|
|
|
def htmlHandler(html):
    """Extract the page title and the article markup from an HTML document.

    Args:
        html: HTML source of the page as a string.

    Returns:
        A ``(title, content)`` tuple. ``title`` is the whitespace-stripped
        text of the ``<title>`` element, or ``"No Title"`` when that element
        is missing or has no text. ``content`` is the serialized first
        element matching the CSS selector ``.article``; when nothing
        matches, the ``html`` argument is returned unchanged.
    """
    soup = BeautifulSoup(html, 'html.parser')

    # ``Tag.string`` is None for an empty <title> or one containing nested
    # markup; get_text() always yields a str, so the title stays usable as a
    # file name further down the pipeline.
    title = soup.title.get_text(strip=True) if soup.title else ""
    if not title:
        title = "No Title"

    # Keep only the article body when the page has one.
    article_element = soup.select_one('.article')
    content = str(article_element) if article_element is not None else html

    return title, content
|
|
|
+
|
|
|
+
|
|
|
def html2markdown(html):
    """Convert HTML to Markdown with ATX headings and ``-`` bullets.

    ``<script>`` and ``<style>`` elements are removed together with their
    contents before conversion.

    Args:
        html: HTML source (a full document or a fragment) as a string.

    Returns:
        The Markdown text produced by markdownify.
    """
    # markdownify's ``strip`` option only drops the tag markup and keeps the
    # inner text, which would leak JavaScript/CSS source into the output.
    # Removing the elements from the tree first discards the contents too.
    soup = BeautifulSoup(html, 'html.parser')
    for element in soup.find_all(['script', 'style']):
        element.decompose()

    return md(str(soup), heading_style="ATX", bullets="-")
|
|
|
+
|
|
|
+
|
|
|
def postMarkdownHandler(markdown, base_url="https://www-new.gwng.edu.cn/"):
    """Rewrite relative image links in Markdown to absolute URLs.

    Every ``![alt](target)`` occurrence is inspected. Targets that are
    already absolute (``http://``, ``https://``), protocol-relative
    (``//``), ``data:`` URIs, or empty are left exactly as written. Any
    other target is treated as a path from the site root and prefixed with
    ``base_url``, with exactly one ``/`` between the two parts.

    Args:
        markdown: Markdown text to process.
        base_url: Site root prepended to relative image targets.

    Returns:
        The Markdown text with image targets rewritten. Non-image links and
        all other text are unchanged.
    """
    # Normalise so that joining never yields a doubled or missing slash.
    root = base_url.rstrip('/')

    def add_base_url(match):
        alt_text = match.group(1)
        img_url = match.group(2)
        # Nothing to resolve: keep the image exactly as it was written.
        if not img_url or img_url.startswith(
                ('http://', 'https://', '//', 'data:')):
            return match.group(0)
        path = img_url.lstrip('/')
        return f'![{alt_text}]({root}/{path})'

    # Non-greedy groups: alt text in brackets, target in parentheses.
    pattern = r'!\[(.*?)\]\((.*?)\)'
    return re.sub(pattern, add_base_url, markdown)
|
|
|
+
|
|
|
+
|
|
|
def save(filename, content):
    """Write *content* to ``<filename>.md`` in the current directory.

    The name is sanitised before use: surrounding whitespace is trimmed,
    and path separators, the characters ``* ? : " < > |`` and ASCII control
    characters (including newlines) are each replaced with ``_``. When
    nothing remains (empty or ``None`` name) the file is called
    ``untitled.md``.

    Args:
        filename: Desired base name, without the ``.md`` extension.
        content: Text to write; the file is encoded as UTF-8.
    """
    name = "" if filename is None else str(filename).strip()
    # Control characters (e.g. newlines inside a scraped <title>) are not
    # valid in file names on every platform, so treat them like the
    # reserved punctuation.
    name = re.sub(r'[\\/*?:"<>|\x00-\x1f]', '_', name)
    if not name:
        name = "untitled"
    with open(f"{name}.md", "w", encoding="utf-8") as f:
        f.write(content)
|
|
|
+
|
|
|
# Set `url` to the address of the article to convert before each run.
url = 'https://www-new.gwng.edu.cn/xkxy/2020/0724/c1287a48158/page.psp'

# Download the page, then split it into its title and article markup.
html = get(url)
title, content = htmlHandler(html)

# Convert the article to Markdown and fix up its image links in one step.
markdown = postMarkdownHandler(html2markdown(content))

# The page title doubles as the output file name.
save(title, markdown)
|