| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778 |
- import re
- import urllib.request
- from markdownify import markdownify as md
- from bs4 import BeautifulSoup
def get(url):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    Failures are reported in-band instead of being raised, so the caller
    always receives a ``str``.

    Args:
        url: Address to fetch; passed to ``urllib.request.Request``.

    Returns:
        The decoded body on success. On an HTTP error status, the string
        ``"HTTP Error <code>"``. On any other exception, ``str(exception)``.
    """
    # Imported explicitly rather than relying on ``urllib.error`` being
    # loaded as a side effect of ``import urllib.request``.
    from urllib.error import HTTPError

    try:
        request = urllib.request.Request(url)
        # The context manager closes the response (and its connection) even
        # when reading or decoding fails.
        with urllib.request.urlopen(request) as response:
            return response.read().decode("utf-8")
    except HTTPError as e:
        return f"HTTP Error {e.code}"
    except Exception as e:
        return str(e)
def htmlHandler(html):
    """Extract the page title and the article markup from an HTML document.

    Args:
        html: HTML source text.

    Returns:
        A ``(title, content)`` tuple. ``title`` is the whitespace-stripped
        text of the ``<title>`` element, or ``"No Title"`` when the element
        is missing, empty, or has no single string value. ``content`` is the
        serialized first element matching the CSS selector ``.article``, or
        the unmodified *html* when nothing matches.
    """
    soup = BeautifulSoup(html, 'html.parser')

    # Extract the title. ``Tag.string`` is None for an empty <title> or one
    # containing nested markup, so guard against it: the title is later used
    # as a file name and must always be a non-empty string.
    title_tag = soup.title
    raw_title = title_tag.string if title_tag else None
    title = raw_title.strip() if raw_title else ""
    if not title:
        title = "No Title"

    # Extract the .article section, falling back to the whole document.
    article_element = soup.select_one('.article')
    if article_element:
        content = str(article_element)
    else:
        content = html
    return title, content
def html2markdown(html):
    """Convert HTML to Markdown with markdownify.

    The conversion requests ATX heading style and ``-`` as the bullet
    character, and passes ``script`` and ``style`` in the ``strip`` option.

    Args:
        html: HTML markup to convert.

    Returns:
        Whatever ``markdownify`` returns for *html* with these options.
    """
    conversion_options = {
        "heading_style": "ATX",
        "bullets": "-",
        "strip": ['script', 'style'],
    }
    return md(html, **conversion_options)
def postMarkdownHandler(markdown, base_url="https://www-new.gwng.edu.cn/"):
    """Turn relative image links in Markdown into absolute URLs.

    Every ``![alt](url)`` occurrence is inspected. URLs that already start
    with ``http://`` or ``https://`` are left untouched; all others are
    prefixed with *base_url*, inserting a ``/`` separator when the URL does
    not start with one.

    Args:
        markdown: Markdown text to process.
        base_url: Site root prepended to relative image URLs. A trailing
            slash is optional.

    Returns:
        The Markdown text with image URLs rewritten.
    """
    # Normalise so that exactly one slash joins the root and the path.
    site_root = base_url.rstrip('/')

    def add_base_url(match):
        alt_text = match.group(1)
        img_url = match.group(2)
        # Already a complete link: keep the original image markup as is.
        if img_url.startswith(('http://', 'https://')):
            return match.group(0)
        # Relative path without a leading slash: add the slash first.
        if not img_url.startswith('/'):
            img_url = '/' + img_url
        return f'![{alt_text}]({site_root}{img_url})'

    # Match the Markdown image syntax: group 1 is the alt text, group 2 is
    # everything between the parentheses.
    pattern = r'!\[(.*?)\]\((.*?)\)'
    return re.sub(pattern, add_base_url, markdown)
def save(filename, content):
    """Write *content* to ``<filename>.md`` as UTF-8 text.

    Backslash, slash, ``*``, ``?``, ``:``, double quote, ``<``, ``>`` and
    ``|`` in *filename* are each replaced by an underscore before the
    ``.md`` suffix is appended. The file is opened in write mode, so an
    existing file of the same name is overwritten.

    Args:
        filename: Desired file name without extension.
        content: Text to write.
    """
    import re

    # Sanitise characters that are illegal in file names.
    illegal_chars = r'[\\/*?:"<>|]'
    safe_name = re.sub(illegal_chars, '_', filename)
    target = f"{safe_name}.md"
    with open(target, "w", encoding="utf-8") as out:
        out.write(content)
# Before each run, set url to the address of the article to convert.
url = 'https://www-new.gwng.edu.cn/xkxy/2020/0724/c1287a48158/page.psp'

# Download the page, then split it into its title and article markup.
html = get(url)
title, content = htmlHandler(html)

# Convert the article to Markdown and post-process its image links.
markdown = postMarkdownHandler(html2markdown(content))

# The page title is used as the output file name.
save(title, markdown)
|