artUtil.py 2.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778
  1. import re
  2. import urllib.request
  3. from markdownify import markdownify as md
  4. from bs4 import BeautifulSoup
  5. def get(url):
  6. try:
  7. request = urllib.request.Request(url)
  8. response = urllib.request.urlopen(request)
  9. text = response.read().decode("utf-8")
  10. return text
  11. except urllib.error.HTTPError as e:
  12. return f"HTTP Error {e.code}"
  13. except Exception as e:
  14. return str(e)
  15. def htmlHandler(html):
  16. soup = BeautifulSoup(html, 'html.parser')
  17. # 提取title
  18. title = soup.title.string if soup.title else "No Title"
  19. # 提取.article部分
  20. article_element = soup.select_one('.article')
  21. if article_element:
  22. content = str(article_element)
  23. else:
  24. content = html
  25. return title, content
  26. def html2markdown(html):
  27. # 使用markdownify将HTML转换为Markdown,保留格式
  28. return md(html,
  29. heading_style="ATX",
  30. bullets="-",
  31. strip=['script', 'style'])
  32. def postMarkdownHandler(markdown):
  33. # 在所有图片链接前加上"https://www-new.gwng.edu.cn/"
  34. # 匹配markdown中的图片语法 ![alt](src)
  35. def add_base_url(match):
  36. alt_text = match.group(1)
  37. img_url = match.group(2)
  38. # 如果URL已经是完整链接,不添加前缀
  39. if img_url.startswith('http://') or img_url.startswith('https://'):
  40. return f'![{alt_text}]({img_url})'
  41. # 如果URL已经是相对路径且不以斜杠开头,添加斜杠
  42. elif not img_url.startswith('/'):
  43. return f'![{alt_text}](https://www-new.gwng.edu.cn/{img_url})'
  44. else:
  45. return f'![{alt_text}](https://www-new.gwng.edu.cn{img_url})'
  46. # 匹配图片语法
  47. pattern = r'!\[(.*?)\]\((.*?)\)'
  48. markdown = re.sub(pattern, add_base_url, markdown)
  49. return markdown
  50. def save(filename, content):
  51. import re
  52. # 清理文件名中的非法字符
  53. filename = re.sub(r'[\\/*?:"<>|]', '_', filename)
  54. with open(f"{filename}.md", "w", encoding="utf-8") as f:
  55. f.write(content)
  56. # 每次使用将url修改为文章对应的地址
  57. url = 'https://www-new.gwng.edu.cn/xkxy/2020/0724/c1287a48158/page.psp'
  58. html = get(url)
  59. title, content = htmlHandler(html)
  60. markdown = html2markdown(content)
  61. markdown = postMarkdownHandler(markdown)
  62. save(title, markdown)